recovery: add special pull-logic for persistent databases
author Michael Adam <obnox@samba.org>
Fri, 4 Dec 2009 10:21:29 +0000 (11:21 +0100)
committer Michael Adam <obnox@samba.org>
Fri, 4 Dec 2009 14:00:21 +0000 (15:00 +0100)
The mechanism that decides which records of a persistent db
are pulled into the recdb during recovery now works
as follows:

* Usually a record with a higher rsn than the one already
  stored is taken. (Just as for normal tdbs.)

* If a transaction is running on some node, then that
  node's copies of all records are taken and are not
  overwritten later by other nodes' copies.

In order to keep track of whether a record's copy was obtained
from a node with a transaction running, the recovery mechanism
misuses the ctdb tdb header field 'lacount' in the recdb.
The field is cleared again later when the recdb is pushed out
to the other nodes.

This way, an incomplete transaction is not spoiled when
a recovery interrupts it, and the replay should usually succeed
(possibly after a few retries).

Michael

server/ctdb_recoverd.c

index 3e596da9ecad9d88c8aa99184e4c94dac10a7720..071c0a3da9460191aacd487e44cc6d1463d2377c 100644 (file)
@@ -529,6 +529,7 @@ static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
        struct ctdb_marshall_buffer *reply;
        struct ctdb_rec_data *rec;
        int i;
+       int32_t transaction_active = 0;
        TALLOC_CTX *tmp_ctx = talloc_new(recdb);
 
        ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
@@ -548,6 +549,18 @@ static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
        }
        
        rec = (struct ctdb_rec_data *)&reply->data[0];
+
+       if (persistent) {
+               transaction_active = ctdb_ctrl_transaction_active(ctdb, srcnode,
+                                                                 dbid);
+               if (transaction_active == -1) {
+                       DEBUG(DEBUG_ERR, (__location__ " error calling "
+                                         "ctdb_ctrl_transaction_active to node"
+                                         " %u\n", srcnode));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+       }
        
        for (i=0;
             i<reply->count;
@@ -583,12 +596,42 @@ static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
                        }
                        header = *(struct ctdb_ltdb_header *)existing.dptr;
                        free(existing.dptr);
-                       if (!(header.rsn < hdr->rsn ||
-                             (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
-                               continue;
+                       if (!persistent) {
+                               if (!(header.rsn < hdr->rsn ||
+                                   (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn)))
+                               {
+                                       continue;
+                               }
+                       } else {
+                               if (header.lacount == (uint32_t)-1) {
+                                       /*
+                                        * skip record if the stored copy came
+                                        * from a node with active transaction
+                                        */
+                                       continue;
+                               }
+
+                               if ((header.rsn >= hdr->rsn) &&
+                                   !transaction_active)
+                               {
+                                       continue;
+                               }
                        }
                }
-               
+
+               if (persistent) {
+                       /*
+                        * Misuse the lacount field to signal
+                        * that we got the record from a node
+                        * that has a transaction running.
+                        */
+                       if (transaction_active) {
+                               hdr->lacount = (uint32_t)-1;
+                       } else {
+                               hdr->lacount = 0;
+                       }
+               }
+
                if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
                        DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
                        talloc_free(tmp_ctx);
@@ -1059,6 +1102,13 @@ static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
        hdr = (struct ctdb_ltdb_header *)data.dptr;
        if (!params->persistent) {
                hdr->dmaster = params->ctdb->pnn;
+       } else {
+               /*
+                * Clear the lacount field that had been misused
+                * when pulling the db in order to keep track of
+                * whether the node had a transaction running.
+                */
+               hdr->lacount = 0;
        }
 
        /* add the record to the blob ready to send to the nodes */