traverse: Send traverse end record from traverse child process
author Amitay Isaacs <amitay@gmail.com>
Mon, 9 Sep 2013 02:46:26 +0000 (12:46 +1000)
committer Amitay Isaacs <amitay@gmail.com>
Wed, 25 Sep 2013 04:59:45 +0000 (14:59 +1000)
Traverse records are sent directly from the traverse child process, but
the last empty record signalling the end of the traverse is sent from ctdbd.
This creates a race condition between ctdbd and the traverse child.
There are two fds from the traverse child to ctdbd - a pipe to track the
status of the child process and a unix socket connection for sending records.
It's possible that the last few records are still sitting in the unix socket
buffer when ctdbd reads the status written by the traverse child.  ctdbd
interprets that status as the end of the traverse and sends the last empty
record to the originating node before it has processed the pending packets
on the unix socket connection.

The race is avoided by sending the last empty record, which marks the end
of the traverse, from the child process itself.

Signed-off-by: Amitay Isaacs <amitay@gmail.com>
server/ctdb_traverse.c

index 5b588c6a2257d29656ac6f29f4c57ccf11a20b61..99e7e8f3e0dd76cf9f01949a036619fbc8492352 100644 (file)
@@ -195,9 +195,11 @@ static struct ctdb_traverse_local_handle *ctdb_traverse_local(struct ctdb_db_con
 
        if (h->child == 0) {
                /* start the traverse in the child */
-               int res;
+               int res, status;
                pid_t parent = getpid();
                struct ctdb_context *ctdb = ctdb_db->ctdb;
+               struct ctdb_rec_data *d;
+               TDB_DATA outdata;
 
                close(h->fd[0]);
 
@@ -208,6 +210,13 @@ static struct ctdb_traverse_local_handle *ctdb_traverse_local(struct ctdb_db_con
                        _exit(0);
                }
 
+               d = ctdb_marshall_record(h, h->reqid, tdb_null, NULL, tdb_null);
+               if (d == NULL) {
+                       res = 0;
+                       write(h->fd[1], &res, sizeof(int));
+                       _exit(0);
+               }
+
                res = tdb_traverse_read(ctdb_db->ltdb->tdb, ctdb_traverse_local_fn, h);
                if (res == -1 || h->records_failed > 0) {
                        /* traverse failed */
@@ -221,6 +230,19 @@ static struct ctdb_traverse_local_handle *ctdb_traverse_local(struct ctdb_db_con
                        tevent_loop_once(ctdb->ev);
                }
 
+               /* End traverse by sending empty record */
+               outdata.dptr = (uint8_t *)d;
+               outdata.dsize = d->length;
+               ret = ctdb_control(ctdb, h->srcnode, 0,
+                                  CTDB_CONTROL_TRAVERSE_DATA,
+                                  CTDB_CTRL_FLAG_NOREPLY, outdata,
+                                  NULL, NULL, &status, NULL, NULL);
+               if (ret == -1 || status == -1) {
+                       if (res > 0) {
+                               res = -res;
+                       }
+               }
+
                write(h->fd[1], &res, sizeof(res));
 
                while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
@@ -422,35 +444,14 @@ static struct ctdb_traverse_all_handle *ctdb_daemon_traverse_all(struct ctdb_db_
 }
 
 /*
-  called for each record during a traverse all 
+  called when local traverse ends
  */
 static void traverse_all_callback(void *p, TDB_DATA key, TDB_DATA data)
 {
        struct traverse_all_state *state = talloc_get_type(p, struct traverse_all_state);
-       int ret;
-       struct ctdb_rec_data *d;
-       TDB_DATA cdata;
 
-       d = ctdb_marshall_record(state, state->reqid, key, NULL, data);
-       if (d == NULL) {
-               /* darn .... */
-               DEBUG(DEBUG_ERR,("Out of memory in traverse_all_callback\n"));
-               return;
-       }
-
-       cdata.dptr = (uint8_t *)d;
-       cdata.dsize = d->length;
-
-       ret = ctdb_daemon_send_control(state->ctdb, state->srcnode, 0, CTDB_CONTROL_TRAVERSE_DATA,
-                                      0, CTDB_CTRL_FLAG_NOREPLY, cdata, NULL, NULL);
-       if (ret != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send traverse data\n"));
-       }
-
-       if (key.dsize == 0 && data.dsize == 0) {
-               /* we're done */
-               talloc_free(state);
-       }
+       /* we're done */
+       talloc_free(state);
 }
 
 /*