recoverd: Call takeover fail callback only once per node
authorMartin Schwenke <martin@meltin.net>
Mon, 22 Jul 2013 06:39:46 +0000 (16:39 +1000)
committerAmitay Isaacs <amitay@gmail.com>
Mon, 29 Jul 2013 05:48:48 +0000 (15:48 +1000)
Currently the fail callback is called once per (takeip/releaseip) control
failure.  This is overkill and can get a node banned much too quickly.

Instead, keep track of control failures per node and only call fail
callback once per failed node.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Pair-programmed-with: Amitay Isaacs <amitay@gmail.com>

server/ctdb_takeover.c

index be49b3f3b7f2b9e90747df741196d0fb6854f29e..9a2f8f020e2c0f1f869b70b54871f3707a1a24d3 100644 (file)
@@ -2622,6 +2622,40 @@ static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
        }
 }
 
+struct takeover_callback_data {
+       bool *node_failed;
+       client_async_callback fail_callback;
+       void *fail_callback_data;
+       struct ctdb_node_map *nodemap;
+};
+
+static void takeover_run_fail_callback(struct ctdb_context *ctdb,
+                                      uint32_t node_pnn, int32_t res,
+                                      TDB_DATA outdata, void *callback_data)
+{
+       struct takeover_callback_data *cd =
+               talloc_get_type_abort(callback_data,
+                                     struct takeover_callback_data);
+       int i;
+
+       for (i = 0; i < cd->nodemap->num; i++) {
+               if (node_pnn == cd->nodemap->nodes[i].pnn) {
+                       break;
+               }
+       }
+
+       if (i == cd->nodemap->num) {
+               DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
+               return;
+       }
+
+       if (!cd->node_failed[i]) {
+               cd->node_failed[i] = true;
+               cd->fail_callback(ctdb, node_pnn, res, outdata,
+                                 cd->fail_callback_data);
+       }
+}
+
 /*
   make any IP alias changes for public addresses that are necessary 
  */
@@ -2640,6 +2674,7 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        uint32_t disable_timeout;
        struct ctdb_ipflags *ipflags;
+       struct takeover_callback_data *takeover_data;
        struct iprealloc_callback_data iprealloc_data;
        bool *retry_data;
 
@@ -2683,11 +2718,21 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
        /* now tell all nodes to delete any alias that they should not
           have.  This will be a NOOP on nodes that don't currently
           hold the given alias */
+       takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
+       CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
+
+       takeover_data->node_failed = talloc_zero_array(tmp_ctx,
+                                                      bool, nodemap->num);
+       CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
+       takeover_data->fail_callback = fail_callback;
+       takeover_data->fail_callback_data = callback_data;
+       takeover_data->nodemap = nodemap;
+
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);
 
-       async_data->fail_callback = fail_callback;
-       async_data->callback_data = callback_data;
+       async_data->fail_callback = takeover_run_fail_callback;
+       async_data->callback_data = takeover_data;
 
        for (i=0;i<nodemap->num;i++) {
                /* don't talk to unconnected nodes, but do talk to banned nodes */