recoverd: Track failure of "recovered" event, banning culprits
author Martin Schwenke <martin@meltin.net>
Mon, 24 Sep 2012 04:32:04 +0000 (14:32 +1000)
committer Martin Schwenke <martin@meltin.net>
Thu, 11 Oct 2012 01:10:45 +0000 (12:10 +1100)
Pair-programmed-with: Amitay Isaacs <amitay@gmail.com>
Signed-off-by: Martin Schwenke <martin@meltin.net>
server/ctdb_recoverd.c

index 1153a40c704c2c3f72413f796c81cd76cadbaba2..55d878bdd1767c874a495523d734755d8cc6860c 100644 (file)
@@ -107,33 +107,6 @@ static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_
 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 
 
-/*
-  run the "recovered" eventscript on all nodes
- */
-static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
-{
-       TALLOC_CTX *tmp_ctx;
-       uint32_t *nodes;
-
-       tmp_ctx = talloc_new(ctdb);
-       CTDB_NO_MEMORY(ctdb, tmp_ctx);
-
-       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
-       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
-                                       nodes, 0,
-                                       CONTROL_TIMEOUT(), false, tdb_null,
-                                       NULL, NULL,
-                                       NULL) != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
-
-               talloc_free(tmp_ctx);
-               return -1;
-       }
-
-       talloc_free(tmp_ctx);
-       return 0;
-}
-
 /*
   remember the trouble maker
  */
@@ -175,6 +148,46 @@ static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
 }
 
 
+/* this callback is called for every node that failed to execute the
+   recovered event
+*/
+static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+       DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
+
+       ctdb_set_culprit(rec, node_pnn);
+}
+
+/*
+  run the "recovered" eventscript on all nodes
+ */
+static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
+{
+       TALLOC_CTX *tmp_ctx;
+       uint32_t *nodes;
+       struct ctdb_context *ctdb = rec->ctdb;
+
+       tmp_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, tdb_null,
+                                       NULL, recovered_fail_callback,
+                                       rec) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
+
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
 /* this callback is called for every node that failed to execute the
    start recovery event
 */
@@ -1775,7 +1788,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
        }
 
        /* execute the "recovered" event script on all nodes */
-       ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
+       ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
        if (ret!=0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
                return -1;
@@ -3726,7 +3739,7 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
                }
 
                /* execute the "recovered" event script on all nodes */
-               ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
+               ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
 #if 0
 // we cant check whether the event completed successfully
 // since this script WILL fail if the node is in recovery mode