ctdb-recoverd: Store recovery lock handle
[vlendec/samba-autobuild/.git] / ctdb / server / ctdb_recoverd.c
index 3e85186f35aa020dba85453eae7d3463ba9efad8..46357b6ed690905ae2fe05f39b566397f473495f 100644 (file)
@@ -239,6 +239,8 @@ struct ctdb_banning_state {
        struct timeval last_reported_time;
 };
 
+struct ctdb_recovery_lock_handle;
+
 /*
   private state of recovery daemon
  */
@@ -260,7 +262,7 @@ struct ctdb_recoverd {
        uint32_t *force_rebalance_nodes;
        struct ctdb_node_capabilities *caps;
        bool frozen_on_inactive;
-       struct ctdb_cluster_mutex_handle *recovery_lock_handle;
+       struct ctdb_recovery_lock_handle *recovery_lock_handle;
 };
 
 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
@@ -881,18 +883,19 @@ static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
        return (rec->recovery_lock_handle != NULL);
 }
 
-struct hold_reclock_state {
+struct ctdb_recovery_lock_handle {
        bool done;
        bool locked;
        double latency;
+       struct ctdb_cluster_mutex_handle *h;
 };
 
 static void take_reclock_handler(char status,
                                 double latency,
                                 void *private_data)
 {
-       struct hold_reclock_state *s =
-               (struct hold_reclock_state *) private_data;
+       struct ctdb_recovery_lock_handle *s =
+               (struct ctdb_recovery_lock_handle *) private_data;
 
        switch (status) {
        case '0':
@@ -932,31 +935,41 @@ static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
 {
        struct ctdb_context *ctdb = rec->ctdb;
        struct ctdb_cluster_mutex_handle *h;
-       struct hold_reclock_state s = {
-               .done = false,
-               .locked = false,
-               .latency = 0,
+       struct ctdb_recovery_lock_handle *s;
+
+       s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
+       if (s == NULL) {
+               DBG_ERR("Memory allocation error\n");
+               return false;
        };
 
-       h = ctdb_cluster_mutex(rec, ctdb, ctdb->recovery_lock, 0,
-                              take_reclock_handler, &s,
-                              lost_reclock_handler, rec);
+       h = ctdb_cluster_mutex(s,
+                              ctdb,
+                              ctdb->recovery_lock,
+                              0,
+                              take_reclock_handler,
+                              s,
+                              lost_reclock_handler,
+                              rec);
        if (h == NULL) {
+               talloc_free(s);
                return false;
        }
 
-       while (!s.done) {
+       while (! s->done) {
                tevent_loop_once(ctdb->ev);
        }
 
-       if (! s.locked) {
-               talloc_free(h);
+       if (! s->locked) {
+               talloc_free(s);
                return false;
        }
 
-       rec->recovery_lock_handle = h;
-       ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(),
-                                          s.latency);
+       rec->recovery_lock_handle = s;
+       s->h = h;
+       ctdb_ctrl_report_recd_lock_latency(ctdb,
+                                          CONTROL_TIMEOUT(),
+                                          s->latency);
 
        return true;
 }
@@ -1315,31 +1328,47 @@ static int do_recovery(struct ctdb_recoverd *rec,
                goto fail;
        }
 
-        if (ctdb->recovery_lock != NULL) {
+       if (ctdb->recovery_lock != NULL) {
                if (ctdb_recovery_have_lock(rec)) {
-                       DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
+                       D_NOTICE("Already holding recovery lock\n");
                } else {
-                       DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
-                                            ctdb->recovery_lock));
-                       if (!ctdb_recovery_lock(rec)) {
-                               if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
-                                       /* If ctdb is trying first recovery, it's
-                                        * possible that current node does not know
-                                        * yet who the recmaster is.
+                       bool ok;
+
+                       D_NOTICE("Attempting to take recovery lock (%s)\n",
+                                ctdb->recovery_lock);
+
+                       ok = ctdb_recovery_lock(rec);
+                       if (! ok) {
+                               D_ERR("Unable to take recovery lock\n");
+
+                               if (pnn != rec->recmaster) {
+                                       D_NOTICE("Recovery master changed to %u,"
+                                                " aborting recovery\n",
+                                                rec->recmaster);
+                                       rec->need_recovery = false;
+                                       goto fail;
+                               }
+
+                               if (ctdb->runstate ==
+                                   CTDB_RUNSTATE_FIRST_RECOVERY) {
+                                       /*
+                                        * First recovery?  Perhaps
+                                        * current node does not yet
+                                        * know who the recmaster is.
                                         */
-                                       DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
-                                                         " - retrying recovery\n"));
+                                       D_ERR("Retrying recovery\n");
                                        goto fail;
                                }
 
-                               DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
-                                                "and ban ourself for %u seconds\n",
-                                                ctdb->tunable.recovery_ban_period));
-                               ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
+                               D_ERR("Abort recovery, "
+                                     "ban this node for %u seconds\n",
+                                     ctdb->tunable.recovery_ban_period);
+                               ctdb_ban_node(rec,
+                                             pnn,
+                                             ctdb->tunable.recovery_ban_period);
                                goto fail;
                        }
-                       DEBUG(DEBUG_NOTICE,
-                             ("Recovery lock taken successfully by recovery daemon\n"));
+                       D_NOTICE("Recovery lock taken successfully\n");
                }
        }