ctdb-recoverd: No longer take cluster lock during recovery
author Martin Schwenke <martin@meltin.net>
Mon, 4 May 2020 07:45:51 +0000 (17:45 +1000)
committer Martin Schwenke <martins@samba.org>
Mon, 17 Jan 2022 10:21:33 +0000 (10:21 +0000)
Confirm instead that it is already held.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/server/ctdb_recoverd.c

index f7572bfa5c93607078ef835769f985b72104574d..9a1a2b9ca1c4161b31b11e9465b917984b396d1f 100644 (file)
@@ -1286,25 +1286,18 @@ static int do_recovery(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
                goto fail;
        }
 
-       if (cluster_lock_enabled(rec)) {
-               bool ok;
-
-               ok = cluster_lock_take(rec);
-               if (!ok) {
-                       D_ERR("Unable to take recovery lock\n");
-
-                       if (!this_node_is_leader(rec)) {
-                               D_NOTICE("Leader changed to %u,"
-                                        " aborting recovery\n",
-                                        rec->leader);
-                               rec->need_recovery = false;
-                               goto fail;
-                       }
-
-                       D_ERR("Abort recovery, ban this node\n");
-                       ctdb_ban_node(rec, rec->pnn);
+       if (cluster_lock_enabled(rec) && !cluster_lock_held(rec)) {
+               /* Leader can change in ban_misbehaving_nodes() */
+               if (!this_node_is_leader(rec)) {
+                       D_NOTICE("Leader changed to %u, aborting recovery\n",
+                                rec->leader);
+                       rec->need_recovery = false;
                        goto fail;
                }
+
+               D_ERR("Cluster lock not held - abort recovery, ban node\n");
+               ctdb_ban_node(rec, rec->pnn);
+               goto fail;
        }
 
        DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));