ctdb/eventscripts: Reconfigure lock should be released quickly
author Martin Schwenke <martin@meltin.net>
Wed, 18 Dec 2013 02:51:22 +0000 (13:51 +1100)
committer Amitay Isaacs <amitay@gmail.com>
Fri, 17 Jan 2014 06:59:26 +0000 (17:59 +1100)
Currently the lock is held until the corresponding eventscript
completes, since the process still exists.  If the regular part of an
eventscript hangs then the lock might unnecessarily be held for a long
time.  The pathological case is when a monitor event gets stuck in
D-wait state: the script times out but can't be killed, so the lock
is still held.  This can cause an unwanted monitor replay.

Change this so that the lock is released immediately after the
reconfiguration is complete.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/config/functions
ctdb/tests/eventscripts/60.nfs.multi.003.sh
ctdb/tests/eventscripts/60.nfs.multi.004.sh
ctdb/tests/eventscripts/60.nfs.multi.005.sh

index 4430d866bff2136a663a49196df8dbc197e67881..374332a6990d00684135742c38681a64d94c22e0 100755 (executable)
@@ -1106,7 +1106,7 @@ service_reconfigure ()
     :
 }
 
-ctdb_reconfigure_try_lock ()
+ctdb_reconfigure_take_lock ()
 {
     _ctdb_service_reconfigure_common
     _lock="${_d}/reconfigure_lock"
@@ -1133,6 +1133,14 @@ ctdb_reconfigure_try_lock ()
     ) <"$_lock"
 }
 
+ctdb_reconfigure_release_lock ()
+{
+    _ctdb_service_reconfigure_common
+    _lock="${_d}/reconfigure_lock"
+
+    rm -f "$_lock"
+}
+
 ctdb_replay_monitor_status ()
 {
     echo "Replaying previous status for this script due to reconfigure..."
@@ -1182,7 +1190,7 @@ ctdb_service_check_reconfigure ()
        *) return 0 ;;
     esac
 
-    if ctdb_reconfigure_try_lock ; then
+    if ctdb_reconfigure_take_lock ; then
        # No events covered by this function are running, so proceed
        # with gay abandon.
        case "$event_name" in
@@ -1196,6 +1204,8 @@ ctdb_service_check_reconfigure ()
                fi
                ;;
        esac
+
+       ctdb_reconfigure_release_lock
     else
        # Somebody else is running an event we don't want to collide
        # with.  We proceed with caution.
index 653dece07a95deeaceab871300c36230e3609cd5..aed1f34d969b441455c99be30d586c848e317928 100755 (executable)
@@ -16,7 +16,7 @@ simple_test_event "takeip" $public_address
 
 ctdb_fake_scriptstatus 1 "ERROR" "$err"
 
-eventscript_call ctdb_reconfigure_try_lock
+eventscript_call ctdb_reconfigure_take_lock
 
 required_result 1 <<EOF
 Replaying previous status for this script due to reconfigure...
index 43323cf61f099a3376f7afda45574a18b13f97a3..e07f8d57bec2faa093ca75ac69da59a9d93e960b 100755 (executable)
@@ -16,7 +16,7 @@ simple_test_event "takeip" $public_address
 
 ctdb_fake_scriptstatus -62 "TIMEDOUT" "$err"
 
-eventscript_call ctdb_reconfigure_try_lock
+eventscript_call ctdb_reconfigure_take_lock
 
 required_result 1 <<EOF
 Replaying previous status for this script due to reconfigure...
index 9816bec83807cbee9f4afe7d1ed454014d067a94..da334fd247d26bb98f7caeb7d42555f074b31c49 100755 (executable)
@@ -16,7 +16,7 @@ simple_test_event "takeip" $public_address
 
 ctdb_fake_scriptstatus -8 "DISABLED" "$err"
 
-eventscript_call ctdb_reconfigure_try_lock
+eventscript_call ctdb_reconfigure_take_lock
 
 ok <<EOF
 Replaying previous status for this script due to reconfigure...