Merge remote branch 'martins/eventscript.10.interface'
authorRonnie Sahlberg <ronniesahlberg@gmail.com>
Mon, 15 Aug 2011 05:27:50 +0000 (15:27 +1000)
committerRonnie Sahlberg <ronniesahlberg@gmail.com>
Mon, 15 Aug 2011 05:27:50 +0000 (15:27 +1000)
(This used to be ctdb commit 0d17daab38d4086f922a8006d4c545133adca191)

27 files changed:
ctdb/config/events.d/60.nfs
ctdb/config/functions
ctdb/server/ctdb_tunables.c
ctdb/tests/eventscripts/simple/50.samba.monitor.001.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/50.samba.monitor.050.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/50.samba.monitor.051.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/50.samba.monitor.101.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/50.samba.monitor.102.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/50.samba.monitor.103.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/50.samba.monitor.104.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/50.samba.monitor.105.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/50.samba.monitor.106.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh [changed mode: 0644->0755]
ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh [changed mode: 0644->0755]
ctdb/tests/simple/02_ctdb_listvars.sh

index e77804836ba2c260197b8cd3bbc797f645a945ce..19a9ea81a1f2ebaf4acde07cffcbe2dfd07bfe65 100755 (executable)
@@ -59,11 +59,11 @@ case "$1" in
        ;;
 
       monitor)
-       # and that its directories are available
+       # Check that directories for shares actually exist.
        [ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || {
-           exportfs | grep -v '^#' | grep '^/' |
-           sed -e 's/[[:space:]]\+[^[:space:]]*$//' |
-           ctdb_check_directories
+           exportfs -v | grep '^/' | 
+           sed -r -e 's@[[:space:]]+[^[:space:]()]+\([^[:space:]()]+\)$@@' | 
+           ctdb_check_directories 
        } || exit $?
 
        update_tickles 2049
@@ -73,118 +73,35 @@ case "$1" in
        # we only do this IF we have a rpc.statd command.
        # For platforms where rpc.statd does not exist, we skip
        # the check completely
-       p="rpc.statd"
-       which $p >/dev/null 2>/dev/null && {
-               if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
-                       (service_name="nfs_statd"; ctdb_counter_init)
-               else
-                       cmd="$p"
-                       cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
-                       cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
-                       cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
-                       (
-                               service_name="nfs_statd"
-                               ctdb_counter_incr
-                               ctdb_check_counter_limit 10 quiet >/dev/null
-                       ) || {
-                               echo "$ctdb_check_rpc_out"
-                               echo "Trying to restart STATD [$cmd]"
-                               $cmd
-                       }
-               fi
-       }
+        p="rpc.statd"
+        which $p >/dev/null 2>/dev/null && \
+           nfs_check_rpc_service "statd" \
+               -ge 6 "verbose unhealthy" \
+               -eq 4 "verbose restart" \
+               -eq 2 "restart:bs"
 
        # check that NFS responds to rpc requests
-       [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
-           if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
-               (service_name="nfs_knfsd"; ctdb_counter_init)
-           else
-               (
-                       service_name="nfs_knfsd"
-                       ctdb_counter_incr
-
-                       ctdb_check_counter_equal 2 || {
-                               echo "Trying to restart NFS service"
-                               startstop_nfs restart >/dev/null 2>&1 &
-                               exit 0
-                       }
-
-                       ctdb_check_counter_limit 5 quiet >/dev/null
-               ) || {
-                       echo "$ctdb_check_rpc_out"
-                       echo "Trying to restart NFS service"
-                       startstop_nfs restart
-                       exit 1
-               }
-           fi
-       }
-
-       # check that lockd responds to rpc requests
-       if ctdb_check_rpc "LOCKD" 100021 1 >/dev/null ; then
-               (service_name="lockd"; ctdb_counter_init)
-       else
-               (
-                       service_name="lockd"
-                       ctdb_counter_incr
-
-                       ctdb_check_counter_equal 10 || {
-                               echo "Trying to restart NFS lock service"
-                               startstop_nfs restart >/dev/null 2>&1 &
-                               startstop_nfslock restart  >/dev/null 2>&1 &
-                               exit 0
-                       }
-
-                       ctdb_check_counter_limit 15 quiet >/dev/null
-       ) || {
-                       echo "$ctdb_check_rpc_out"
-                       echo "Trying to restart NFS lock service"
-                       startstop_nfs restart
-                       startstop_nfslock restart
-                       exit 1
-               }
-       fi
-
-       # mount needs special handling since it is sometimes not started
-       # correctly on RHEL5
-       if ctdb_check_rpc "MOUNTD" 100005 1 >/dev/null ; then
-               (service_name="nfs_mountd"; ctdb_counter_init)
-       else
-       (
-               service_name="nfs_mountd"
-               ctdb_counter_incr
-
-               ctdb_check_counter_equal 5 || {
-                       p="rpc.mountd"
-                       cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
-                       echo "Trying to restart MOUNTD [${cmd}]"
-                       killall -q -9 $p
-                       $cmd &
-                       exit 0
-               }
-
-               ctdb_check_counter_limit 10 quiet >/dev/null
-       ) || {
-               echo "$ctdb_check_rpc_out"
-               p="rpc.mountd"
-               cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
-               echo "Trying to restart MOUNTD [${cmd}]"
-               killall -q -9 $p
-               $cmd &
-               exit 1
-       }
+       if [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then
+           nfs_check_rpc_service "knfsd" \
+               -ge 6 "verbose unhealthy" \
+               -eq 4 "verbose restart" \
+               -eq 2 "restart:bs"
        fi
 
-
-       # rquotad needs special handling since it is sometimes not started
-       # correctly on RHEL5
-       # this is not a critical service so we dont flag the node as unhealthy
-       ctdb_check_rpc "RQUOTAD" 100011 1 || {
-               p="rpc.rquotad"
-               cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
-               echo "Trying to restart RQUOTAD [${cmd}]"
-               killall -q -9 $p
-               $cmd &
-       }
+       # check that lockd responds to rpc requests
+       nfs_check_rpc_service "lockd" \
+           -ge 15 "verbose restart unhealthy" \
+           -eq 10 "restart:bs"
+
+       # mountd is sometimes not started correctly on RHEL5
+       nfs_check_rpc_service "mountd" \
+           -ge 10 "verbose restart:b unhealthy" \
+           -eq 5 "restart:b"
+
+       # rquotad is sometimes not started correctly on RHEL5
+       # not a critical service so we don't flag the node as unhealthy
+       nfs_check_rpc_service "rquotad" \
+           -gt 0 "verbose restart:b"
 
        # once every 600 seconds, update the statd state database for which
        # clients need notifications
index 2668531ca834d305aa354a0fe293e6fca25fc9ae..b04965281dbfcfe987dab89fab67a3ce5654f523 100755 (executable)
@@ -105,18 +105,140 @@ get_proc ()
     cat "/proc/$1"
 }
 
+######################################################
+# Check that an RPC service is healthy -
+# this includes allowing a certain number of failures
+# before marking the NFS service unhealthy.
+#
+# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
+#
+# each triple is a set of 3 arguments: an operator, a 
+# fail count limit and an action string.
+#
+# For example:
+#
+#      nfs_check_rpc_service "lockd" \
+#          -ge 15 "verbose restart unhealthy" \
+#          -eq 10 "restart:bs"
+#
+# says that if lockd is down for 15 iterations then do
+# a verbose restart of lockd and mark the node unhealthy.
+# Before this, after 10 iterations of failure, the
+# service is restarted silently in the background.
+# Order is important: the failure counts need to be
+# specified in descending order because processing stops
+# after the first condition that is true.
+######################################################
+nfs_check_rpc_service ()
+{
+    _prog_name="$1" ; shift
+
+    _version=1
+    _rpc_prog="$_prog_name"
+    _restart=""
+    _opts=""
+    case "$_prog_name" in
+       knfsd)
+           _rpc_prog=nfs
+           _version=3
+           _restart="echo 'Trying to restart NFS service'"
+           _restart="${_restart}; startstop_nfs restart"
+           ;;
+       mountd)
+           _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+           ;;
+       rquotad)
+           _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+           ;;
+       lockd)
+           _rpc_prog=nlockmgr
+           _version=4
+           _restart="echo 'Trying to restart lock manager service'"
+           _restart="${_restart}; startstop_nfslock restart"
+           ;;
+       statd)
+           _rpc_prog=status
+           _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+           _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}"
+           _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+           ;;
+       *)
+           echo "Internal error: unknown RPC program \"$_prog_name\"."
+           exit 1
+    esac
+
+    _service_name="nfs_${_prog_name}"
+
+    if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
+       ctdb_counter_init "$_service_name"
+       return 0
+    fi
+
+    ctdb_counter_incr "$_service_name"
+
+    while [ -n "$3" ] ; do
+       ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || {
+           for _action in $3 ; do
+               case "$_action" in
+                   verbose)
+                       echo "$ctdb_check_rpc_out"
+                       ;;
+                   restart|restart:*)
+                       # No explicit command specified, construct rpc command.
+                       if [ -z "$_restart" ] ; then
+                           _p="rpc.${_prog_name}"
+                           _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'"
+                           _restart="${_restart}; killall -q -9 $_p"
+                           _restart="${_restart}; $_p $_opts"
+                       fi
+
+                       # Process restart flags...
+                       _flags="${_action#restart:}"
+                       # There may not have been a colon...
+                       [ "$_flags" != "$_action" ] || _flags=""
+                       # q=quiet - everything to /dev/null
+                       if [ "${_flags#*q}" != "$_flags" ] ; then
+                           _restart="{ ${_restart} ; } >/dev/null 2>&1"
+                       fi
+                       # s=stealthy - last command to /dev/null
+                       if [ "${_flags#*s}" != "$_flags" ] ; then
+                           _restart="${_restart} >/dev/null 2>&1"
+                       fi
+                       # b=background - the whole thing, easy and reliable
+                       if [ "${_flags#*b}" != "$_flags" ] ; then
+                           _restart="{ ${_restart} ; } &"
+                       fi
+
+                       # Do it!
+                       eval "${_restart}"
+                       ;;
+                   unhealthy)
+                       exit 1
+                       ;;
+                   *)
+                       echo "Internal error: unknown action \"$_action\"."
+                       exit 1
+               esac
+           done
+
+           # Only process the first action group.
+           break
+       }
+       shift 3
+    done
+}
+
 ######################################################
 # check that a rpc server is registered with portmap
 # and responding to requests
-# usage: ctdb_check_rpc SERVICE_NAME PROGNUM VERSION
+# usage: ctdb_check_rpc SERVICE_NAME VERSION
 ######################################################
-ctdb_check_rpc() {
+ctdb_check_rpc ()
+{
     progname="$1"
-    prognum="$2"
-    version="$3"
+    version="$2"
 
-    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
-    if [ $? -ne 0 ] ; then
+    if ! ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then
        ctdb_check_rpc_out="ERROR: $progname failed RPC check:
 $ctdb_check_rpc_out"
        echo "$ctdb_check_rpc_out"
index bd7834daad0d228aedffe8dc78f2f51876511687..9da3cc806562f1601797ecf040b0a5762a6de39c 100644 (file)
@@ -45,8 +45,8 @@ static const struct {
        { "DatabaseMaxDead",      5,  offsetof(struct ctdb_tunable, database_max_dead) },
        { "RerecoveryTimeout",   10,  offsetof(struct ctdb_tunable, rerecovery_timeout) },
        { "EnableBans",           1,  offsetof(struct ctdb_tunable, enable_bans) },
-       { "DeterministicIPs",     1,  offsetof(struct ctdb_tunable, deterministic_public_ips) },
-       { "LCP2PublicIPs",        0,  offsetof(struct ctdb_tunable, lcp2_public_ip_assignment) },
+       { "DeterministicIPs",     0,  offsetof(struct ctdb_tunable, deterministic_public_ips) },
+       { "LCP2PublicIPs",        1,  offsetof(struct ctdb_tunable, lcp2_public_ip_assignment) },
        { "ReclockPingPeriod",   60,  offsetof(struct ctdb_tunable,  reclock_ping_period) },
        { "NoIPFailback",         0,  offsetof(struct ctdb_tunable, no_ip_failback) },
        { "DisableIPFailover",    0,  offsetof(struct ctdb_tunable, disable_ip_failover) },
old mode 100644 (file)
new mode 100755 (executable)
index 924d953..8854780
@@ -7,12 +7,6 @@ define_test "port 445 down"
 setup_samba
 tcp_port_down 445
 
-required_result 1 <<EOF
-ERROR: samba tcp port 445 is not responding
-netstat -l -t -n shows this output:
-Active Internet connections (servers only)
-Proto Recv-Q Send-Q Local Address           Foreign Address         State
-tcp        0      0 0.0.0.0:139             0.0.0.0:*               LISTEN
-EOF
+required_result 1 "ERROR: samba tcp port 445 is not responding"
 
 simple_test
old mode 100644 (file)
new mode 100755 (executable)
index c05b8d4..a29e783
@@ -7,12 +7,6 @@ define_test "port 139 down"
 setup_samba
 tcp_port_down 139
 
-required_result 1 <<EOF
-ERROR: samba tcp port 139 is not responding
-netstat -l -t -n shows this output:
-Active Internet connections (servers only)
-Proto Recv-Q Send-Q Local Address           Foreign Address         State
-tcp        0      0 0.0.0.0:445             0.0.0.0:*               LISTEN
-EOF
+required_result 1 "ERROR: samba tcp port 139 is not responding"
 
 simple_test
index 093c1578f474e360c8d2d81112b0decfc76e4607..13beaec6393fefebd5430a4e36954c358fd5f7de 100755 (executable)
@@ -37,5 +37,5 @@ try_command_on_node -v 0 "$CTDB listvars"
 
 sanity_check_output \
     5 \
-    '^[[:alpha:]]+[[:space:]]*=[[:space:]]*[[:digit:]]+$' \
+    '^[[:alpha:]][[:alnum:]]+[[:space:]]*=[[:space:]]*[[:digit:]]+$' \
     "$out"