ctdb-scripts: Simplify the names of NFS fail counter files

[samba.git] / ctdb / config / events.d / 60.nfs
diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs

index ff56166ce1cbfc6857eb5a43104b2410446b2fe8..f630a3d861ef080866f33489392a018be45d8531 100755 (executable)
--- a/ctdb/config/events.d/60.nfs
+++ b/ctdb/config/events.d/60.nfs
@@ -1,181 +1,299 @@
  #!/bin/sh
  # script to manage nfs in a clustered environment
  
-start_nfs() {
-       /bin/mkdir -p $CTDB_VARDIR/state/nfs
-       /bin/mkdir -p $CTDB_VARDIR/state/statd/ip
-       startstop_nfs stop
-       startstop_nfs start
-       echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle
-}
+# Derive CTDB_BASE from this script's location when it is not already
+# set.  The steps are chained with && so that a failing dirname or cd
+# leaves CTDB_BASE empty rather than naming the parent of whatever
+# directory the script happened to be started from.
+[ -n "$CTDB_BASE" ] || \
+    CTDB_BASE=$(d=$(dirname "$0") && cd -P "$d" && dirname "$PWD")
  
-. $CTDB_BASE/functions
+. "${CTDB_BASE}/functions"
  
  service_name="nfs"
-service_start="start_nfs"
-service_stop="startstop_nfs stop"
  
  loadconfig
  
-ctdb_start_stop_service
+ctdb_setup_state_dir "service" "$service_name"
+
+######################################################################
+
+service_reconfigure ()
+{
+    # Nothing to do when the statd-callout helper is not executable
+    [ -x "${CTDB_BASE}/statd-callout" ] || return 0
+
+    # Restart lock manager, notify clients: done by the helper, which
+    # runs in the background with all of its output discarded
+    "${CTDB_BASE}/statd-callout" notify >/dev/null 2>&1 &
+}
+
+######################################################################
+
+######################################################
+# Run the health check for each NFS service
+#
+# Services are described by .check files in $CTDB_NFS_CHECKS_DIR,
+# which defaults to "${CTDB_BASE}/nfs-checks.d/"
+######################################################
+nfs_check_services ()
+{
+    _dir="${CTDB_NFS_CHECKS_DIR:-${CTDB_BASE}/nfs-checks.d}"
+
+    # Only NN.<progname>.check names are considered - this avoids
+    # editor backups, RPM leftovers, ...
+    for _f in "$_dir"/[0-9][0-9].*.check ; do
+       if [ -r "$_f" ] ; then
+           # Drop the ".check" suffix, then the directory and "NN."
+           # prefix, which leaves the program name
+           _t="${_f%.check}"
+           _progname="${_t##*/[0-9][0-9].}"
+
+           # The settings in the .check file are read from stdin
+           nfs_check_service "$_progname" <"$_f"
+       fi
+    done
+}
+
+######################################################
+# Check the health of an NFS service
+#
+# $1 - progname, passed to rpcinfo (looked up in /etc/rpc)
+#
+# Reads variables from stdin
+#
+# Variables are:
+#
+# * family             - "tcp" or "udp" or space separated list
+#                        default: tcp, not used with "service_check_cmd"
+# * version            - optional, RPC service version number
+#                        default is to omit to check for any version,
+#                        not used with "service_check_cmd"
+# * unhealthy_after    - number of check fails before unhealthy
+#                        default: 1
+# * restart_every      - number of check fails before restart
+#                        default: 0, meaning no restart
+# * service_stop_cmd   - command to stop service
+#                        default: no default, must be provided if
+#                                 restart_every > 0
+# * service_start_cmd  - command to start service
+#                        default: no default, must be provided if
+#                                 restart_every > 0
+# * service_check_cmd  - command to check health of service
+#                        default is to check RPC service using rpcinfo
+# * service_debug_cmd  - command to debug a service after trying to stop it;
+#                        for example, it can be useful to print stack
+#                        traces of threads that have not exited, since
+#                        they may be stuck doing I/O;
+#                        no default, see also function program_stack_traces()
+#
+# Quoting in values is not preserved
+#
+######################################################
+nfs_check_service ()
+{
+    _progname="$1"
+
+    # This sub-shell is created to intentionally limit the scope of
+    # variable values read from the .check files.
+    # shellcheck disable=SC2030
+    (
+       # Defaults, used for any variable the .check file does not set
+       family="tcp"
+       version=""
+       unhealthy_after=1
+       restart_every=0
+       service_stop_cmd=""
+       service_start_cmd=""
+       service_check_cmd=""
+       service_debug_cmd=""
+
+       # Eval line-by-line.  Expands variable references in values.
+       # Also allows variable name checking, which seems useful.
+       while read _line ; do
+           case "$_line" in
+               \#*|"") : ;; # Ignore comments, blank lines
+
+               family=*|version=*|\
+               unhealthy_after=*|restart_every=*|\
+               service_stop_cmd=*|service_start_cmd=*|\
+               service_check_cmd=*|service_debug_cmd=*)
+
+                   eval "$_line"
+                   ;;
+               *)
+                   echo "ERROR: Unknown variable for ${_progname}: ${_line}"
+                   exit 1
+                   ;;
+           esac
+       done
+
+       # Run the custom check command if one is configured, otherwise
+       # fall back to an rpcinfo based check.  On failure $_err holds
+       # the message to report.
+       _ok=false
+       if [ -n "$service_check_cmd" ] ; then
+           # Using eval means variables can contain semicolon separated commands
+           if eval "$service_check_cmd" ; then
+               _ok=true
+           else
+               _err="monitoring service \"${_progname}\" failed"
+           fi
+       else
+           if nfs_check_rpcinfo \
+                  "$_progname" "$version" "$family" >/dev/null ; then
+               _ok=true
+           else
+               _err="$ctdb_check_rpc_out"
+           fi
+       fi
+
+       if $_ok ; then
+           # Re-initialise the fail counter on success, unless the
+           # default settings (unhealthy on the first failure, never
+           # restart) are in effect.  The tests are quoted and joined
+           # with || rather than the obsolescent -o operator.
+           if [ "$unhealthy_after" -ne 1 ] || \
+                  [ "$restart_every" -ne 0 ] ; then
+               ctdb_counter_init "$_progname"
+           fi
+           exit 0
+       fi
+
+       # Record this failure and fetch the updated count
+       ctdb_counter_incr "$_progname"
+       _failcount=$(ctdb_counter_get "$_progname")
+
+       # Report unhealthy once the fail count reaches unhealthy_after;
+       # an unhealthy_after of 0 or less never reports unhealthy
+       _unhealthy=false
+       if [ "$unhealthy_after" -gt 0 ] ; then
+           if [ "$_failcount" -ge "$unhealthy_after" ] ; then
+               _unhealthy=true
+               echo "ERROR: $_err"
+           fi
+       fi
+
+       # Attempt a restart whenever the fail count is a multiple of
+       # restart_every.  The failure is only a warning if it has not
+       # already been reported as an error above.
+       if [ "$restart_every" -gt 0 ] ; then
+           if [ $((_failcount % restart_every)) -eq 0 ] ; then
+               if ! $_unhealthy ; then
+                   echo "WARNING: $_err"
+               fi
+               nfs_restart_service
+           fi
+       fi
+
+       if $_unhealthy ; then
+           exit 1
+       fi
+
+       # Leave the sub-shell with exit, like every other path above
+       exit 0
+    ) || exit 1
+}
+
+# Restart the service currently being checked.
+#
+# Uses: _progname, service_stop_cmd, service_start_cmd, service_debug_cmd
+#
+# This function is called within the sub-shell that shellcheck thinks
+# loses the above variable values.
+# shellcheck disable=SC2031
+nfs_restart_service ()
+{
+    # Both commands are required.  Two separate tests are used because
+    # the command strings are arbitrary text, which makes a single test
+    # joined with the obsolescent -o operator ambiguous.
+    if [ -z "$service_stop_cmd" ] || [ -z "$service_start_cmd" ] ; then
+       die "ERROR: Can not restart service \"${_progname}\" without corresponding service_start_cmd/service_stop_cmd settings"
+    fi
+
+    echo "Trying to restart service \"${_progname}\"..."
+    # Using eval means variables can contain semicolon separated commands
+    eval "$service_stop_cmd"
+    # The optional debug command runs after the stop attempt and before
+    # the service is started again
+    if [ -n "$service_debug_cmd" ] ; then
+       eval "$service_debug_cmd"
+    fi
+    background_with_logging eval "$service_start_cmd"
+}
+
+######################################################
+# Probe a single RPC service with rpcinfo
+#
+# $1 - progname, passed to rpcinfo (looked up in /etc/rpc)
+# $2 - version, optional, not passed if empty/unset
+# $3 - family, optional, default is "tcp"
+#
+# On failure the message is stored in $ctdb_check_rpc_out, printed
+# to stdout and 1 is returned
+######################################################
+ctdb_check_rpc ()
+{
+    _progname="$1"
+    _version="$2"
+    _family="${3:-tcp}"
+
+    # The IPv6 families are probed via the IPv6 loopback address
+    if [ "$_family" = "tcp6" ] || [ "$_family" = "udp6" ] ; then
+       _localhost="${CTDB_RPCINFO_LOCALHOST6:-::1}"
+    else
+       _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"
+    fi
+
+    # $_version is not quoted because it is optional
+    # shellcheck disable=SC2086
+    ctdb_check_rpc_out=$(rpcinfo -T "$_family" "$_localhost" \
+                                "$_progname" $_version 2>&1) && return 0
+
+    # rpcinfo failed: prefix its output with the program name
+    ctdb_check_rpc_out="$_progname failed RPC check:
+$ctdb_check_rpc_out"
+    echo "$ctdb_check_rpc_out"
+    return 1
+}
+
+# Check an RPC service for each combination of family and version,
+# returning the status of the first check that fails
+#
+# $1 - progname, passed to rpcinfo (looked up in /etc/rpc)
+# $2 - versions, optional, space separated, not passed if empty/unset
+# $3 - families, optional, space separated, default is "tcp"
+nfs_check_rpcinfo ()
+{
+    _progname="$1"
+    _versions="$2"
+    _families="${3:-tcp}"
+
+    for _family in $_families ; do
+       # No versions given: one check per family, without a version
+       if [ -z "$_versions" ] ; then
+           ctdb_check_rpc "$_progname" "" "$_family" || return $?
+           continue
+       fi
+
+       for _version in $_versions ; do
+           ctdb_check_rpc "$_progname" "$_version" "$_family" || return $?
+       done
+    done
+}
+
+##################################################################
+# Update NFS lock info via statd-callout, when it is executable
+##################################################################
+nfs_update_lock_info ()
+{
+    [ -x "$CTDB_BASE/statd-callout" ] || return 0
+
+    "$CTDB_BASE/statd-callout" update
+}
+
+######################################################################
+
+# script_state_dir set by ctdb_setup_state_dir()
+# shellcheck disable=SC2154
+nfs_callout_init "$script_state_dir"
  
  is_ctdb_managed_service || exit 0
  
-case "$1" in 
-     init)
-       # read statd from persistent database
-       ;;
-     startup)
-       ctdb_service_start
-       mkdir -p $CTDB_VARDIR/state/statd
-       touch $CTDB_VARDIR/state/statd/update-trigger
+case "$1" in
+startup)
+       nfs_callout "$@" || exit $?
         ;;
  
-     shutdown)
-       ctdb_service_stop
+shutdown)
+       nfs_callout "$@" || exit $?
         ;;
  
-     takeip)
+takeip)
+       nfs_callout "$@" || exit $?
         ctdb_service_set_reconfigure
         ;;
  
-     releaseip)
+releaseip)
+       nfs_callout "$@" || exit $?
         ctdb_service_set_reconfigure
         ;;
  
-      monitor)
+ipreallocated)
         if ctdb_service_needs_reconfigure ; then
-           ctdb_service_reconfigure
-           exit 0
+               ctdb_service_reconfigure
         fi
+       ;;
  
-       update_tickles 2049
+monitor)
+       nfs_callout "monitor-pre" || exit $?
  
-       # check that statd responds to rpc requests
-       # if statd is not running we try to restart it
-       if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
-               (service_name="nfs_statd"; ctdb_counter_init)
-       else
-               p="rpc.statd" ; cmd="$p"
-               cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
-               cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
-               cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
-               (
-                       service_name="nfs_statd"
-                       ctdb_counter_incr
-                       ctdb_check_counter_limit 10 quiet >/dev/null
-               ) || {
-                       echo "$ctdb_check_rpc_out"
-                       echo "Trying to restart STATD [$cmd]"
-               }
-               $cmd
+       # Check that directories for shares actually exist
+       if [ "$CTDB_NFS_SKIP_SHARE_CHECK" != "yes" ] ; then
+           nfs_callout "monitor-list-shares" | ctdb_check_directories || \
+               exit $?
         fi
  
+       update_tickles 2049
+       nfs_update_lock_info
  
-       # check that NFS responds to rpc requests
-       [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
-           if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
-               (service_name="nfs_knfsd"; ctdb_counter_init)
-           else
-               (
-                       service_name="nfs_knfsd"
-                       ctdb_counter_incr
-
-                       ctdb_check_counter_equal 10 || {
-                               echo "Trying to restart NFS service"
-                               startstop_nfs restart >/dev/null 2>&1 &
-                               exit 0
-                       }
-
-                       ctdb_check_counter_limit 15 quiet >/dev/null
-               ) || {
-                       echo "$ctdb_check_rpc_out"
-                       echo "Trying to restart NFS service"
-                       startstop_nfs restart
-                       exit 1
-               }
-           fi
-       }
-
-       # and that its directories are available
-       [ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || {
-           exportfs | grep -v '^#' | grep '^/' |
-           sed -e 's/[[:space:]]\+[^[:space:]]*$//' |
-           ctdb_check_directories
-       } || exit $?
-
-       # check that lockd responds to rpc requests
-       ctdb_check_rpc "LOCKD" 100021 1 || {
-               echo "Trying to restart lock manager service"
-               startstop_nfs restart
-               startstop_nfslock restart
-               exit 1
-       }
-
-       # mount needs special handling since it is sometimes not started
-       # correctly on RHEL5
-       if ctdb_check_rpc "MOUNTD" 100005 1 >/dev/null ; then
-               (service_name="nfs_mountd"; ctdb_counter_init)
-       else
-       (
-               service_name="nfs_mountd"
-               ctdb_counter_incr
-
-               ctdb_check_counter_equal 5 || {
-                       p="rpc.mountd"
-                       cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
-                       echo "Trying to restart MOUNTD [${cmd}]"
-                       killall -q -9 $p
-                       $cmd &
-                       exit 0
-               }
-
-               ctdb_check_counter_limit 10 quiet >/dev/null
-       ) || {
-               echo "$ctdb_check_rpc_out"
-               p="rpc.mountd"
-               cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
-               echo "Trying to restart MOUNTD [${cmd}]"
-               killall -q -9 $p
-               $cmd &
-               exit 1
-       }
-       fi
-
+       nfs_check_services
  
-       # rquotad needs special handling since it is sometimes not started
-       # correctly on RHEL5
-       # this is not a critical service so we dont flag the node as unhealthy
-       ctdb_check_rpc "RQUOTAD" 100011 1 || {
-               p="rpc.rquotad"
-               cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
-               echo "Trying to restart RQUOTAD [${cmd}]"
-               killall -q -9 $p
-               $cmd &
-       }
-
-       # once every 60 seconds, update the statd state database for which
-       # clients need notifications
-       LAST_UPDATE=`stat --printf="%Y" $CTDB_VARDIR/state/statd/update-trigger`
-       CURRENT_TIME=`date +"%s"`
-       [ $CURRENT_TIME -ge $(($LAST_UPDATE + 60)) ] && {
-           mkdir -p $CTDB_VARDIR/state/statd
-           touch $CTDB_VARDIR/state/statd/update-trigger
-           $CTDB_BASE/statd-callout updatelocal &
-           $CTDB_BASE/statd-callout updateremote &
-       }
+       nfs_callout "monitor-post" || exit $?
                 ;;
-
-    ipreallocated)
-       # if the ips have been reallocated, we must restart the lockmanager
-       # across all nodes and ping all statd listeners
-       [ -x $CTDB_BASE/statd-callout ] && {
-               $CTDB_BASE/statd-callout notify &
-       } >/dev/null 2>&1
-       ;;
-    *)
-       ctdb_standard_event_handler "$@"
-       ;;
  esac
  
  exit 0