Eventscripts: In 60.nfs don't restart NFS when restarting rpc.lockd.

[ctdb.git] / config / functions
diff --git a/config/functions b/config/functions

index 452b8d0d976d5edb34497ae2810bfb3e9502f317..b04965281dbfcfe987dab89fab67a3ce5654f523 100755 (executable)
--- a/config/functions
+++ b/config/functions
@@ -105,18 +105,140 @@ get_proc ()
      cat "/proc/$1"
  }
  
+######################################################
+# Check that an RPC service is healthy -
+# this includes allowing a certain number of failures
+# before marking the NFS service unhealthy.
+#
+# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
+#
+# each triple is a set of 3 arguments: an operator, a 
+# fail count limit and an action string.
+#
+# For example:
+#
+#      nfs_check_rpc_service "lockd" \
+#          -ge 15 "verbose restart unhealthy" \
+#          -eq 10 "restart:bs"
+#
+# says that if lockd is down for 15 iterations then do
+# a verbose restart of lockd and mark the node unhealthy.
+# Before this, after 10 iterations of failure, the
+# service is restarted silently in the background.
+# Order is important: the failure counts need to be
+# specified in descending order because processing stops
+# after the first condition that is true.
+######################################################
+nfs_check_rpc_service ()
+{
+    _prog_name="$1" ; shift  # what remains in "$@" are the (operator, limit, actions) triples
+
+    _version=1  # RPC version handed to ctdb_check_rpc unless overridden below
+    _rpc_prog="$_prog_name"  # RPC program name to check; some services override it below
+    _restart=""  # custom restart command; if left empty an "rpc.<name>" restart is built later
+    _opts=""  # extra command-line options for the default "rpc.<name>" restart
+    case "$_prog_name" in
+       knfsd)
+           _rpc_prog=nfs
+           _version=3
+           _restart="echo 'Trying to restart NFS service'"
+           _restart="${_restart}; startstop_nfs restart"
+           ;;
+       mountd)
+           _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+           ;;
+       rquotad)
+           _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+           ;;
+       lockd)
+           _rpc_prog=nlockmgr
+           _version=4
+           _restart="echo 'Trying to restart lock manager service'"
+           _restart="${_restart}; startstop_nfslock restart"
+           ;;
+       statd)
+           _rpc_prog=status
+           _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+           _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}"
+           _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+           ;;
+       *)
+           echo "Internal error: unknown RPC program \"$_prog_name\"."
+           exit 1
+    esac
+
+    _service_name="nfs_${_prog_name}"  # name of the failure counter for this RPC service
+
+    if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
+       ctdb_counter_init "$_service_name"  # check passed: reset the failure counter
+       return 0
+    fi
+
+    ctdb_counter_incr "$_service_name"  # check failed: record one more failure
+
+    while [ -n "$3" ] ; do  # one pass per triple: $1=operator, $2=limit, $3=action string
+       ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || {  # non-zero status: "count $1 $2" holds
+           for _action in $3 ; do
+               case "$_action" in
+                   verbose)
+                       echo "$ctdb_check_rpc_out"  # message saved by ctdb_check_rpc; its own echo went to /dev/null above
+                       ;;
+                   restart|restart:*)
+                       # No explicit command specified, construct rpc command.
+                       if [ -z "$_restart" ] ; then
+                           _p="rpc.${_prog_name}"
+                           _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'"
+                           _restart="${_restart}; killall -q -9 $_p"
+                           _restart="${_restart}; $_p $_opts"
+                       fi
+
+                       # Process restart flags...
+                       _flags="${_action#restart:}"
+                       # There may not have been a colon...
+                       [ "$_flags" != "$_action" ] || _flags=""
+                       # q=quiet - everything to /dev/null
+                       if [ "${_flags#*q}" != "$_flags" ] ; then
+                           _restart="{ ${_restart} ; } >/dev/null 2>&1"
+                       fi
+                       # s=stealthy - last command to /dev/null
+                       if [ "${_flags#*s}" != "$_flags" ] ; then
+                           _restart="${_restart} >/dev/null 2>&1"
+                       fi
+                       # b=background - the whole thing, easy and reliable
+                       if [ "${_flags#*b}" != "$_flags" ] ; then
+                           _restart="{ ${_restart} ; } &"
+                       fi
+
+                       # Do it!
+                       eval "${_restart}"
+                       ;;
+                   unhealthy)
+                       exit 1  # ends the calling script with status 1
+                       ;;
+                   *)
+                       echo "Internal error: unknown action \"$_action\"."
+                       exit 1
+               esac
+           done
+
+           # Only process the first action group.
+           break
+       }
+       shift 3  # condition not met: move on to the next triple
+    done
+}
+
  ######################################################
  # check that a rpc server is registered with portmap
  # and responding to requests
-# usage: ctdb_check_rpc SERVICE_NAME PROGNUM VERSION
+# usage: ctdb_check_rpc SERVICE_NAME VERSION
  ######################################################
-ctdb_check_rpc() {
+ctdb_check_rpc ()
+{
      progname="$1"
-    prognum="$2"
-    version="$3"
+    version="$2"
  
-    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
-    if [ $? -ne 0 ] ; then
+    if ! ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then
         ctdb_check_rpc_out="ERROR: $progname failed RPC check:
  $ctdb_check_rpc_out"
         echo "$ctdb_check_rpc_out"
@@ -355,8 +477,8 @@ startstop_nfs() {
                         service nfs start
                         ;;
                 stop)
-                       service nfs stop > /dev/null 2>&1
-                       service nfslock stop > /dev/null 2>&1
+                       service nfs stop
+                       service nfslock stop
                         ;;
                 restart)
                         set_proc "fs/nfsd/threads" 0
@@ -425,18 +547,6 @@ startstop_nfslock() {
         esac
  }
  
-# better use delete_ip_from_iface() together with add_ip_to_iface
-# remove_ip should be removed in future
-remove_ip() {
-       local _ip_maskbits=$1
-       local _iface=$2
-       local _ip=`echo "$_ip_maskbits" | cut -d '/' -f1`
-       local _maskbits=`echo "$_ip_maskbits" | cut -d '/' -f2`
-
-       delete_ip_from_iface "$_iface" "$_ip" "$_maskbits"
-       return $?
-}
-
  add_ip_to_iface()
  {
         local _iface=$1
@@ -515,23 +625,23 @@ setup_iface_ip_readd_script()
  # ctdb_check_counter_limit succeeds when count >= <limit>
  ########################################################
  _ctdb_counter_common () {
-    _service_name="${$1:-${service_name}}"
+    _service_name="${1:-${service_name}}"  # counter name: $1 if given, else $service_name
      _counter_file="$ctdb_fail_dir/$_service_name"
      mkdir -p "${_counter_file%/*}" # dirname
  }
  ctdb_counter_init () {
-    _ctdb_counter_common "$@"
+    _ctdb_counter_common "$1"  # sets $_counter_file; the redirect below empties it (count = 0)
  
      >"$_counter_file"
  }
  ctdb_counter_incr () {
-    _ctdb_counter_common "$@"
+    _ctdb_counter_common "$1"  # sets $_counter_file; one byte is appended to it per failure
  
      # unary counting!
      echo -n 1 >> "$_counter_file"
  }
  ctdb_check_counter_limit () {
-    _ctdb_counter_common "$@"
+    _ctdb_counter_common
  
      _limit="${1:-${service_fail_limit}}"
      _quiet="$2"
@@ -546,7 +656,7 @@ ctdb_check_counter_limit () {
      fi
  }
  ctdb_check_counter_equal () {
-    _ctdb_counter_common "$@"
+    _ctdb_counter_common
  
      _limit=$1
  
@@ -557,6 +667,24 @@ ctdb_check_counter_equal () {
      fi
      return 0
  }
+ctdb_check_counter () {
+    _msg="${1:-error}"  # "error"  - anything else is silent on fail
+    _op="${2:--ge}"  # an integer operator supported by test
+    _limit="${3:-${service_fail_limit}}"
+    shift 3
+    _ctdb_counter_common "$1"
+
+    # unary counting!
+    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
+    if [ $_size $_op $_limit ] ; then
+       if [ "$_msg" = "error" ] ; then
+           echo "ERROR: $_limit consecutive failures for $_service_name, marking node unhealthy"
+           exit 1              
+       else
+           return 1
+       fi
+    fi
+}
  
  ########################################################
  
@@ -639,35 +767,76 @@ ctdb_setstatus ()
      esac
  }
  
+##################################################################
+# Reconfigure a service on demand
+
+_ctdb_service_reconfigure_common ()
+{
+    _d="$ctdb_status_dir/${1:-${service_name}}"  # per-service status directory ($1, else $service_name)
+    mkdir -p "$_d"
+    _ctdb_service_reconfigure_flag="$_d/reconfigure"  # flag file tested, created and removed by the callers
+}
+
  ctdb_service_needs_reconfigure ()
  {
-    [ -e "$ctdb_status_dir/$service_name/reconfigure" ]
+    _ctdb_service_reconfigure_common "$@"
+    [ -e "$_ctdb_service_reconfigure_flag" ]  # succeeds only if the reconfigure flag file exists
  }
  
  ctdb_service_set_reconfigure ()
  {
-    d="$ctdb_status_dir/$service_name"
-    mkdir -p "$d"
-    >"$d/reconfigure"
+    _ctdb_service_reconfigure_common "$@"
+    >"$_ctdb_service_reconfigure_flag"  # create (or truncate) the flag file to request a reconfigure
  }
  
  ctdb_service_unset_reconfigure ()
  {
-    rm -f "$ctdb_status_dir/$service_name/reconfigure"
+    _ctdb_service_reconfigure_common "$@"
+    rm -f "$_ctdb_service_reconfigure_flag"  # clear the request; -f keeps this quiet if the flag is absent
  }
  
  ctdb_service_reconfigure ()
  {
      echo "Reconfiguring service \"$service_name\"..."
-    if [ -n "$service_reconfigure" ] ; then
-       eval $service_reconfigure
-    else
-       service "$service_name" restart
+    ctdb_service_unset_reconfigure "$@"  # NOTE(review): the echo above names $service_name even when $1 selects another service
+    service_reconfigure "$@" || return $?  # hand a failed reconfigure back to the caller
+    ctdb_counter_init "$@"  # reconfigure succeeded: reset the failure counter
+}
+
+# Default service_reconfigure() function.
+service_reconfigure ()
+{
+    service "${1:-$service_name}" restart  # plain restart of $1 (else $service_name); presumably redefined by scripts needing more - TODO confirm
+}
+
+ctdb_service_check_reconfigure ()
+{
+    # Only the "monitor" and "ipreallocated" events are handled here.
+    case "$event_name" in
+       monitor|ipreallocated) : ;;
+       *) return 0
+    esac
+
+    if ctdb_service_needs_reconfigure "$@" ; then
+       ctdb_service_reconfigure "$@"  # NOTE(review): its exit status is not checked here
+
+       # For any event other than "monitor", return so the caller carries on.
+       [ "$event_name" = "monitor" ] || return 0
+
+       # We don't want to proceed with the rest of the monitor event
+       # here, so we exit.  However, if we exit 0 then, if the
+       # service was previously broken, we might return a false
+       # positive.  So we simply retrieve the status of this script
+       # from the previous monitor loop and exit with that status.
+       ctdb scriptstatus | \
+           grep -q -E "^${script_name}[[:space:]]+Status:OK[[:space:]]"
+       exit $?  # grep's status: 0 only if an OK line for this script was found
      fi
-    ctdb_service_unset_reconfigure
-    ctdb_counter_init
  }
  
+##################################################################
+# Does CTDB manage this service? - and associated auto-start/stop
+
  ctdb_compat_managed_service ()
  {
      if [ "$1" = "yes" -a "$2" = "$_service_name" ] ; then
@@ -714,23 +883,24 @@ ctdb_start_stop_service ()
  
      if is_ctdb_managed_service "$_service_name" ; then
         if ! is_ctdb_previously_managed_service "$_service_name" ; then
-           echo "Starting service $_service_name"
-           ctdb_service_start || exit $?
-           ctdb_service_managed "$_service_name"
-           exit 0
+           echo "Starting service \"$_service_name\" - now managed"
+           ctdb_service_start "$_service_name"
+           exit $?
         fi
      else
         if is_ctdb_previously_managed_service "$_service_name" ; then
-           echo "Stopping service $_service_name"
-           ctdb_service_stop || exit $?
-           ctdb_service_unmanaged "$_service_name"
-           exit 0
+           echo "Stopping service \"$_service_name\" - no longer managed"
+           ctdb_service_stop "$_service_name"
+           exit $?
         fi
      fi
  }
  
  ctdb_service_start ()
  {
+    # The service is marked managed if we've ever tried to start it.
+    ctdb_service_managed "$@"
+
      # Here we only want $1.  If no argument is passed then
      # service_start needs to know.
      service_start "$@" || return $?
@@ -740,6 +910,7 @@ ctdb_service_start ()
  
  ctdb_service_stop ()
  {
+    ctdb_service_unmanaged "$@"  # runs before service_stop, so it happens regardless of whether the stop succeeds
      service_stop "$@"
  }