X-Git-Url: http://git.samba.org/?a=blobdiff_plain;f=config%2Ffunctions;h=b04965281dbfcfe987dab89fab67a3ce5654f523;hb=380c9263eb37db5a250264316e250c2160908263;hp=452b8d0d976d5edb34497ae2810bfb3e9502f317;hpb=86e4aefed9fd1028660c98e3ea758c2b75ffc1d8;p=ctdb.git

diff --git a/config/functions b/config/functions
index 452b8d0d..b0496528 100755
--- a/config/functions
+++ b/config/functions
@@ -105,18 +105,140 @@ get_proc ()
     cat "/proc/$1"
 }
 
+######################################################
+# Check that an RPC service is healthy -
+# this includes allowing a certain number of failures
+# before marking the NFS service unhealthy.
+#
+# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
+#
+# Each triple is a set of 3 arguments: a test(1) integer
+# operator, a fail count limit and an action string.
+#
+# For example:
+#
+# 	nfs_check_rpc_service "lockd" \
+#	    -ge 15 "verbose restart unhealthy" \
+#	    -eq 10 "restart:bs"
+#
+# says that if lockd has been down for 15 or more iterations
+# then do a verbose restart of lockd and mark the node unhealthy.
+# Before this, after exactly 10 iterations of failure, the
+# service is restarted silently in the background.
+# Order is important: the fail count limits need to be
+# specified in descending order because processing stops
+# after the first condition that is true.
+######################################################
+nfs_check_rpc_service ()
+{
+    _prog_name="$1" ; shift  # the remaining arguments are the triples
+
+    _version=1  # RPC version passed to ctdb_check_rpc unless overridden below
+    _rpc_prog="$_prog_name"  # RPC program name passed to ctdb_check_rpc
+    _restart=""  # left empty when the restart command is constructed later
+    _opts=""  # daemon options, only used by the constructed restart command
+    case "$_prog_name" in
+	knfsd)
+	    _rpc_prog=nfs
+	    _version=3
+	    _restart="echo 'Trying to restart NFS service'"
+	    _restart="${_restart}; startstop_nfs restart"
+	    ;;
+	mountd)
+	    _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+	    ;;
+	rquotad)
+	    _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+	    ;;
+	lockd)
+	    _rpc_prog=nlockmgr
+	    _version=4
+	    _restart="echo 'Trying to restart lock manager service'"
+	    _restart="${_restart}; startstop_nfslock restart"
+	    ;;
+	statd)
+	    _rpc_prog=status
+	    _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+	    _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}"
+	    _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+	    ;;
+	*)
+	    echo "Internal error: unknown RPC program \"$_prog_name\"."
+	    exit 1
+    esac
+
+    _service_name="nfs_${_prog_name}"  # name of the failure counter
+
+    if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
+	ctdb_counter_init "$_service_name"  # check passed: reset the counter
+	return 0
+    fi
+
+    ctdb_counter_incr "$_service_name"  # check failed: one more failure
+
+    while [ -n "$3" ] ; do  # $1=operator, $2=limit, $3=action string
+	ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || {
+	    for _action in $3 ; do  # unquoted on purpose: one word per action
+		case "$_action" in
+		    verbose)
+			echo "$ctdb_check_rpc_out"  # output saved by ctdb_check_rpc
+			;;
+		    restart|restart:*)
+			# No explicit command specified, construct rpc command.
+			if [ -z "$_restart" ] ; then
+			    _p="rpc.${_prog_name}"
+			    _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'"
+			    _restart="${_restart}; killall -q -9 $_p"
+			    _restart="${_restart}; $_p $_opts"
+			fi
+
+			# Process restart flags...
+			_flags="${_action#restart:}"
+			# There may not have been a colon...
+			[ "$_flags" != "$_action" ] || _flags=""
+			# q=quiet - everything to /dev/null
+			if [ "${_flags#*q}" != "$_flags" ] ; then
+			    _restart="{ ${_restart} ; } >/dev/null 2>&1"
+			fi
+			# s=stealthy - last command to /dev/null
+			if [ "${_flags#*s}" != "$_flags" ] ; then
+			    _restart="${_restart} >/dev/null 2>&1"
+			fi
+			# b=background - the whole thing, easy and reliable
+			if [ "${_flags#*b}" != "$_flags" ] ; then
+			    _restart="{ ${_restart} ; } &"
+			fi
+
+			# Do it!
+			eval "${_restart}"
+			;;
+		    unhealthy)
+			exit 1  # ends the calling script with a failure status
+			;;
+		    *)
+			echo "Internal error: unknown action \"$_action\"."
+			exit 1
+		esac
+	    done
+
+	    # Only process the first action group.
+	    break
+	}
+	shift 3  # move on to the next triple
+    done
+}
+
 ######################################################
 # check that a rpc server is registered with portmap
 # and responding to requests
-# usage: ctdb_check_rpc SERVICE_NAME PROGNUM VERSION
+# usage: ctdb_check_rpc SERVICE_NAME VERSION
 ######################################################
-ctdb_check_rpc() {
+ctdb_check_rpc ()
+{
     progname="$1"
-    prognum="$2"
-    version="$3"
+    version="$2"
 
-    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
-    if [ $? -ne 0 ] ; then
+    if ! ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then
 	ctdb_check_rpc_out="ERROR: $progname failed RPC check:
 $ctdb_check_rpc_out"
 	echo "$ctdb_check_rpc_out"
@@ -355,8 +477,8 @@ startstop_nfs() {
 			service nfs start
 			;;
 		stop)
-			service nfs stop > /dev/null 2>&1
-			service nfslock stop > /dev/null 2>&1
+			service nfs stop
+			service nfslock stop
 			;;
 		restart)
 			set_proc "fs/nfsd/threads" 0
@@ -425,18 +547,6 @@ startstop_nfslock() {
 	esac
 }
 
-# better use delete_ip_from_iface() together with add_ip_to_iface
-# remove_ip should be removed in future
-remove_ip() {
-	local _ip_maskbits=$1
-	local _iface=$2
-	local _ip=`echo "$_ip_maskbits" | cut -d '/' -f1`
-	local _maskbits=`echo "$_ip_maskbits" | cut -d '/' -f2`
-
-	delete_ip_from_iface "$_iface" "$_ip" "$_maskbits"
-	return $?
-}
-
 add_ip_to_iface()
 {
 	local _iface=$1
@@ -515,23 +625,23 @@ setup_iface_ip_readd_script()
 # ctdb_check_counter_limit succeeds when count >= <limit>
 ########################################################
 _ctdb_counter_common () {
-    _service_name="${$1:-${service_name}}"
+    _service_name="${1:-${service_name}}"
     _counter_file="$ctdb_fail_dir/$_service_name"
     mkdir -p "${_counter_file%/*}" # dirname
 }
 ctdb_counter_init () {
-    _ctdb_counter_common "$@"
+    _ctdb_counter_common "$1"
 
     >"$_counter_file"
 }
 ctdb_counter_incr () {
-    _ctdb_counter_common "$@"
+    _ctdb_counter_common "$1"
 
     # unary counting!
     echo -n 1 >> "$_counter_file"
 }
 ctdb_check_counter_limit () {
-    _ctdb_counter_common "$@"
+    _ctdb_counter_common
 
     _limit="${1:-${service_fail_limit}}"
     _quiet="$2"
@@ -546,7 +656,7 @@ ctdb_check_counter_limit () {
     fi
 }
 ctdb_check_counter_equal () {
-    _ctdb_counter_common "$@"
+    _ctdb_counter_common
 
     _limit=$1
 
@@ -557,6 +667,24 @@ ctdb_check_counter_equal () {
     fi
     return 0
 }
+# usage: ctdb_check_counter [MSG [OP [LIMIT [SERVICE_NAME]]]]
+# Tests "<failure count> OP LIMIT" for SERVICE_NAME.  If it holds then
+# exit 1 with an error message when MSG is "error", else return 1.
+ctdb_check_counter () {
+    _msg="${1:-error}"  # "error"  - anything else is silent on fail
+    _op="${2:--ge}"  # an integer operator supported by test
+    _limit="${3:-${service_fail_limit}}"
+    # No "shift 3" here: it fails if optional arguments were omitted.
+    _ctdb_counter_common "${4:-}"
+    # unary counting! - the file size is the number of failures
+    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
+    if [ "$_size" "$_op" "$_limit" ] ; then
+	[ "$_msg" = "error" ] || return 1
+	echo "ERROR: $_limit consecutive failures for $_service_name, marking node unhealthy"
+	exit 1
+    fi
+}
+
 
 ########################################################
 
@@ -639,35 +767,76 @@ ctdb_setstatus ()
     esac
 }
 
+##################################################################
+# Reconfigure a service on demand
+
+_ctdb_service_reconfigure_common ()
+{  # Set up the reconfigure flag path for service $1 (default: $service_name).
+    _d="$ctdb_status_dir/${1:-${service_name}}"  # per-service status directory
+    mkdir -p "$_d"  # make sure the directory exists before the flag is used
+    _ctdb_service_reconfigure_flag="$_d/reconfigure"  # result, used by callers
+}
+
 ctdb_service_needs_reconfigure ()
 {
-    [ -e "$ctdb_status_dir/$service_name/reconfigure" ]
+    _ctdb_service_reconfigure_common "$@"
+    [ -e "$_ctdb_service_reconfigure_flag" ]
 }
 
 ctdb_service_set_reconfigure ()
 {
-    d="$ctdb_status_dir/$service_name"
-    mkdir -p "$d"
-    >"$d/reconfigure"
+    _ctdb_service_reconfigure_common "$@"
+    >"$_ctdb_service_reconfigure_flag"
 }
 
 ctdb_service_unset_reconfigure ()
 {
-    rm -f "$ctdb_status_dir/$service_name/reconfigure"
+    _ctdb_service_reconfigure_common "$@"
+    rm -f "$_ctdb_service_reconfigure_flag"
 }
 
 ctdb_service_reconfigure ()
 {
-    echo "Reconfiguring service \"$service_name\"..."
-    if [ -n "$service_reconfigure" ] ; then
-	eval $service_reconfigure
-    else
-	service "$service_name" restart
+    echo "Reconfiguring service \"${1:-$service_name}\"..."  # $1 is optional
+    ctdb_service_unset_reconfigure "$@"  # clear the flag before reconfiguring
+    service_reconfigure "$@" || return $?  # on failure keep the fail counter
+    ctdb_counter_init "$@"  # success: reset the failure counter
+}
+
+# Default service_reconfigure(): restart service $1 (default: $service_name).
+service_reconfigure ()
+{
+    service "${1:-$service_name}" restart
+}
+
+ctdb_service_check_reconfigure ()
+{
+    # Only do this for certain events.
+    case "$event_name" in
+	monitor|ipreallocated) : ;;  # carry on below
+	*) return 0
+    esac
+
+    if ctdb_service_needs_reconfigure "$@" ; then
+	ctdb_service_reconfigure "$@"
+
+	# For events other than monitor, return and let the caller continue.
+	[ "$event_name" = "monitor" ] || return 0
+
+	# We don't want to proceed with the rest of the monitor event
+	# here, so we exit.  However, if we exit 0 then, if the
+	# service was previously broken, we might return a false
+	# positive.  So we simply retrieve the status of this script
+	# from the previous monitor loop and exit with that status.
+	ctdb scriptstatus | \
+	    grep -q -E "^${script_name}[[:space:]]+Status:OK[[:space:]]"
+	exit $?  # 0 only if grep found this script listed with Status:OK
     fi
-    ctdb_service_unset_reconfigure
-    ctdb_counter_init
 }
 
+##################################################################
+# Does CTDB manage this service? - and associated auto-start/stop
+
 ctdb_compat_managed_service ()
 {
     if [ "$1" = "yes" -a "$2" = "$_service_name" ] ; then
@@ -714,23 +883,24 @@ ctdb_start_stop_service ()
 
     if is_ctdb_managed_service "$_service_name" ; then
 	if ! is_ctdb_previously_managed_service "$_service_name" ; then
-	    echo "Starting service $_service_name"
-	    ctdb_service_start || exit $?
-	    ctdb_service_managed "$_service_name"
-	    exit 0
+	    echo "Starting service \"$_service_name\" - now managed"
+	    ctdb_service_start "$_service_name"
+	    exit $?
 	fi
     else
 	if is_ctdb_previously_managed_service "$_service_name" ; then
-	    echo "Stopping service $_service_name"
-	    ctdb_service_stop || exit $?
-	    ctdb_service_unmanaged "$_service_name"
-	    exit 0
+	    echo "Stopping service \"$_service_name\" - no longer managed"
+	    ctdb_service_stop "$_service_name"
+	    exit $?
 	fi
     fi
 }
 
 ctdb_service_start ()
 {
+    # The service is marked managed if we've ever tried to start it.
+    ctdb_service_managed "$@"
+
     # Here we only want $1.  If no argument is passed then
     # service_start needs to know.
     service_start "$@" || return $?
@@ -740,6 +910,7 @@ ctdb_service_start ()
 
 ctdb_service_stop ()
 {
+    ctdb_service_unmanaged "$@"
     service_stop "$@"
 }