cat "/proc/$1"
}
+######################################################
+# Check that an RPC service is healthy -
+# this includes allowing a certain number of failures
+# before marking the NFS service unhealthy.
+#
+# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
+#
+# each triple is a set of 3 arguments: an operator, a
+# fail count limit and an action string.
+#
+# For example:
+#
+# nfs_check_rpc_service "lockd" \
+# -ge 15 "verbose restart unhealthy" \
+# -eq 10 "restart:bs"
+#
+# says that if lockd is down for 15 iterations then do
+# a verbose restart of lockd and mark the node unhealthy.
+# Before this, after 10 iterations of failure, the
+# service is restarted silently in the background.
+# Order is important: the number of failures need to be
+# specified in reverse order because processing stops
+# after the first condition that is true.
+######################################################
+nfs_check_rpc_service ()
+{
+ _prog_name="$1" ; shift
+
+ _version=1
+ _rpc_prog="$_prog_name"
+ _restart=""
+ _opts=""
+ case "$_prog_name" in
+ knfsd)
+ _rpc_prog=nfs
+ _version=3
+ _restart="echo 'Trying to restart NFS service'"
+ _restart="${_restart}; startstop_nfs restart"
+ ;;
+ mountd)
+ _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+ ;;
+ rquotad)
+ _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+ ;;
+ lockd)
+ _rpc_prog=nlockmgr
+ _version=4
+ _restart="echo 'Trying to restart lock manager service'"
+ _restart="${_restart}; startstop_nfslock restart"
+ ;;
+ statd)
+ _rpc_prog=status
+ _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+ _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}"
+ _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+ ;;
+ *)
+ echo "Internal error: unknown RPC program \"$_prog_name\"."
+ exit 1
+ esac
+
+ _service_name="nfs_${_prog_name}"
+
+ if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
+ ctdb_counter_init "$_service_name"
+ return 0
+ fi
+
+ ctdb_counter_incr "$_service_name"
+
+ while [ -n "$3" ] ; do
+ ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || {
+ for _action in $3 ; do
+ case "$_action" in
+ verbose)
+ echo "$ctdb_check_rpc_out"
+ ;;
+ restart|restart:*)
+ # No explicit command specified, construct rpc command.
+ if [ -z "$_restart" ] ; then
+ _p="rpc.${_prog_name}"
+ _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'"
+ _restart="${_restart}; killall -q -9 $_p"
+ _restart="${_restart}; $_p $_opts"
+ fi
+
+ # Process restart flags...
+ _flags="${_action#restart:}"
+ # There may not have been a colon...
+ [ "$_flags" != "$_action" ] || _flags=""
+ # q=quiet - everything to /dev/null
+ if [ "${_flags#*q}" != "$_flags" ] ; then
+ _restart="{ ${_restart} ; } >/dev/null 2>&1"
+ fi
+ # s=stealthy - last command to /dev/null
+ if [ "${_flags#*s}" != "$_flags" ] ; then
+ _restart="${_restart} >/dev/null 2>&1"
+ fi
+ # b=background - the whole thing, easy and reliable
+ if [ "${_flags#*b}" != "$_flags" ] ; then
+ _restart="{ ${_restart} ; } &"
+ fi
+
+ # Do it!
+ eval "${_restart}"
+ ;;
+ unhealthy)
+ exit 1
+ ;;
+ *)
+ echo "Internal error: unknown action \"$_action\"."
+ exit 1
+ esac
+ done
+
+ # Only process the first action group.
+ break
+ }
+ shift 3
+ done
+}
+
######################################################
# check that a rpc server is registered with portmap
# and responding to requests
-# usage: ctdb_check_rpc SERVICE_NAME PROGNUM VERSION
+# usage: ctdb_check_rpc SERVICE_NAME VERSION
######################################################
-ctdb_check_rpc() {
+ctdb_check_rpc ()
+{
progname="$1"
- prognum="$2"
- version="$3"
+ version="$2"
- ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
- if [ $? -ne 0 ] ; then
+ if ! ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then
ctdb_check_rpc_out="ERROR: $progname failed RPC check:
$ctdb_check_rpc_out"
echo "$ctdb_check_rpc_out"
service nfs start
;;
stop)
- service nfs stop > /dev/null 2>&1
- service nfslock stop > /dev/null 2>&1
+ service nfs stop
+ service nfslock stop
;;
restart)
set_proc "fs/nfsd/threads" 0
esac
}
-# better use delete_ip_from_iface() together with add_ip_to_iface
-# remove_ip should be removed in future
-remove_ip() {
- local _ip_maskbits=$1
- local _iface=$2
- local _ip=`echo "$_ip_maskbits" | cut -d '/' -f1`
- local _maskbits=`echo "$_ip_maskbits" | cut -d '/' -f2`
-
- delete_ip_from_iface "$_iface" "$_ip" "$_maskbits"
- return $?
-}
-
add_ip_to_iface()
{
local _iface=$1
# ctdb_check_counter_limit succeeds when count >= <limit>
########################################################
_ctdb_counter_common () {
- _service_name="${$1:-${service_name}}"
+ _service_name="${1:-${service_name}}"
_counter_file="$ctdb_fail_dir/$_service_name"
mkdir -p "${_counter_file%/*}" # dirname
}
ctdb_counter_init () {
- _ctdb_counter_common "$@"
+ _ctdb_counter_common "$1"
>"$_counter_file"
}
ctdb_counter_incr () {
- _ctdb_counter_common "$@"
+ _ctdb_counter_common "$1"
# unary counting!
echo -n 1 >> "$_counter_file"
}
ctdb_check_counter_limit () {
- _ctdb_counter_common "$@"
+ _ctdb_counter_common
_limit="${1:-${service_fail_limit}}"
_quiet="$2"
fi
}
ctdb_check_counter_equal () {
- _ctdb_counter_common "$@"
+ _ctdb_counter_common
_limit=$1
fi
return 0
}
+ctdb_check_counter () {
+ _msg="${1:-error}" # "error" - anything else is silent on fail
+ _op="${2:--ge}" # an integer operator supported by test
+ _limit="${3:-${service_fail_limit}}"
+ shift 3
+ _ctdb_counter_common "$1"
+
+ # unary counting!
+ _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
+ if [ $_size $_op $_limit ] ; then
+ if [ "$_msg" = "error" ] ; then
+ echo "ERROR: $_limit consecutive failures for $_service_name, marking node unhealthy"
+ exit 1
+ else
+ return 1
+ fi
+ fi
+}
########################################################
esac
}
+##################################################################
+# Reconfigure a service on demand
+
+_ctdb_service_reconfigure_common ()
+{
+ _d="$ctdb_status_dir/${1:-${service_name}}"
+ mkdir -p "$_d"
+ _ctdb_service_reconfigure_flag="$_d/reconfigure"
+}
+
ctdb_service_needs_reconfigure ()
{
- [ -e "$ctdb_status_dir/$service_name/reconfigure" ]
+ _ctdb_service_reconfigure_common "$@"
+ [ -e "$_ctdb_service_reconfigure_flag" ]
}
ctdb_service_set_reconfigure ()
{
- d="$ctdb_status_dir/$service_name"
- mkdir -p "$d"
- >"$d/reconfigure"
+ _ctdb_service_reconfigure_common "$@"
+ >"$_ctdb_service_reconfigure_flag"
}
ctdb_service_unset_reconfigure ()
{
- rm -f "$ctdb_status_dir/$service_name/reconfigure"
+ _ctdb_service_reconfigure_common "$@"
+ rm -f "$_ctdb_service_reconfigure_flag"
}
ctdb_service_reconfigure ()
{
echo "Reconfiguring service \"$service_name\"..."
- if [ -n "$service_reconfigure" ] ; then
- eval $service_reconfigure
- else
- service "$service_name" restart
+ ctdb_service_unset_reconfigure "$@"
+ service_reconfigure "$@" || return $?
+ ctdb_counter_init "$@"
+}
+
+# Default service_reconfigure() function.
+service_reconfigure ()
+{
+ service "${1:-$service_name}" restart
+}
+
+ctdb_service_check_reconfigure ()
+{
+ # Only do this for certain events.
+ case "$event_name" in
+ monitor|ipreallocated) : ;;
+ *) return 0
+ esac
+
+ if ctdb_service_needs_reconfigure "$@" ; then
+ ctdb_service_reconfigure "$@"
+
+ # Fall through to non-monitor events.
+ [ "$event_name" = "monitor" ] || return 0
+
+ # We don't want to proceed with the rest of the monitor event
+ # here, so we exit. However, if we exit 0 then, if the
+ # service was previously broken, we might return a false
+ # positive. So we simply retrieve the status of this script
+ # from the previous monitor loop and exit with that status.
+ ctdb scriptstatus | \
+ grep -q -E "^${script_name}[[:space:]]+Status:OK[[:space:]]"
+ exit $?
fi
- ctdb_service_unset_reconfigure
- ctdb_counter_init
}
+##################################################################
+# Does CTDB manage this service? - and associated auto-start/stop
+
ctdb_compat_managed_service ()
{
if [ "$1" = "yes" -a "$2" = "$_service_name" ] ; then
if is_ctdb_managed_service "$_service_name" ; then
if ! is_ctdb_previously_managed_service "$_service_name" ; then
- echo "Starting service $_service_name"
- ctdb_service_start || exit $?
- ctdb_service_managed "$_service_name"
- exit 0
+ echo "Starting service \"$_service_name\" - now managed"
+ ctdb_service_start "$_service_name"
+ exit $?
fi
else
if is_ctdb_previously_managed_service "$_service_name" ; then
- echo "Stopping service $_service_name"
- ctdb_service_stop || exit $?
- ctdb_service_unmanaged "$_service_name"
- exit 0
+ echo "Stopping service \"$_service_name\" - no longer managed"
+ ctdb_service_stop "$_service_name"
+ exit $?
fi
fi
}
ctdb_service_start ()
{
+ # The service is marked managed if we've ever tried to start it.
+ ctdb_service_managed "$@"
+
# Here we only want $1. If no argument is passed then
# service_start needs to know.
service_start "$@" || return $?
ctdb_service_stop ()
{
+ ctdb_service_unmanaged "$@"
service_stop "$@"
}