X-Git-Url: http://git.samba.org/?a=blobdiff_plain;f=config%2Ffunctions;h=b04965281dbfcfe987dab89fab67a3ce5654f523;hb=380c9263eb37db5a250264316e250c2160908263;hp=452b8d0d976d5edb34497ae2810bfb3e9502f317;hpb=86e4aefed9fd1028660c98e3ea758c2b75ffc1d8;p=ctdb.git diff --git a/config/functions b/config/functions index 452b8d0d..b0496528 100755 --- a/config/functions +++ b/config/functions @@ -105,18 +105,140 @@ get_proc () cat "/proc/$1" } +###################################################### +# Check that an RPC service is healthy - +# this includes allowing a certain number of failures +# before marking the NFS service unhealthy. +# +# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...] +# +# each triple is a set of 3 arguments: an operator, a +# fail count limit and an action string. +# +# For example: +# +# nfs_check_rpc_service "lockd" \ +# -ge 15 "verbose restart unhealthy" \ +# -eq 10 "restart:bs" +# +# says that if lockd is down for 15 iterations then do +# a verbose restart of lockd and mark the node unhealthy. +# Before this, after 10 iterations of failure, the +# service is restarted silently in the background. +# Order is important: the number of failures needs to be +# specified in reverse order because processing stops +# after the first condition that is true. 
+###################################################### +nfs_check_rpc_service () +{ + _prog_name="$1" ; shift + + _version=1 + _rpc_prog="$_prog_name" + _restart="" + _opts="" + case "$_prog_name" in + knfsd) + _rpc_prog=nfs + _version=3 + _restart="echo 'Trying to restart NFS service'" + _restart="${_restart}; startstop_nfs restart" + ;; + mountd) + _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}" + ;; + rquotad) + _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}" + ;; + lockd) + _rpc_prog=nlockmgr + _version=4 + _restart="echo 'Trying to restart lock manager service'" + _restart="${_restart}; startstop_nfslock restart" + ;; + statd) + _rpc_prog=status + _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}" + _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}" + _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}" + ;; + *) + echo "Internal error: unknown RPC program \"$_prog_name\"." + exit 1 + esac + + _service_name="nfs_${_prog_name}" + + if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then + ctdb_counter_init "$_service_name" + return 0 + fi + + ctdb_counter_incr "$_service_name" + + while [ -n "$3" ] ; do + ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || { + for _action in $3 ; do + case "$_action" in + verbose) + echo "$ctdb_check_rpc_out" + ;; + restart|restart:*) + # No explicit command specified, construct rpc command. + if [ -z "$_restart" ] ; then + _p="rpc.${_prog_name}" + _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'" + _restart="${_restart}; killall -q -9 $_p" + _restart="${_restart}; $_p $_opts" + fi + + # Process restart flags... + _flags="${_action#restart:}" + # There may not have been a colon... 
+ [ "$_flags" != "$_action" ] || _flags="" + # q=quiet - everything to /dev/null + if [ "${_flags#*q}" != "$_flags" ] ; then + _restart="{ ${_restart} ; } >/dev/null 2>&1" + fi + # s=stealthy - last command to /dev/null + if [ "${_flags#*s}" != "$_flags" ] ; then + _restart="${_restart} >/dev/null 2>&1" + fi + # b=background - the whole thing, easy and reliable + if [ "${_flags#*b}" != "$_flags" ] ; then + _restart="{ ${_restart} ; } &" + fi + + # Do it! + eval "${_restart}" + ;; + unhealthy) + exit 1 + ;; + *) + echo "Internal error: unknown action \"$_action\"." + exit 1 + esac + done + + # Only process the first action group. + break + } + shift 3 + done +} + ###################################################### # check that a rpc server is registered with portmap # and responding to requests -# usage: ctdb_check_rpc SERVICE_NAME PROGNUM VERSION +# usage: ctdb_check_rpc SERVICE_NAME VERSION ###################################################### -ctdb_check_rpc() { +ctdb_check_rpc () +{ progname="$1" - prognum="$2" - version="$3" + version="$2" - ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1) - if [ $? -ne 0 ] ; then + if ! ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then ctdb_check_rpc_out="ERROR: $progname failed RPC check: $ctdb_check_rpc_out" echo "$ctdb_check_rpc_out" @@ -355,8 +477,8 @@ startstop_nfs() { service nfs start ;; stop) - service nfs stop > /dev/null 2>&1 - service nfslock stop > /dev/null 2>&1 + service nfs stop + service nfslock stop ;; restart) set_proc "fs/nfsd/threads" 0 @@ -425,18 +547,6 @@ startstop_nfslock() { esac } -# better use delete_ip_from_iface() together with add_ip_to_iface -# remove_ip should be removed in future -remove_ip() { - local _ip_maskbits=$1 - local _iface=$2 - local _ip=`echo "$_ip_maskbits" | cut -d '/' -f1` - local _maskbits=`echo "$_ip_maskbits" | cut -d '/' -f2` - - delete_ip_from_iface "$_iface" "$_ip" "$_maskbits" - return $? 
-} - add_ip_to_iface() { local _iface=$1 @@ -515,23 +625,23 @@ setup_iface_ip_readd_script() # ctdb_check_counter_limit succeeds when count >= ######################################################## _ctdb_counter_common () { - _service_name="${$1:-${service_name}}" + _service_name="${1:-${service_name}}" _counter_file="$ctdb_fail_dir/$_service_name" mkdir -p "${_counter_file%/*}" # dirname } ctdb_counter_init () { - _ctdb_counter_common "$@" + _ctdb_counter_common "$1" >"$_counter_file" } ctdb_counter_incr () { - _ctdb_counter_common "$@" + _ctdb_counter_common "$1" # unary counting! echo -n 1 >> "$_counter_file" } ctdb_check_counter_limit () { - _ctdb_counter_common "$@" + _ctdb_counter_common _limit="${1:-${service_fail_limit}}" _quiet="$2" @@ -546,7 +656,7 @@ ctdb_check_counter_limit () { fi } ctdb_check_counter_equal () { - _ctdb_counter_common "$@" + _ctdb_counter_common _limit=$1 @@ -557,6 +667,24 @@ ctdb_check_counter_equal () { fi return 0 } +ctdb_check_counter () { + _msg="${1:-error}" # "error" - anything else is silent on fail + _op="${2:--ge}" # an integer operator supported by test + _limit="${3:-${service_fail_limit}}" + shift 3 + _ctdb_counter_common "$1" + + # unary counting! 
+ _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0) + if [ $_size $_op $_limit ] ; then + if [ "$_msg" = "error" ] ; then + echo "ERROR: $_limit consecutive failures for $_service_name, marking node unhealthy" + exit 1 + else + return 1 + fi + fi +} ######################################################## @@ -639,35 +767,76 @@ ctdb_setstatus () esac } +################################################################## +# Reconfigure a service on demand + +_ctdb_service_reconfigure_common () +{ + _d="$ctdb_status_dir/${1:-${service_name}}" + mkdir -p "$_d" + _ctdb_service_reconfigure_flag="$_d/reconfigure" +} + ctdb_service_needs_reconfigure () { - [ -e "$ctdb_status_dir/$service_name/reconfigure" ] + _ctdb_service_reconfigure_common "$@" + [ -e "$_ctdb_service_reconfigure_flag" ] } ctdb_service_set_reconfigure () { - d="$ctdb_status_dir/$service_name" - mkdir -p "$d" - >"$d/reconfigure" + _ctdb_service_reconfigure_common "$@" + >"$_ctdb_service_reconfigure_flag" } ctdb_service_unset_reconfigure () { - rm -f "$ctdb_status_dir/$service_name/reconfigure" + _ctdb_service_reconfigure_common "$@" + rm -f "$_ctdb_service_reconfigure_flag" } ctdb_service_reconfigure () { echo "Reconfiguring service \"$service_name\"..." - if [ -n "$service_reconfigure" ] ; then - eval $service_reconfigure - else - service "$service_name" restart + ctdb_service_unset_reconfigure "$@" + service_reconfigure "$@" || return $? + ctdb_counter_init "$@" +} + +# Default service_reconfigure() function. +service_reconfigure () +{ + service "${1:-$service_name}" restart +} + +ctdb_service_check_reconfigure () +{ + # Only do this for certain events. + case "$event_name" in + monitor|ipreallocated) : ;; + *) return 0 + esac + + if ctdb_service_needs_reconfigure "$@" ; then + ctdb_service_reconfigure "$@" + + # Fall through to non-monitor events. + [ "$event_name" = "monitor" ] || return 0 + + # We don't want to proceed with the rest of the monitor event + # here, so we exit. 
However, if we exit 0 then, if the + # service was previously broken, we might return a false + # positive. So we simply retrieve the status of this script + # from the previous monitor loop and exit with that status. + ctdb scriptstatus | \ + grep -q -E "^${script_name}[[:space:]]+Status:OK[[:space:]]" + exit $? fi - ctdb_service_unset_reconfigure - ctdb_counter_init } +################################################################## +# Does CTDB manage this service? - and associated auto-start/stop + ctdb_compat_managed_service () { if [ "$1" = "yes" -a "$2" = "$_service_name" ] ; then @@ -714,23 +883,24 @@ ctdb_start_stop_service () if is_ctdb_managed_service "$_service_name" ; then if ! is_ctdb_previously_managed_service "$_service_name" ; then - echo "Starting service $_service_name" - ctdb_service_start || exit $? - ctdb_service_managed "$_service_name" - exit 0 + echo "Starting service \"$_service_name\" - now managed" + ctdb_service_start "$_service_name" + exit $? fi else if is_ctdb_previously_managed_service "$_service_name" ; then - echo "Stopping service $_service_name" - ctdb_service_stop || exit $? - ctdb_service_unmanaged "$_service_name" - exit 0 + echo "Stopping service \"$_service_name\" - no longer managed" + ctdb_service_stop "$_service_name" + exit $? fi fi } ctdb_service_start () { + # The service is marked managed if we've ever tried to start it. + ctdb_service_managed "$@" + # Here we only want $1. If no argument is passed then # service_start needs to know. service_start "$@" || return $? @@ -740,6 +910,7 @@ ctdb_service_start () ctdb_service_stop () { + ctdb_service_unmanaged "$@" service_stop "$@" }