ctdb-scripts: Improve error messages when using NFS service_check_cmd
[samba.git] / ctdb / config / events.d / 60.nfs
#!/bin/sh
# Event script to manage NFS in a clustered environment.
#
# Sources the shared helper library "$CTDB_BASE/functions", then calls
# loadconfig and ctdb_setup_service_state_dir (neither is defined in
# this script; presumably both come from that library) for the
# service named "nfs".

# Default CTDB_BASE to the parent of the directory that contains this
# script, with symlinks resolved by "cd -P".  The assignment and the
# export are separate statements and every expansion is quoted, so an
# installation path containing whitespace is neither word-split nor
# truncated.
if [ -z "$CTDB_BASE" ] ; then
    CTDB_BASE=$(cd -P "$(dirname "$0")" ; dirname "$PWD")
    export CTDB_BASE
fi

. "${CTDB_BASE}/functions"

service_name="nfs"
loadconfig
ctdb_setup_service_state_dir
12
13 ######################################################################
14
# Callout that implements the NFS operations.  When none is configured
# fall back to the Linux kernel NFS callout under $CTDB_BASE.
: "${CTDB_NFS_CALLOUT:=${CTDB_BASE}/nfs-linux-kernel-callout}"
# Exported unconditionally because the statd callout needs it as well
export CTDB_NFS_CALLOUT

# Directory offered to the callout for its own state.  Nothing here
# creates it: a callout that wants to use it must create it itself.
CTDB_NFS_CALLOUT_STATE_DIR="${service_state_dir}/callout-state"
export CTDB_NFS_CALLOUT_STATE_DIR

# Locations used to cache the result of callout registration:
#   $nfs_callout_cache_callout - the callout string the cache is for
#   $nfs_callout_cache_ops     - directory of registered operations
nfs_callout_cache="${service_state_dir}/nfs_callout_cache"
nfs_callout_cache_ops="${nfs_callout_cache}/ops"
nfs_callout_cache_callout="${nfs_callout_cache}/CTDB_NFS_CALLOUT"
27
######################################################
# Ask the callout which operations it implements and
# cache the answer
#
# Runs "$CTDB_NFS_CALLOUT register" and creates one
# empty file per output line in $nfs_callout_cache_ops.
# If the callout prints nothing, a single file named
# "ALL" is created instead.  The callout string itself
# is saved in $nfs_callout_cache_callout.
######################################################
nfs_callout_register ()
{
    # Start from an empty set of registered operations
    mkdir -p "$nfs_callout_cache_ops"
    rm -f "$nfs_callout_cache_ops"/*

    # Record which callout this cache was built for
    echo "$CTDB_NFS_CALLOUT" >"$nfs_callout_cache_callout"

    _t=$(eval "$CTDB_NFS_CALLOUT" "register")

    # No output: mark every operation as registered
    if [ -z "$_t" ] ; then
        touch "${nfs_callout_cache_ops}/ALL"
        return $?
    fi

    # Otherwise one marker file per line of output
    echo "$_t" |
        while IFS="" read _op ; do
            touch "${nfs_callout_cache_ops}/${_op}"
        done
}
45
######################################################
# Run an operation via the NFS callout
#
# $1    - operation name
# $2... - further arguments, passed on to the callout
#
# Registration is redone via nfs_callout_register()
# whenever $CTDB_NFS_CALLOUT differs from the string
# stored in $nfs_callout_cache_callout.  The callout is
# only run if the operation, or "ALL", has a marker in
# $nfs_callout_cache_ops.
#
# Returns the callout's status, or 0 when the operation
# is not registered.
######################################################
nfs_callout ()
{
    # Re-run registration if $CTDB_NFS_CALLOUT has changed.
    # "IFS= read -r" takes the cached line literally; a plain "read"
    # would drop backslashes and trim whitespace, so a callout string
    # containing either would never match its own cache entry and
    # would be re-registered on every call.
    _prev=""
    if [ -r "$nfs_callout_cache_callout" ] ; then
        IFS= read -r _prev <"$nfs_callout_cache_callout"
    fi
    if [ "$CTDB_NFS_CALLOUT" != "$_prev" ] ; then
        nfs_callout_register
    fi

    # Run the operation if it is registered...
    # eval lets $CTDB_NFS_CALLOUT carry its own arguments
    if [ -e "${nfs_callout_cache_ops}/${1}" ] || \
           [ -e "${nfs_callout_cache_ops}/ALL" ]; then
        eval "$CTDB_NFS_CALLOUT" "$@"
    fi
}
63
######################################################
# Reconfigure hook for the NFS service
#
# If "${CTDB_BASE}/statd-callout" is executable, run
# "statd-callout notify" in the background with all of
# its output discarded (intended to restart the lock
# manager and notify clients).  Otherwise do nothing.
######################################################
service_reconfigure ()
{
    [ -x "${CTDB_BASE}/statd-callout" ] || return 0

    # Not waited for: the notification runs asynchronously
    "${CTDB_BASE}/statd-callout" notify >/dev/null 2>&1 &
}
71
72 ######################################################################
73
######################################################
# Check the health of NFS services
#
# $1 - optional directory containing the check files
#      default: $CTDB_NFS_CHECKS_DIR if set, otherwise
#               "${CTDB_BASE}/nfs-checks.d"
#
# Each file named NN.<progname>.check (NN = 2 digits)
# is fed to nfs_check_service() on stdin, with
# <progname> as its argument.  A directory without any
# such file is not an error: nothing is checked.
######################################################
nfs_check_services ()
{
    _dir="${1:-${CTDB_NFS_CHECKS_DIR:-${CTDB_BASE}/nfs-checks.d}}"

    # Files must end with .check - avoids editor backups, RPM fu, ...
    for _f in "$_dir"/[0-9][0-9].*.check ; do
        # When nothing matches, the shell leaves the pattern itself in
        # $_f.  Skip it rather than trying to open a file that does
        # not exist, which would only log a spurious error.
        [ -e "$_f" ] || continue

        # Strip the directory, the "NN." prefix and ".check" suffix
        _t="${_f%.check}"
        _progname="${_t##*/[0-9][0-9].}"

        nfs_check_service "$_progname" <"$_f"
    done
}
92
######################################################
# Check the health of a single NFS service
#
# $1 - progname, passed to rpcinfo (looked up in /etc/rpc)
#
# Settings are read from stdin, one "name=value" per
# line; blank lines and lines starting with "#" are
# skipped.  Recognised names:
#
# * family             - "tcp" or "udp" or space separated list
#                        default: tcp, not used with "service_check_cmd"
# * version            - optional, RPC service version number
#                        default is to omit to check for any version,
#                        not used with "service_check_cmd"
# * unhealthy_after    - number of check fails before unhealthy
#                        default: 1
# * restart_every      - number of check fails before restart
#                        default: 0, meaning no restart
# * service_stop_cmd   - command to stop service
#                        default: no default, must be provided if
#                                 restart_every > 0
# * service_start_cmd  - command to start service
#                        default: no default, must be provided if
#                                 restart_every > 0
# * service_check_cmd  - command to check health of service
#                        default is to check RPC service using rpcinfo
# * service_debug_cmd  - command to debug a service after trying to
#                        stop it; for example, it can be useful to
#                        print stack traces of threads that have not
#                        exited, since they may be stuck doing I/O;
#                        no default, see also function
#                        program_stack_traces()
#
# Quoting in values is not preserved
#
# Any other name is an error.  On such an error, or when
# the service is found unhealthy, the calling script is
# terminated with status 1; otherwise 0 is returned.
######################################################
nfs_check_service ()
{
    _progname="$1"

    (
        # A subshell keeps the settings below from leaking out

        # Defaults
        family="tcp"
        version=""
        unhealthy_after=1
        restart_every=0
        service_stop_cmd=""
        service_start_cmd=""
        service_check_cmd=""
        service_debug_cmd=""

        # Each accepted line is eval-ed, so variable references in
        # values are expanded.  Matching on the name first rejects
        # anything that is not a known setting.
        while read _line ; do
            case "$_line" in
                ""|\#*)
                    # Blank line or comment
                    ;;

                family=*|version=*|\
                unhealthy_after=*|restart_every=*|\
                service_stop_cmd=*|service_start_cmd=*|\
                service_check_cmd=*|service_debug_cmd=*)
                    eval "$_line"
                    ;;

                *)
                    echo "ERROR: Unknown variable for ${_progname}: ${_line}"
                    exit 1
                    ;;
            esac
        done

        # Name under which the failure counter is kept
        _service_name="nfs_${_progname}"

        _ok=false
        if [ -z "$service_check_cmd" ] ; then
            # No custom check: probe the RPC service
            if nfs_check_rpcinfo \
                   "$_progname" "$version" "$family" >/dev/null ; then
                _ok=true
            else
                _err="$ctdb_check_rpc_out"
            fi
        elif eval "$service_check_cmd" ; then
            # eval allows semicolon separated commands in the setting
            _ok=true
        else
            _err="monitoring service \"${_progname}\" failed"
        fi

        if $_ok ; then
            # The counter is only reset when a non-default threshold
            # is configured
            if [ $unhealthy_after -ne 1 -o $restart_every -ne 0 ] ; then
                ctdb_counter_init "$_service_name"
            fi
            exit 0
        fi

        # The check failed: account for it
        ctdb_counter_incr "$_service_name"
        _failcount=$(ctdb_counter_get "$_service_name")

        _unhealthy=false
        if [ $unhealthy_after -gt 0 ] && \
               [ $_failcount -ge $unhealthy_after ] ; then
            _unhealthy=true
            echo "ERROR: $_err"
        fi

        # Restart on every restart_every-th consecutive failure.  The
        # failure is only reported as a warning here if it has not
        # already been reported as an error above.
        if [ $restart_every -gt 0 ] && \
               [ $(($_failcount % $restart_every)) -eq 0 ] ; then
            $_unhealthy || echo "WARNING: $_err"
            nfs_restart_service
        fi

        if $_unhealthy ; then
            exit 1
        fi

        exit 0
    ) || exit 1
}
217
######################################################
# Restart the service currently being checked
#
# Takes no arguments.  Reads these variables from the
# caller (nfs_check_service): _progname,
# service_stop_cmd, service_start_cmd and
# service_debug_cmd.
#
# Calls die() if either the stop or the start command
# is missing.
######################################################
nfs_restart_service ()
{
    if [ -z "$service_stop_cmd" ] || [ -z "$service_start_cmd" ] ; then
        die "ERROR: Can not restart service \"${_progname}\" without corresponding service_start_cmd/service_stop_cmd settings"
    fi

    echo "Trying to restart service \"${_progname}\"..."

    # eval is used throughout so that a setting may hold several
    # semicolon separated commands
    eval "$service_stop_cmd"

    # Optional diagnostics, run between stop and start
    [ -z "$service_debug_cmd" ] || eval "$service_debug_cmd"

    background_with_logging eval "$service_start_cmd"
}
233
######################################################
# Check one RPC service with rpcinfo
#
# $1 - progname, passed to rpcinfo (looked up in /etc/rpc)
# $2 - version, optional, not passed if empty/unset
# $3 - transport family, optional, default is "tcp"
#
# The address queried is $CTDB_RPCINFO_LOCALHOST6
# (default ::1) for tcp6/udp6, $CTDB_RPCINFO_LOCALHOST
# (default 127.0.0.1) for anything else.
#
# rpcinfo's combined output is left in
# $ctdb_check_rpc_out.  On failure it is prefixed with
# "<progname> failed RPC check:", also printed to
# stdout, and 1 is returned.
######################################################
ctdb_check_rpc ()
{
    _progname="$1"
    _version="$2"
    _family="${3:-tcp}"

    if [ "$_family" = "tcp6" ] || [ "$_family" = "udp6" ] ; then
        _localhost="${CTDB_RPCINFO_LOCALHOST6:-::1}"
    else
        _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"
    fi

    # Arguments are deliberately unquoted: an empty $_version has to
    # disappear from the command line altogether
    if ctdb_check_rpc_out=$(rpcinfo -T $_family $_localhost \
                                    $_progname $_version 2>&1) ; then
        return 0
    fi

    ctdb_check_rpc_out="$_progname failed RPC check:
$ctdb_check_rpc_out"
    echo "$ctdb_check_rpc_out"
    return 1
}
259
######################################################
# Check an RPC service for several versions/families
#
# $1 - progname, passed to rpcinfo (looked up in /etc/rpc)
# $2 - versions, optional, space separated; when empty
#      a single check without a version is done per family
# $3 - families, optional, space separated, default "tcp"
#
# Calls ctdb_check_rpc() for every family/version pair
# and returns its status as soon as one check fails.
######################################################
nfs_check_rpcinfo ()
{
    _progname="$1"
    _versions="$2"
    _families="${3:-tcp}"

    for _family in $_families ; do
        if [ -z "$_versions" ] ; then
            # No version requested: any version will do
            ctdb_check_rpc $_progname "" $_family || return $?
            continue
        fi

        for _version in $_versions ; do
            ctdb_check_rpc $_progname $_version $_family || return $?
        done
    done
}
276
##################################################################
# Use statd-callout to update NFS lock info
#
# Runs "$CTDB_BASE/statd-callout update" and returns its status.
# Does nothing, successfully, if statd-callout is not executable.
##################################################################
nfs_update_lock_info ()
{
    [ -x "$CTDB_BASE/statd-callout" ] || return 0

    "$CTDB_BASE/statd-callout" update
}
286
287 ######################################################################
288
# Common service handling; the script stops here, successfully, when
# is_ctdb_managed_service reports failure
ctdb_start_stop_service

is_ctdb_managed_service || exit 0

ctdb_service_check_reconfigure

# Dispatch on the event name given as the first argument
case "$1" in
    startup|shutdown)
        # Handled entirely by the callout
        nfs_callout "$@"
        ;;

    takeip|releaseip)
        # Let the callout react, then flag the service for reconfigure
        nfs_callout "$@"
        ctdb_service_set_reconfigure
        ;;

    monitor)
        nfs_callout "monitor-pre" || exit $?

        # Check that directories for shares actually exist
        if [ "$CTDB_NFS_SKIP_SHARE_CHECK" != "yes" ] ; then
            nfs_callout "monitor-list-shares" | ctdb_check_directories || \
                exit $?
        fi

        update_tickles 2049
        nfs_update_lock_info

        # Terminates the script with status 1 if a service is unhealthy
        nfs_check_services

        nfs_callout "monitor-post" || exit $?
        ;;

    *)
        # Every other event gets the standard handling
        ctdb_standard_event_handler "$@"
        ;;
esac

exit 0