config/functions: add tickle_tcp_connections()
[metze/ctdb/wip.git] / config / functions
1 # utility functions for ctdb event scripts
2
3 PATH=/bin:/usr/bin:/usr/sbin:/sbin:$PATH
4
#######################################
# pull in a system config file, if any
# usage: loadconfig [NAME]
#
# The "ctdb" configuration is always loaded first.  When NAME is omitted
# it defaults to $service_config, falling back to $service_name.
# The first file that exists among
#   /etc/sysconfig/NAME, /etc/default/NAME, $CTDB_BASE/sysconfig/NAME
# is sourced into the current shell.  A missing file is not an error.
loadconfig() {

    if [ "$1" != "ctdb" ] ; then
        loadconfig "ctdb"
    fi

    if [ -z "$1" ] ; then
        foo="${service_config:-${service_name}}"
        if [ -n "$foo" ] ; then
            loadconfig "$foo"
        fi
        # There is no file to look up for an empty name.
        return 0
    fi

    # Paths are quoted so that a CTDB_BASE (or NAME) containing
    # whitespace neither breaks the test nor sources the wrong file.
    if [ -f "/etc/sysconfig/$1" ]; then
        . "/etc/sysconfig/$1"
    elif [ -f "/etc/default/$1" ]; then
        . "/etc/default/$1"
    elif [ -f "$CTDB_BASE/sysconfig/$1" ]; then
        . "$CTDB_BASE/sysconfig/$1"
    fi
}
28
##############################################################
# work out which init style this system uses and record it in
# CTDB_INIT_STYLE ("suse", "debian" or "redhat").
# A value that is already set is left untouched.
detect_init_style() {
    # respect a pre-configured value
    [ -z "$CTDB_INIT_STYLE" ] || return 0

    # Probe from lowest to highest priority so that the last match wins:
    # startproc (suse) beats start-stop-daemon (debian), and "redhat"
    # is the fallback when neither helper is present.
    CTDB_INIT_STYLE="redhat"
    if [ -x /sbin/start-stop-daemon ]; then
        CTDB_INIT_STYLE="debian"
    fi
    if [ -x /sbin/startproc ]; then
        CTDB_INIT_STYLE="suse"
    fi
}
43
######################################################
# stand-in for /sbin/service on platforms that lack it
# usage: service SERVICE_NAME OPERATION
#
# Uses /sbin/service when it is executable, otherwise runs the init
# script from /etc/init.d or /etc/rc.d/init.d.  Does nothing when no
# service name is given or no matching script is found.
service() {
  _service_name="$1"
  _op="$2"

  # nothing to do without a service name
  if [ -z "$_service_name" ]; then
      return
  fi

  if [ -x /sbin/service ]; then
      /sbin/service "$_service_name" "$_op"
      return
  fi

  if [ -x /etc/init.d/$_service_name ]; then
      /etc/init.d/$_service_name "$_op"
      return
  fi

  if [ -x /etc/rc.d/init.d/$_service_name ]; then
      /etc/rc.d/init.d/$_service_name "$_op"
  fi
}
61
######################################################
# niced stand-in for /sbin/service on platforms that lack it
# usage: nice_service SERVICE_NAME OPERATION
#
# nice(1) can only execute real programs, never shell functions, so the
# init script lookup is done here instead of handing the name "service"
# to nice.  Does nothing when no service name is given or no matching
# script is found.
nice_service() {
    _service_name="$1"
    _op="$2"

    # do nothing, when no service was specified
    [ -z "$_service_name" ] && return

    if [ -x /sbin/service ]; then
        nice /sbin/service "$_service_name" "$_op"
    elif [ -x "/etc/init.d/$_service_name" ]; then
        nice "/etc/init.d/$_service_name" "$_op"
    elif [ -x "/etc/rc.d/init.d/$_service_name" ]; then
        nice "/etc/rc.d/init.d/$_service_name" "$_op"
    fi
}
70
######################################################
# block until a command returns a zero exit status
# usage: ctdb_wait_command SERVICE_NAME <command>
#
# <command> is run (word-split, output discarded) once per second until
# it succeeds.  After every attempt "ctdb status" is checked; if that
# fails the calling shell exits with status 1.
# Sets the globals service_name, wait_cmd and all_ok.
######################################################
ctdb_wait_command() {
  service_name="$1"
  wait_cmd="$2"
  if [ -z "$wait_cmd" ]; then
      return
  fi

  all_ok=0
  echo "Waiting for service $service_name to start"
  until [ $all_ok -ne 0 ]; do
      if $wait_cmd > /dev/null 2>&1 ; then
          all_ok=1
      fi
      # give up as soon as the daemon itself is gone
      if ! ctdb status > /dev/null 2>&1 ; then
          echo "ctdb daemon has died. Exiting wait for $service_name"
          exit 1
      fi
      if [ $all_ok -ne 1 ]; then
          sleep 1
      fi
  done
  echo "Local service $service_name is up"
}
91
92
######################################################
# wait for a set of tcp ports
# usage: ctdb_wait_tcp_ports SERVICE_NAME <ports...>
#
# Polls once per second until every port accepts connections (netcat /
# nc against 127.0.0.1) or, when neither tool is available, shows up as
# a wildcard listener in the netstat output.
# Returns 0 once all ports are up, 1 if "ctdb status" fails while
# waiting, 127 if no tool to check ports can be found.
# Sets the globals service_name, wait_ports, all_ok and p.
######################################################
ctdb_wait_tcp_ports() {
  service_name="$1"
  shift
  wait_ports="$*"
  [ -z "$wait_ports" ] && return;
  all_ok=0
  echo "Waiting for tcp service $service_name to start"
  while [ $all_ok -eq 0 ]; do
      all_ok=1
      for p in $wait_ports; do
          if command -v netcat > /dev/null 2>&1 ; then
              netcat -z 127.0.0.1 "$p" > /dev/null 2>&1 || all_ok=0
          elif command -v nc > /dev/null 2>&1 ; then
              nc -z 127.0.0.1 "$p" > /dev/null 2>&1 || all_ok=0
          elif command -v netstat > /dev/null 2>&1 ; then
              # A listener line has the foreign address between the
              # local address and the state column, so allow arbitrary
              # text before LISTEN.  Accept both the IPv4 (0.0.0.0) and
              # the IPv6 (::) wildcard address.
              netstat -a -n 2> /dev/null |
                  grep -E -q "(0\.0\.0\.0|::):${p}[[:space:]].*LISTEN" || all_ok=0
          else
              echo "No tool to check tcp ports available. can not check in ctdb_wait_tcp_ports"
              return 127
          fi
      done
      # check the daemon before sleeping so a dead ctdb is noticed at once
      ctdb status > /dev/null 2>&1 || {
          echo "ctdb daemon has died. Exiting tcp wait $service_name"
          return 1
      }
      [ $all_ok -eq 1 ] || sleep 1
  done
  echo "Local tcp services for $service_name are up"
}
128
129
######################################################
# verify that an rpc program answers on localhost
# (queried with "rpcinfo -u", i.e. over UDP)
# usage: ctdb_check_rpc SERVICE_NAME PROGNUM VERSION
#
# Exits the calling shell with status 1 when the query fails.
# Sets the globals progname, prognum and version.
######################################################
ctdb_check_rpc() {
    progname="$1"
    prognum="$2"
    version="$3"

    if rpcinfo -u localhost $prognum $version > /dev/null ; then
        return 0
    fi

    echo "ERROR: $progname not responding to rpc requests"
    exit 1
}
144
######################################################
# check that a list of directories exists
# usage: ctdb_check_directories_probe < list-of-directories
#
# Directory names are read from stdin, one per line.  Names containing
# a '%' are skipped.  Returns 1 at the first missing directory, leaving
# its name in the global $d; returns 0 otherwise.
######################################################
ctdb_check_directories_probe() {
    while IFS="" read d ; do
        case "$d" in
            *%*)
                # skipped without checking
                ;;
            *)
                if [ ! -d "$d" ]; then
                    return 1
                fi
                ;;
        esac
    done
}
161
######################################################
# check that a list of directories exists
# usage: ctdb_check_directories [SERVICE_NAME] < list-of-directories
#
# The list is read from stdin by ctdb_check_directories_probe.
# SERVICE_NAME defaults to $service_name and is only used in the error
# message.  Exits the calling shell with status 1 when the probe fails.
######################################################
ctdb_check_directories() {
    n="${1:-${service_name}}"

    if ctdb_check_directories_probe ; then
        return 0
    fi

    # $d was left behind by the probe: the directory that failed
    echo "ERROR: $n directory \"$d\" not available"
    exit 1
}
173
######################################################
# check a set of tcp ports
# usage: ctdb_check_tcp_ports <ports...>
#
# A port passes when netstat shows a listener on the IPv4 (0.0.0.0) or
# IPv6 (::) wildcard address.  Returns 1 and reports the first port
# without such a listener; returns 0 when all ports pass.
# Sets the globals _tcp_listeners and p.
######################################################
ctdb_check_tcp_ports() {
    # Take a single snapshot instead of running netstat once or twice
    # for every port.
    _tcp_listeners=$(netstat -a -t -n)

    for p ; do
        if ! printf '%s\n' "$_tcp_listeners" |
                grep -E -q "(0\.0\.0\.0:|:::)$p .*LISTEN" ; then
            echo "ERROR: $service_name tcp port $p is not responding"
            return 1
        fi
    done
}
189
######################################################
# check that a unix socket is listening
# usage: ctdb_check_unix_socket <socket_path>
#
# Looks for a listening unix socket whose netstat line ends in
# <socket_path>.  Returns 1 with an error message when none is found,
# 0 when one is found or no path was given.
# Sets the global socket_path; $service_name is used in the message.
######################################################
ctdb_check_unix_socket() {
    socket_path="$1"
    if [ -z "$socket_path" ]; then
        return
    fi

    if netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$"; then
        return 0
    fi

    echo "ERROR: $service_name socket $socket_path not found"
    return 1
}
203
######################################################
# check that a command returns zero status
# usage: ctdb_check_command SERVICE_NAME <command>
#
# <command> is run once (word-split, output discarded).  On failure an
# error is printed and the calling shell exits with status 1.
# Sets the globals service_name and wait_cmd.
######################################################
ctdb_check_command() {
  service_name="$1"
  wait_cmd="$2"
  if [ -z "$wait_cmd" ]; then
      return
  fi

  if ! $wait_cmd > /dev/null 2>&1 ; then
      echo "ERROR: $service_name - $wait_cmd returned error"
      exit 1
  fi
}
217
################################################
# kill off any TCP connections with the given IP
# usage: kill_tcp_connections IP
#
# Every ESTABLISHED connection netstat reports for IP (plain or as an
# IPv4-mapped ::ffff: address) is handed to "ctdb killtcp": in one
# direction only when the local port is 139 or 445 (CIFS), in both
# directions otherwise.  Afterwards waits up to about 4 seconds for the
# connections to disappear.  Always returns 0; problems are reported on
# stdout only.
################################################
kill_tcp_connections() {
    _IP="$1"
    _failed=0
    _killcount=0

    # One "local remote" pair per line.  The list is kept in a variable
    # rather than a predictably named file below $CTDB_BASE/state, so
    # nothing depends on that directory existing.
    _connections=$(netstat -tn |
        grep -E "^tcp.*[[:space:]]+(::ffff:)?$_IP:.*ESTABLISHED" |
        awk '{print $4" "$5}')

    while read -r dest src; do
        # an empty list still yields one empty here-document line
        [ -n "$dest" ] || continue
        # split "address:port" at the last colon
        srcip="${src%:*}"
        srcport="${src##*:}"
        destip="${dest%:*}"
        destport="${dest##*:}"
        echo "Killing TCP connection $srcip:$srcport $destip:$destport"
        ctdb killtcp "$srcip:$srcport" "$destip:$destport" >/dev/null 2>&1 || _failed=1
        case "$destport" in
            # we only do one-way killtcp for CIFS
            139|445) : ;;
            # for all others we do 2-way
            *)
                ctdb killtcp "$destip:$destport" "$srcip:$srcport" >/dev/null 2>&1 || _failed=1
                ;;
        esac
        _killcount=$((_killcount + 1))
    done <<EOF
$_connections
EOF

    if [ "$_failed" -ne 0 ]; then
        echo "Failed to send killtcp control"
        return 0
    fi
    # nothing was connected: not an error
    [ "$_killcount" -gt 0 ] || return 0

    _count=0
    while netstat -tn | grep -E "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" > /dev/null; do
        sleep 1
        _count=$((_count + 1))
        if [ "$_count" -gt 3 ]; then
            echo "Timed out killing tcp connections for IP $_IP"
            return 0
        fi
    done
    echo "killed $_killcount TCP connections to released IP $_IP"
}
267
##################################################################
# kill off the local end for any TCP connections with the given IP
# usage: kill_tcp_connections_local_only IP
#
# Every ESTABLISHED connection netstat reports for IP (plain or as an
# IPv4-mapped ::ffff: address) is handed to "ctdb killtcp" once, as
# "remote local".  Afterwards waits up to about 4 seconds for the
# connections to disappear.  Always returns 0; problems are reported on
# stdout only.
##################################################################
kill_tcp_connections_local_only() {
    _IP="$1"
    _failed=0
    _killcount=0

    # One "local remote" pair per line.  The list is kept in a variable
    # rather than a predictably named file below $CTDB_BASE/state, so
    # nothing depends on that directory existing.
    _connections=$(netstat -tn |
        grep -E "^tcp.*[[:space:]]+(::ffff:)?$_IP:.*ESTABLISHED" |
        awk '{print $4" "$5}')

    while read -r dest src; do
        # an empty list still yields one empty here-document line
        [ -n "$dest" ] || continue
        # split "address:port" at the last colon
        srcip="${src%:*}"
        srcport="${src##*:}"
        destip="${dest%:*}"
        destport="${dest##*:}"
        echo "Killing TCP connection $srcip:$srcport $destip:$destport"
        ctdb killtcp "$srcip:$srcport" "$destip:$destport" >/dev/null 2>&1 || _failed=1
        _killcount=$((_killcount + 1))
    done <<EOF
$_connections
EOF

    if [ "$_failed" -ne 0 ]; then
        echo "Failed to send killtcp control"
        return 0
    fi
    # nothing was connected: not an error
    [ "$_killcount" -gt 0 ] || return 0

    _count=0
    while netstat -tn | grep -E "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" > /dev/null; do
        sleep 1
        _count=$((_count + 1))
        if [ "$_count" -gt 3 ]; then
            echo "Timed out killing tcp connections for IP $_IP"
            return 0
        fi
    done
    echo "killed $_killcount TCP connections to released IP $_IP"
}
309
##################################################################
# tickle any TCP connections with the given IP
# usage: tickle_tcp_connections IP
#
# Every ESTABLISHED connection netstat reports for IP (plain or as an
# IPv4-mapped ::ffff: address) is handed to "ctdb tickle" in both
# directions.  Always returns 0; a failing tickle is reported on stdout
# only.
##################################################################
tickle_tcp_connections() {
    _IP="$1"
    _failed=0

    # One "local remote" pair per line.  The list is kept in a variable
    # rather than a predictably named file below $CTDB_BASE/state, so
    # nothing depends on that directory existing.
    _connections=$(netstat -tn |
        grep -E "^tcp.*[[:space:]]+(::ffff:)?$_IP:.*ESTABLISHED" |
        awk '{print $4" "$5}')

    while read -r dest src; do
        # an empty list still yields one empty here-document line
        [ -n "$dest" ] || continue
        # split "address:port" at the last colon
        srcip="${src%:*}"
        srcport="${src##*:}"
        destip="${dest%:*}"
        destport="${dest##*:}"
        echo "Tickle TCP connection $srcip:$srcport $destip:$destport"
        ctdb tickle "$srcip:$srcport" "$destip:$destport" >/dev/null 2>&1 || _failed=1
        echo "Tickle TCP connection $destip:$destport $srcip:$srcport"
        ctdb tickle "$destip:$destport" "$srcip:$srcport" >/dev/null 2>&1 || _failed=1
    done <<EOF
$_connections
EOF

    if [ "$_failed" -ne 0 ]; then
        echo "Failed to send tickle control"
    fi
    return 0
}
339
########################################################
# start/stop the nfs service on different platforms
# usage: startstop_nfs start|stop
#
# The platform is "sles" when /etc/init.d/nfsserver is executable and
# "rhel" when /etc/init.d/nfslock is (rhel wins if both are present).
# On any other platform an error is printed and the calling shell
# exits with status 1.  Sets the global PLATFORM.
########################################################
startstop_nfs() {
    PLATFORM="unknown"
    if [ -x /etc/init.d/nfsserver ]; then
        PLATFORM="sles"
    fi
    if [ -x /etc/init.d/nfslock ]; then
        PLATFORM="rhel"
    fi

    case "$PLATFORM:$1" in
        sles:start)
            service nfsserver start
            ;;
        sles:stop)
            service nfsserver stop > /dev/null 2>&1
            ;;
        rhel:start)
            service nfslock start
            service nfs start
            ;;
        rhel:stop)
            service nfs stop > /dev/null 2>&1
            service nfslock stop > /dev/null 2>&1
            ;;
        sles:*|rhel:*)
            # known platform, unknown operation: nothing to do
            ;;
        *)
            echo "Unknown platform. NFS is not supported with ctdb"
            exit 1
            ;;
    esac
}
381
########################################################
# start/stop the nfs lockmanager service on different platforms
# usage: startstop_nfslock start|stop
#
# The platform is "sles" when /etc/init.d/nfsserver is executable and
# "rhel" when /etc/init.d/nfslock is (rhel wins if both are present).
# On any other platform an error is printed and the calling shell
# exits with status 1.  Sets the global PLATFORM.
########################################################
startstop_nfslock() {
    PLATFORM="unknown"
    if [ -x /etc/init.d/nfsserver ]; then
        PLATFORM="sles"
    fi
    if [ -x /etc/init.d/nfslock ]; then
        PLATFORM="rhel"
    fi

    case "$PLATFORM:$1" in
        # for sles there is no service for lockmanager
        # so we instead just shutdown/restart nfs
        sles:start)
            service nfsserver start
            ;;
        sles:stop)
            service nfsserver stop > /dev/null 2>&1
            ;;
        rhel:start)
            service nfslock start
            ;;
        rhel:stop)
            service nfslock stop > /dev/null 2>&1
            ;;
        sles:*|rhel:*)
            # known platform, unknown operation: nothing to do
            ;;
        *)
            echo "Unknown platform. NFS locking is not supported with ctdb"
            exit 1
            ;;
    esac
}
423
########################################################
# remove an ip address from an interface
# usage: remove_ip ADDRESS/MASKBITS INTERFACE
#
# Sets the global "failed" to 1 when deleting the address or re-adding
# a secondary fails; it is never reset here.  Also sets the globals
# secondaries and i.
########################################################
remove_ip() {
    # Deleting a primary address makes the ip tool drop every secondary
    # address as well.  Remember the secondaries first so that the ones
    # that vanished can be put back afterwards.
    secondaries=""
    if ip addr list dev $2 primary | grep -q "inet $1 " ; then
        secondaries=$(ip addr list dev $2 secondary | grep " inet " | awk '{print $2}')
    fi

    if ! ip addr del $1 dev $2 >/dev/null 2>&1 ; then
        failed=1
    fi

    if [ -n "$secondaries" ]; then
        for i in $secondaries; do
            if ! ip addr list dev $2 | grep -q "inet $i" ; then
                echo "re-adding secondary address $i to dev $2"
                ip addr add $i dev $2 || failed=1
            else
                echo "kept secondary $i on dev $2"
            fi
        done
    fi
}
447
########################################################
# some simple logic for counting events - per eventscript
# usage: ctdb_counter_init
#        ctdb_counter_incr
#        ctdb_check_counter_limit [<limit> [<quiet>]]
#
# The count is kept in unary: the counter file
# $ctdb_fail_dir/$service_name holds one byte per recorded event.
# <limit> defaults to $service_fail_limit.
# ctdb_check_counter_limit exits the calling shell with status 1 when
# count >= <limit>; below the limit it prints a warning for a non-zero
# count unless <quiet> is non-empty.
########################################################

# set _counter_file and make sure its directory exists
_ctdb_counter_common () {
    _counter_file="$ctdb_fail_dir/$service_name"
    mkdir -p "${_counter_file%/*}" # dirname
}

# reset the count to zero
ctdb_counter_init () {
    _ctdb_counter_common

    : >"$_counter_file"
}

# add one to the count
ctdb_counter_incr () {
    _ctdb_counter_common

    # unary counting!
    # printf rather than "echo -n": not every echo understands -n, and
    # one that does not would append "-n 1" plus a newline per event.
    printf '1' >> "$_counter_file"
}

# compare the count against the limit
ctdb_check_counter_limit () {
    _ctdb_counter_common

    _limit="${1:-${service_fail_limit}}"
    _quiet="$2"

    # unary counting!  A missing counter file counts as zero.
    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
    if [ "$_size" -ge "$_limit" ] ; then
        echo "ERROR: more than $_limit consecutive failures for $service_name, marking cluster unhealthy"
        exit 1
    elif [ "$_size" -gt 0 ] && [ -z "$_quiet" ] ; then
        echo "WARNING: less than $_limit consecutive failures ($_size) for $service_name, not unhealthy yet"
    fi
}
########################################################
486
487 ctdb_spool_dir="/var/spool/ctdb"
488 ctdb_status_dir="$ctdb_spool_dir/status"
489 ctdb_fail_dir="$ctdb_spool_dir/failcount"
490 ctdb_active_dir="$ctdb_spool_dir/active"
491
# Print a one-line report for a status file.
# usage: log_status_cat STATUS FILE
# STATUS is the node state to report; the contents of FILE are appended
# as the problem description.  $script_name names the reporting script.
log_status_cat ()
{
    # FILE is quoted so a path containing whitespace is read as one file
    echo "node is \"$1\", problem with \"${script_name}\": $(cat "$2")"
}
496
# Report the status recorded for the current script.
# Returns 1 (after logging) when a readable "unhealthy" file exists in
# $ctdb_status_dir/$script_name, 2 when a readable "banned" file
# exists, and 0 otherwise.  "unhealthy" takes precedence over "banned".
ctdb_checkstatus ()
{
    if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
        log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
        return 1
    fi

    if [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
        log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
        return 2
    fi

    return 0
}
509
# Record or clear the status of the current script.
# usage: ctdb_setstatus unhealthy|banned FILE
#        ctdb_setstatus <anything else>
# For "unhealthy" or "banned" the contents of FILE are copied to
# $ctdb_status_dir/$script_name/<status>.  Any other first argument
# removes both status files.  Sets the globals d and i.
ctdb_setstatus ()
{
    d="$ctdb_status_dir/$script_name"

    if [ "$1" = "unhealthy" ] || [ "$1" = "banned" ] ; then
        mkdir -p "$d"
        cat "$2" >"$d/$1"
    else
        for i in "banned" "unhealthy" ; do
            rm -f "$d/$i"
        done
    fi
}
525
# The "reconfigure" flag of a service is the file
# $ctdb_status_dir/$service_name/reconfigure; only its existence matters.

# succeed when the reconfigure flag is set
ctdb_service_needs_reconfigure ()
{
    test -e "$ctdb_status_dir/$service_name/reconfigure"
}

# set the reconfigure flag (also sets the global d)
ctdb_service_set_reconfigure ()
{
    d="$ctdb_status_dir/$service_name"
    mkdir -p "$d"
    : >"$d/reconfigure"
}

# clear the reconfigure flag
ctdb_service_unset_reconfigure ()
{
    rm -f "$ctdb_status_dir/$service_name/reconfigure"
}
542
# Reconfigure the current service, then clear its reconfigure flag and
# reset its failure counter.
# Runs the shell code in $service_reconfigure when that is set,
# otherwise restarts $service_name through service().
ctdb_service_reconfigure ()
{
    if [ -n "$service_reconfigure" ] ; then
        # quoted: the code must reach eval exactly as configured, not
        # word-split and glob-expanded first
        eval "$service_reconfigure"
    else
        service "$service_name" restart
    fi
    ctdb_service_unset_reconfigure
    ctdb_counter_init
}
553
# Helper for is_ctdb_managed_service.
# usage: ctdb_compat_managed_service VALUE SERVICE
# Appends SERVICE (space-delimited) to the global list $t when VALUE is
# exactly "yes".
ctdb_compat_managed_service ()
{
    [ "$1" != "yes" ] || t="$t $2 "
}
560
# Succeed when $service_name is managed by ctdb: either it is listed in
# $CTDB_MANAGED_SERVICES (space-separated) or its CTDB_MANAGES_* variable
# is "yes".  Sets the global t to the combined list.
is_ctdb_managed_service ()
{
    t=" $CTDB_MANAGED_SERVICES "

    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
    ctdb_compat_managed_service "$CTDB_MANAGES_SCP"      "scp"
    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
    # "windbind" is a historical misspelling of winbind; it is still
    # honoured so that configurations written against it keep working.
    ctdb_compat_managed_service "$CTDB_MANAGES_WINDBIND" "windbind"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"

    # Returns 0 if "<space>$service_name<space>" appears in $t
    [ "${t#* ${service_name} }" != "${t}" ]
}
577
# Bring the running state of $service_name in line with whether ctdb
# manages it.  The marker file $ctdb_active_dir/$service_name records
# that the service was started from here.
#  - managed, no marker:   start the service, create the marker, exit 0
#  - managed, marker:      nothing to do, return to the caller
#  - unmanaged, marker:    stop the service, remove the marker, exit 0
#  - unmanaged, no marker: exit 0
# A failing start or stop exits the calling shell with that status.
ctdb_start_stop_service ()
{
    _active="$ctdb_active_dir/$service_name"

    if ! is_ctdb_managed_service ; then
        if [ -e "$_active" ] ; then
            echo "Stopping service $service_name"
            ctdb_service_stop || exit $?
            rm -f "$_active"
        fi
        exit 0
    fi

    # managed and already started: carry on with the event
    if [ -e "$_active" ] ; then
        return 0
    fi

    echo "Starting service $service_name"
    ctdb_service_start || exit $?
    mkdir -p "$ctdb_active_dir"
    touch "$_active"
    exit 0
}
599
# Start the current service and reset its failure counter.
# Runs the shell code in $service_start when that is set, otherwise
# starts $service_name through service().
# NOTE(review): the return status is that of ctdb_counter_init, so a
# failing start command is not reported to the caller - confirm whether
# callers such as ctdb_start_stop_service expect it to be.
ctdb_service_start ()
{
    if [ -n "$service_start" ] ; then
        # quoted: the code must reach eval exactly as configured, not
        # word-split and glob-expanded first
        eval "$service_start"
    else
        service "$service_name" start
    fi
    ctdb_counter_init
}
609
# Stop the current service.
# Runs the shell code in $service_stop when that is set, otherwise stops
# $service_name through service().  Returns the status of whichever
# command was run.
ctdb_service_stop ()
{
    if [ -n "$service_stop" ] ; then
        # quoted: the code must reach eval exactly as configured, not
        # word-split and glob-expanded first
        eval "$service_stop"
    else
        service "$service_name" stop
    fi
}
618
# Handle the events every script treats the same way.
# usage: ctdb_standard_event_handler EVENT [ARGS...]
#   status              run ctdb_checkstatus and exit with its status
#   setstatus ARGS...   run ctdb_setstatus ARGS... and exit with its status
# Any other event is left to the caller: the function simply returns.
ctdb_standard_event_handler ()
{
    case "$1" in
        status)
            ctdb_checkstatus
            exit
            ;;
        setstatus)
            # Drop the event name: ctdb_setstatus expects the status
            # ("unhealthy", "banned", ...) as its first argument.
            shift
            ctdb_setstatus "$@"
            exit
            ;;
    esac
}
632
########################################################
# load site local configuration
#
# $CTDB_BASE/rc.local is sourced when it is executable, followed by
# every executable file in $CTDB_BASE/rc.local.d.
########################################################

if [ -x $CTDB_BASE/rc.local ]; then
    . $CTDB_BASE/rc.local
fi

if [ -d $CTDB_BASE/rc.local.d ]; then
    for i in $CTDB_BASE/rc.local.d/* ; do
        if [ -x "$i" ]; then
            . "$i"
        fi
    done
fi
646
647 script_name="${0##*/}"       # basename
648 service_name="$script_name"  # default is just the script name
649 service_fail_limit=1