From 2cb04747887674def299e574fccb827c1c3194e7 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg
Date: Mon, 28 Sep 2009 14:06:40 +1000
Subject: [PATCH] add a new eventscript 01.reclock

if the reclock file has been set, then this script will test that the
reclock file can actually be accessed.
if the file does not exist, or if the attempt to stat the file hangs,
the node will be marked unhealthy after the third failed monitoring
event and after the tenth failure, ctdb itself will shut down.
---
 Makefile.in                |  1 +
 config/events.d/01.reclock | 66 ++++++++++++++++++++++++++++++++++++++
 packaging/RPM/ctdb.spec    |  1 +
 3 files changed, 68 insertions(+)
 create mode 100755 config/events.d/01.reclock

diff --git a/Makefile.in b/Makefile.in
index cda8ec22..88f6eaef 100755
--- a/Makefile.in
+++ b/Makefile.in
@@ -207,6 +207,7 @@ install: all
 	${INSTALLCMD} -m 644 config/events.d/README $(DESTDIR)$(docdir)/ctdb/README.eventscripts
 	${INSTALLCMD} -m 644 doc/recovery-process.txt $(DESTDIR)$(docdir)/ctdb/recovery-process.txt
 	${INSTALLCMD} -m 755 config/events.d/00.ctdb $(DESTDIR)$(etcdir)/ctdb/events.d
+	${INSTALLCMD} -m 755 config/events.d/01.reclock $(DESTDIR)$(etcdir)/ctdb/events.d
 	${INSTALLCMD} -m 755 config/events.d/10.interface $(DESTDIR)$(etcdir)/ctdb/events.d
 	${INSTALLCMD} -m 755 config/events.d/11.natgw $(DESTDIR)$(etcdir)/ctdb/events.d
 	${INSTALLCMD} -m 755 config/events.d/11.routing $(DESTDIR)$(etcdir)/ctdb/events.d
diff --git a/config/events.d/01.reclock b/config/events.d/01.reclock
new file mode 100755
index 00000000..7ca71edc
--- /dev/null
+++ b/config/events.d/01.reclock
@@ -0,0 +1,66 @@
+#!/bin/sh
+# script to check accessibility to the reclock file on a node
+
+. "$CTDB_BASE/functions"
+loadconfig ctdb
+
+cmd="$1"
+shift
+
+PATH=/usr/bin:/bin:/usr/sbin:/sbin:$PATH
+
+# The size of this file represents the number of consecutive monitor
+# intervals in which we have tried, but failed, to stat the reclock file.
+# After the third failure the node becomes unhealthy.
+# After the tenth failure we shut down ctdbd.
+RECLOCKCOUNT="$CTDB_BASE/state/reclock-fail-count"
+
+case "$cmd" in
+    startup)
+        # make sure the state directory exists, then reset the counter
+        mkdir -p "$CTDB_BASE/state"
+        : > "$RECLOCKCOUNT"
+        ;;
+
+    monitor)
+        # assume failure and grow the counter by one byte; it is only
+        # reset when the background stat below succeeds
+        printf 1 >> "$RECLOCKCOUNT"
+
+        # the counter value is the file size in bytes (tr strips the
+        # padding that some wc implementations emit)
+        COUNT=$(wc -c < "$RECLOCKCOUNT" | tr -d ' ')
+        COUNT=${COUNT:-0}
+
+        if [ "$COUNT" -gt 9 ]; then
+            echo "Reclock file can not be accessed. Shutting down."
+            sleep 1
+            ctdb shutdown
+            # nothing more to check once shutdown has been requested
+            exit 1
+        fi
+
+        RECLOCKFILE=$(ctdb -Y getreclock)
+        if [ -z "$RECLOCKFILE" ]; then
+            # we are not using a reclock file
+            : > "$RECLOCKCOUNT"
+            exit 0
+        fi
+
+        # try to stat the reclock file as a background process so that
+        # we don't block in case the cluster filesystem is unavailable
+        (
+            if stat "$RECLOCKFILE"; then
+                # we could stat the file, reset the counter
+                : > "$RECLOCKCOUNT"
+            fi
+        ) >/dev/null 2>&1 &
+
+        if [ "$COUNT" -gt 2 ]; then
+            echo "Reclock file can not be accessed. Mark node UNHEALTHY."
+            exit 1
+        fi
+        ;;
+esac
+
+exit 0
diff --git a/packaging/RPM/ctdb.spec b/packaging/RPM/ctdb.spec
index 7655e309..0954f74b 100644
--- a/packaging/RPM/ctdb.spec
+++ b/packaging/RPM/ctdb.spec
@@ -104,6 +104,7 @@ fi
 %{_docdir}/ctdb/README.eventscripts
 %{_docdir}/ctdb/recovery-process.txt
 %{_sysconfdir}/ctdb/events.d/00.ctdb
+%{_sysconfdir}/ctdb/events.d/01.reclock
 %{_sysconfdir}/ctdb/events.d/10.interface
 %{_sysconfdir}/ctdb/events.d/11.natgw
 %{_sysconfdir}/ctdb/events.d/11.routing
-- 
2.34.1