ctdb-daemon: Rename struct ctdb_scripts_wire to ctdb_script_list_old
[obnox/samba/samba-obnox.git] / ctdb / server / eventscript.c
index 722ebec6a9e475c9cecc9871da20ad27d39bb5c1..0286c610aeb5bc006a21ed006f0d1269df8dec32 100644 (file)
@@ -1,4 +1,4 @@
-/* 
+/*
    event script handling
 
    Copyright (C) Andrew Tridgell  2007
@@ -7,63 +7,67 @@
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.
-   
+
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-   
+
    You should have received a copy of the GNU General Public License
    along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
 
-#include "includes.h"
-#include <time.h>
+#include "replace.h"
 #include "system/filesys.h"
+#include "system/network.h"
 #include "system/wait.h"
 #include "system/dir.h"
 #include "system/locale.h"
-#include "../include/ctdb_private.h"
-#include "lib/tevent/tevent.h"
-#include "../common/rb_tree.h"
+#include "system/time.h"
 
-static void ctdb_event_script_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p);
+#include <talloc.h>
+#include <tevent.h>
 
-/*
-  ctdbd sends us a SIGTERM when we should die.
- */
-static void sigterm(int sig)
-{
-       /* all the child processes will be running in the same process group */
-       kill(-getpgrp(), SIGKILL);
-       _exit(1);
-}
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_logging.h"
+
+#include "common/rb_tree.h"
+#include "common/system.h"
+#include "common/common.h"
+
+
+static void ctdb_event_script_timeout(struct tevent_context *ev,
+                                     struct tevent_timer *te,
+                                     struct timeval t, void *p);
 
 /* This is attached to the event script state. */
 struct event_script_callback {
-       struct ctdb_event_script_state *state;
+       struct event_script_callback *next, *prev;
+       struct ctdb_context *ctdb;
 
        /* Warning: this can free us! */
        void (*fn)(struct ctdb_context *, int, void *);
        void *private_data;
 };
-       
 
 struct ctdb_event_script_state {
        struct ctdb_context *ctdb;
        struct event_script_callback *callback;
        pid_t child;
        int fd[2];
-       bool from_user;
        enum ctdb_eventscript_call call;
        const char *options;
        struct timeval timeout;
-       
+
        unsigned int current;
-       struct ctdb_scripts_wire *scripts;
+       struct ctdb_script_list_old *scripts;
 };
 
-static struct ctdb_script_wire *get_current_script(struct ctdb_event_script_state *state)
+static struct ctdb_script *get_current_script(struct ctdb_event_script_state *state)
 {
        return &state->scripts->scripts[state->current];
 }
@@ -75,7 +79,7 @@ static void log_event_script_output(const char *str, uint16_t len, void *p)
 {
        struct ctdb_event_script_state *state
                = talloc_get_type(p, struct ctdb_event_script_state);
-       struct ctdb_script_wire *current;
+       struct ctdb_script *current;
        unsigned int slen, min;
 
        /* We may have been aborted to run something else.  Discard */
@@ -110,10 +114,33 @@ int32_t ctdb_control_get_event_script_status(struct ctdb_context *ctdb,
        return 0;
 }
 
-struct ctdb_script_tree_item {
-       const char *name;
-       int error;
-};
+/* scandir() filter: return 0 to ignore a directory entry, non-zero to keep it */
+static int script_filter(const struct dirent *de)
+{
+       int namelen = strlen(de->d_name);
+
+       /* Ignore . and .. */
+       if (namelen < 3) {
+               return 0;
+       }
+
+       /* Skip temporary files left behind by emacs */
+       if (de->d_name[namelen-1] == '~') {
+               return 0;
+       }
+
+       /* Filename should start with [0-9][0-9]. */
+       if (!isdigit(de->d_name[0]) || !isdigit(de->d_name[1]) ||
+           de->d_name[2] != '.') {
+               return 0;
+       }
+
+       if (namelen > MAX_SCRIPT_NAME) {
+               return 0;
+       }
+
+       return 1;
+}
 
 /* Return true if OK, otherwise set errno. */
 static bool check_executable(const char *dir, const char *name)
@@ -143,236 +170,142 @@ static bool check_executable(const char *dir, const char *name)
        return true;
 }
 
-static struct ctdb_scripts_wire *ctdb_get_script_list(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
+static struct ctdb_script_list_old *ctdb_get_script_list(
+                                               struct ctdb_context *ctdb,
+                                               TALLOC_CTX *mem_ctx)
 {
-       DIR *dir;
-       struct dirent *de;
-       struct stat st;
-       trbt_tree_t *tree;
-       struct ctdb_scripts_wire *scripts;
-       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
-       struct ctdb_script_tree_item *tree_item;
-       int count;
-
-       /*
-         the service specific event scripts 
-       */
-       if (stat(ctdb->event_script_dir, &st) != 0 && 
-           errno == ENOENT) {
-               DEBUG(DEBUG_CRIT,("No event script directory found at '%s'\n", ctdb->event_script_dir));
-               talloc_free(tmp_ctx);
-               return NULL;
-       }
-
-       /* create a tree to store all the script names in */
-       tree = trbt_create(tmp_ctx, 0);
+       struct dirent **namelist;
+       struct ctdb_script_list_old *scripts;
+       int i, count;
 
-       /* scan all directory entries and insert all valid scripts into the 
+       /* scan all directory entries and insert all valid scripts into the
           tree
        */
-       dir = opendir(ctdb->event_script_dir);
-       if (dir == NULL) {
-               DEBUG(DEBUG_CRIT,("Failed to open event script directory '%s'\n", ctdb->event_script_dir));
-               talloc_free(tmp_ctx);
+       count = scandir(ctdb->event_script_dir, &namelist, script_filter, alphasort);
+       if (count == -1) {
+               DEBUG(DEBUG_CRIT, ("Failed to read event script directory '%s' - %s\n",
+                                  ctdb->event_script_dir, strerror(errno)));
                return NULL;
        }
 
-       count = 0;
-       while ((de=readdir(dir)) != NULL) {
-               int namlen;
-               unsigned num;
-
-               namlen = strlen(de->d_name);
-
-               if (namlen < 3) {
-                       continue;
-               }
-
-               if (de->d_name[namlen-1] == '~') {
-                       /* skip files emacs left behind */
-                       continue;
-               }
-
-               if (de->d_name[2] != '.') {
-                       continue;
-               }
-
-               if (sscanf(de->d_name, "%02u.", &num) != 1) {
-                       continue;
-               }
-
-               if (strlen(de->d_name) > MAX_SCRIPT_NAME) {
-                       DEBUG(DEBUG_ERR,("Script name %s too long! %u chars max",
-                                        de->d_name, MAX_SCRIPT_NAME));
-                       continue;
-               }
-
-               tree_item = talloc(tree, struct ctdb_script_tree_item);
-               if (tree_item == NULL) {
-                       DEBUG(DEBUG_ERR, (__location__ " Failed to allocate new tree item\n"));
-                       talloc_free(tmp_ctx);
-                       return NULL;
-               }
-       
-               tree_item->error = 0;
-               if (!check_executable(ctdb->event_script_dir, de->d_name)) {
-                       tree_item->error = errno;
-               }
-
-               tree_item->name = talloc_strdup(tree_item, de->d_name);
-               if (tree_item->name == NULL) {
-                       DEBUG(DEBUG_ERR,(__location__ " Failed to allocate script name.\n"));
-                       talloc_free(tmp_ctx);
-                       return NULL;
-               }
-
-               /* store the event script in the tree */
-               trbt_insert32(tree, (num<<16)|count++, tree_item);
-       }
-       closedir(dir);
-
        /* Overallocates by one, but that's OK */
-       scripts = talloc_zero_size(tmp_ctx,
+       scripts = talloc_zero_size(mem_ctx,
                                   sizeof(*scripts)
                                   + sizeof(scripts->scripts[0]) * count);
        if (scripts == NULL) {
                DEBUG(DEBUG_ERR, (__location__ " Failed to allocate scripts\n"));
-               talloc_free(tmp_ctx);
-               return NULL;
+               goto done;
        }
        scripts->num_scripts = count;
 
-       for (count = 0; count < scripts->num_scripts; count++) {
-               tree_item = trbt_findfirstarray32(tree, 1);
+       for (i = 0; i < count; i++) {
+               struct ctdb_script *s = &scripts->scripts[i];
 
-               strcpy(scripts->scripts[count].name, tree_item->name);
-               scripts->scripts[count].status = -tree_item->error;
+               if (strlcpy(s->name, namelist[i]->d_name, sizeof(s->name)) >=
+                   sizeof(s->name)) {
+                       s->status = -ENAMETOOLONG;
+                       continue;
+               }
 
-               /* remove this script from the tree */
-               talloc_free(tree_item);
+               s->status = 0;
+               if (!check_executable(ctdb->event_script_dir,
+                                     namelist[i]->d_name)) {
+                       s->status = -errno;
+               }
        }
 
-       talloc_steal(mem_ctx, scripts);
-       talloc_free(tmp_ctx);
+done:
+       for (i=0; i<count; i++) {
+               free(namelist[i]);
+       }
+       free(namelist);
        return scripts;
 }
 
-static int child_setup(struct ctdb_context *ctdb)
-{
-       if (setpgid(0,0) != 0) {
-               int ret = -errno;
-               DEBUG(DEBUG_ERR,("Failed to create process group for event scripts - %s\n",
-                        strerror(errno)));
-               return ret;
-       }
 
-       signal(SIGTERM, sigterm);
-       return 0;
-}
+/* There cannot be more than 10 arguments to command helper. */
+#define MAX_HELPER_ARGS                (10)
 
-static char *child_command_string(struct ctdb_context *ctdb,
-                                      TALLOC_CTX *ctx,
-                                      bool from_user,
-                                      const char *scriptname,
-                                      enum ctdb_eventscript_call call,
-                                      const char *options)
+static bool child_helper_args(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
+                             enum ctdb_eventscript_call call,
+                             const char *options,
+                             struct ctdb_script *current, int fd,
+                             int *argc, const char ***argv)
 {
-       const char *str = from_user ? "CTDB_CALLED_BY_USER=1 " : "";
-
-       /* Allow a setting where we run the actual monitor event
-          from an external source and replace it with
-          a "status" event that just picks up the actual
-          status of the event asynchronously.
-       */
-       if ((ctdb->tunable.use_status_events_for_monitoring != 0)
-           &&  (call == CTDB_EVENT_MONITOR)
-           &&  !from_user) {
-               return talloc_asprintf(ctx, "%s%s/%s %s",
-                                      str,
-                                      ctdb->event_script_dir,
-                                      scriptname, "status");
-       } else {
-               return talloc_asprintf(ctx, "%s%s/%s %s %s",
-                                      str,
-                                      ctdb->event_script_dir,
-                                      scriptname,
-                                      ctdb_eventscript_call_names[call],
-                                      options);
+       const char **tmp;
+       int n, i;
+       char *t, *saveptr, *opt;
+
+       tmp = talloc_array(mem_ctx, const char *, 10+1);
+       if (tmp == NULL)  goto failed;
+
+       tmp[0] = talloc_asprintf(tmp, "%d", fd);
+       tmp[1] = talloc_asprintf(tmp, "%s/%s", ctdb->event_script_dir, current->name);
+       tmp[2] = talloc_asprintf(tmp, "%s", ctdb_eventscript_call_names[call]);
+       n = 3;
+
+       /* Split options into individual arguments */
+       opt = talloc_strdup(mem_ctx, options);
+       if (opt == NULL) {
+               goto failed;
        }
-}
 
-static int child_run_one(struct ctdb_context *ctdb,
-                        const char *scriptname, const char *cmdstr)
-{
-       int ret;
-
-       ret = system(cmdstr);
-       /* if the system() call was successful, translate ret into the
-          return code from the command
-       */
-       if (ret != -1) {
-               ret = WEXITSTATUS(ret);
-       } else {
-               ret = -errno;
+       t = strtok_r(opt, " ", &saveptr);
+       while (t != NULL) {
+               tmp[n++] = talloc_strdup(tmp, t);
+               if (n > MAX_HELPER_ARGS) {
+                       goto args_failed;
+               }
+               t = strtok_r(NULL, " ", &saveptr);
        }
 
-       /* 127 could mean it does not exist, 126 non-executable. */
-       if (ret == 127 || ret == 126) {
-               /* Re-check it... */
-               if (!check_executable(ctdb->event_script_dir, scriptname)) {
-                       DEBUG(DEBUG_ERR,("Script %s returned status %u. Someone just deleted it?\n",
-                                        cmdstr, ret));
-                       ret = -errno;
+       for (i=0; i<n; i++) {
+               if (tmp[i] == NULL) {
+                       goto failed;
                }
        }
-       return ret;
-}
 
-/*
-  Actually run one event script
-  this function is called and run in the context of a forked child
-  which allows it to do blocking calls such as system()
- */
-static int child_run_script(struct ctdb_context *ctdb,
-                           bool from_user,
-                           enum ctdb_eventscript_call call,
-                           const char *options,
-                           struct ctdb_script_wire *current)
-{
-       char *cmdstr;
-       int ret;
-       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       /* Last argument should be NULL */
+       tmp[n++] = NULL;
 
-       ret = child_setup(ctdb);
-       if (ret != 0)
-               goto out;
+       *argc = n;
+       *argv = tmp;
+       return true;
 
-       cmdstr = child_command_string(ctdb, tmp_ctx, from_user,
-                                     current->name, call, options);
-       CTDB_NO_MEMORY(ctdb, cmdstr);
 
-       DEBUG(DEBUG_DEBUG,("Executing event script %s\n",cmdstr));
+args_failed:
+       DEBUG(DEBUG_ERR, (__location__ " too many arguments '%s' to eventscript '%s'\n",
+                         options, ctdb_eventscript_call_names[call]));
 
-       if (current->status) {
-               ret = current->status;
-               goto out;
+failed:
+       if (tmp) {
+               talloc_free(tmp);
        }
+       return false;
 
-       ret = child_run_one(ctdb, current->name, cmdstr);
-out:
-       talloc_free(tmp_ctx);
-       return ret;
 }
 
-static void ctdb_event_script_handler(struct event_context *ev, struct fd_event *fde,
+static void ctdb_event_script_handler(struct tevent_context *ev,
+                                     struct tevent_fd *fde,
                                      uint16_t flags, void *p);
 
+static char helper_prog[PATH_MAX+1] = "";
+
 static int fork_child_for_script(struct ctdb_context *ctdb,
                                 struct ctdb_event_script_state *state)
 {
        int r;
        struct tevent_fd *fde;
-       struct ctdb_script_wire *current = get_current_script(state);
+       struct ctdb_script *current = get_current_script(state);
+       int argc;
+       const char **argv;
+
+       if (!ctdb_set_helper("event helper", helper_prog, sizeof(helper_prog),
+                            "CTDB_EVENT_HELPER",
+                            CTDB_HELPER_BINDIR, "ctdb_event_helper")) {
+               ctdb_die(ctdb, __location__
+                        " Unable to set event helper\n");
+       }
 
        current->start = timeval_current();
 
@@ -382,40 +315,35 @@ static int fork_child_for_script(struct ctdb_context *ctdb,
                return -errno;
        }
 
-       if (!ctdb_fork_with_logging(state, ctdb, current->name, log_event_script_output,
-                                   state, &state->child)) {
-               r = -errno;
+       /* Arguments for helper */
+       if (!child_helper_args(state, ctdb, state->call, state->options, current,
+                              state->fd[1], &argc, &argv)) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to create arguments for eventscript helper\n"));
+               r = -ENOMEM;
                close(state->fd[0]);
                close(state->fd[1]);
                return r;
        }
 
-       /* If we are the child, do the work. */
-       if (state->child == 0) {
-               int rt;
-
-               debug_extra = talloc_asprintf(NULL, "eventscript-%s-%s:",
-                                             current->name,
-                                             ctdb_eventscript_call_names[state->call]);
+       if (!ctdb_vfork_with_logging(state, ctdb, current->name,
+                                    helper_prog, argc, argv,
+                                    log_event_script_output,
+                                    state, &state->child)) {
+               talloc_free(argv);
+               r = -errno;
                close(state->fd[0]);
-               set_close_on_exec(state->fd[1]);
-
-               rt = child_run_script(ctdb, state->from_user, state->call, state->options, current);
-               /* We must be able to write PIPEBUF bytes at least; if this
-                  somehow fails, the read above will be short. */
-               write(state->fd[1], &rt, sizeof(rt));
                close(state->fd[1]);
-               _exit(rt);
+               return r;
        }
 
+       talloc_free(argv);
+
        close(state->fd[1]);
        set_close_on_exec(state->fd[0]);
 
-       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to child eventscript process\n", state->fd[0]));
-
        /* Set ourselves up to be called when that's done. */
-       fde = event_add_fd(ctdb->ev, state, state->fd[0], EVENT_FD_READ,
-                          ctdb_event_script_handler, state);
+       fde = tevent_add_fd(ctdb->ev, state, state->fd[0], TEVENT_FD_READ,
+                           ctdb_event_script_handler, state);
        tevent_fd_set_auto_close(fde);
 
        return 0;
@@ -424,12 +352,13 @@ static int fork_child_for_script(struct ctdb_context *ctdb,
 /*
  Summarize status of this run of scripts.
  */
-static int script_status(struct ctdb_scripts_wire *scripts)
+static int script_status(struct ctdb_script_list_old *scripts)
 {
        unsigned int i;
 
        for (i = 0; i < scripts->num_scripts; i++) {
                switch (scripts->scripts[i].status) {
+               case -ENAMETOOLONG:
                case -ENOENT:
                case -ENOEXEC:
                        /* Disabled or missing; that's OK. */
@@ -447,12 +376,13 @@ static int script_status(struct ctdb_scripts_wire *scripts)
 }
 
 /* called when child is finished */
-static void ctdb_event_script_handler(struct event_context *ev, struct fd_event *fde, 
+static void ctdb_event_script_handler(struct tevent_context *ev,
+                                     struct tevent_fd *fde,
                                      uint16_t flags, void *p)
 {
-       struct ctdb_event_script_state *state = 
+       struct ctdb_event_script_state *state =
                talloc_get_type(p, struct ctdb_event_script_state);
-       struct ctdb_script_wire *current = get_current_script(state);
+       struct ctdb_script *current = get_current_script(state);
        struct ctdb_context *ctdb = state->ctdb;
        int r, status;
 
@@ -461,9 +391,11 @@ static void ctdb_event_script_handler(struct event_context *ev, struct fd_event
                return;
        }
 
-       r = read(state->fd[0], &current->status, sizeof(current->status));
+       r = sys_read(state->fd[0], &current->status, sizeof(current->status));
        if (r < 0) {
                current->status = -errno;
+       } else if (r == 0) {
+               current->status = -EINTR;
        } else if (r != sizeof(current->status)) {
                current->status = -EIO;
        }
@@ -472,7 +404,7 @@ static void ctdb_event_script_handler(struct event_context *ev, struct fd_event
        /* valgrind gets overloaded if we run next script as it's still doing
         * post-execution analysis, so kill finished child here. */
        if (ctdb->valgrinding) {
-               kill(state->child, SIGKILL);
+               ctdb_kill(ctdb, state->child, SIGKILL);
        }
 
        state->child = 0;
@@ -481,8 +413,12 @@ static void ctdb_event_script_handler(struct event_context *ev, struct fd_event
 
        /* Aborted or finished all scripts?  We're done. */
        if (status != 0 || state->current+1 == state->scripts->num_scripts) {
-               DEBUG(DEBUG_INFO,(__location__ " Eventscript %s %s finished with state %d\n",
-                                 ctdb_eventscript_call_names[state->call], state->options, status));
+               if (status != 0) {
+                       DEBUG(DEBUG_INFO,
+                             ("Eventscript %s %s finished with state %d\n",
+                              ctdb_eventscript_call_names[state->call],
+                              state->options, status));
+               }
 
                ctdb->event_script_timeouts = 0;
                talloc_free(state);
@@ -502,55 +438,119 @@ static void ctdb_event_script_handler(struct event_context *ev, struct fd_event
        }
 }
 
-static void debug_timeout(struct ctdb_event_script_state *state)
+struct debug_hung_script_state {
+       struct ctdb_context *ctdb;
+       pid_t child;
+       enum ctdb_eventscript_call call;
+};
+
+static int debug_hung_script_state_destructor(struct debug_hung_script_state *state)
+{
+       if (state->child) {
+               ctdb_kill(state->ctdb, state->child, SIGKILL);
+       }
+       return 0;
+}
+
+static void debug_hung_script_timeout(struct tevent_context *ev, struct tevent_timer *te,
+                                     struct timeval t, void *p)
+{
+       struct debug_hung_script_state *state =
+               talloc_get_type(p, struct debug_hung_script_state);
+
+       talloc_free(state);
+}
+
+static void debug_hung_script_done(struct tevent_context *ev, struct tevent_fd *fde,
+                                  uint16_t flags, void *p)
+{
+       struct debug_hung_script_state *state =
+               talloc_get_type(p, struct debug_hung_script_state);
+
+       talloc_free(state);
+}
+
+static void ctdb_run_debug_hung_script(struct ctdb_context *ctdb, struct debug_hung_script_state *state)
 {
-       struct ctdb_script_wire *current = get_current_script(state);
-       char *cmd;
        pid_t pid;
-       time_t t;
-       char tbuf[100], buf[200];
-
-       cmd = child_command_string(state->ctdb, state,
-                                  state->from_user, current->name,
-                                  state->call, state->options);
-       CTDB_NO_MEMORY_VOID(state->ctdb, cmd);
-
-       DEBUG(DEBUG_ERR,("Timed out running script '%s' after %.1f seconds pid :%d\n",
-                        cmd, timeval_elapsed(&current->start), state->child));
-       talloc_free(cmd);
-
-       t = time(NULL);
-       strftime(tbuf, sizeof(tbuf)-1, "%Y%m%d%H%M%S",  localtime(&t));
-       sprintf(buf, "{ pstree -p; cat /proc/locks; ls -li /var/ctdb/ /var/ctdb/persistent; }"
-                       " >/tmp/ctdb.event.%s.%d", tbuf, getpid());
-
-       pid = ctdb_fork(state->ctdb);
-       if (pid == 0) {
-               system(buf);
-               /* Now we can kill the child */
-               kill(state->child, SIGTERM);
-               exit(0);
-       }
-       if (pid == -1) {
-               DEBUG(DEBUG_ERR,("Fork for debug script failed : %s\n",
-                                strerror(errno)));
-       } else {
-               DEBUG(DEBUG_ERR,("Logged timedout eventscript : %s\n", buf));
-               /* Don't kill child until timeout done. */
-               state->child = 0;
+       const char * debug_hung_script = CTDB_ETCDIR "/debug-hung-script.sh";
+       int fd[2];
+       struct tevent_timer *ttimer;
+       struct tevent_fd *tfd;
+       const char **argv;
+       int i;
+
+       if (pipe(fd) < 0) {
+               DEBUG(DEBUG_ERR,("Failed to create pipe fd for debug hung script\n"));
+               return;
+       }
+
+       if (getenv("CTDB_DEBUG_HUNG_SCRIPT") != NULL) {
+               debug_hung_script = getenv("CTDB_DEBUG_HUNG_SCRIPT");
+       }
+
+       argv = talloc_array(state, const char *, 5);
+
+       argv[0] = talloc_asprintf(argv, "%d", fd[1]);
+       argv[1] = talloc_strdup(argv, debug_hung_script);
+       argv[2] = talloc_asprintf(argv, "%d", state->child);
+       argv[3] = talloc_strdup(argv, ctdb_eventscript_call_names[state->call]);
+       argv[4] = NULL;
+
+       for (i=0; i<4; i++) {
+               if (argv[i] == NULL) {
+                       close(fd[0]);
+                       close(fd[1]);
+                       talloc_free(argv);
+                       return;
+               }
        }
+
+
+       if (!ctdb_vfork_with_logging(state, ctdb, "Hung-script",
+                                    helper_prog, 5, argv, NULL, NULL, &pid)) {
+               DEBUG(DEBUG_ERR,("Failed to fork a child to track hung event script\n"));
+               talloc_free(argv);
+               close(fd[0]);
+               close(fd[1]);
+               return;
+       }
+
+       talloc_free(argv);
+       close(fd[1]);
+
+       ttimer = tevent_add_timer(ctdb->ev, state,
+                                 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
+                                 debug_hung_script_timeout, state);
+       if (ttimer == NULL) {
+               close(fd[0]);
+               return;
+       }
+
+       tfd = tevent_add_fd(ctdb->ev, state, fd[0], TEVENT_FD_READ,
+                           debug_hung_script_done, state);
+       if (tfd == NULL) {
+               talloc_free(ttimer);
+               close(fd[0]);
+               return;
+       }
+       tevent_fd_set_auto_close(tfd);
 }
 
 /* called when child times out */
-static void ctdb_event_script_timeout(struct event_context *ev, struct timed_event *te, 
+static void ctdb_event_script_timeout(struct tevent_context *ev,
+                                     struct tevent_timer *te,
                                      struct timeval t, void *p)
 {
        struct ctdb_event_script_state *state = talloc_get_type(p, struct ctdb_event_script_state);
        struct ctdb_context *ctdb = state->ctdb;
-       struct ctdb_script_wire *current = get_current_script(state);
+       struct ctdb_script *current = get_current_script(state);
+       struct debug_hung_script_state *debug_state;
 
-       DEBUG(DEBUG_ERR,("Event script timed out : %s %s %s count : %u  pid : %d\n",
-                        current->name, ctdb_eventscript_call_names[state->call], state->options, ctdb->event_script_timeouts, state->child));
+       DEBUG(DEBUG_ERR,("Event script '%s %s %s' timed out after %.1fs, count: %u, pid: %d\n",
+                        current->name, ctdb_eventscript_call_names[state->call], state->options,
+                        timeval_elapsed(&current->start),
+                        ctdb->event_script_timeouts, state->child));
 
        /* ignore timeouts for these events */
        switch (state->call) {
@@ -558,18 +558,33 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve
        case CTDB_EVENT_RECOVERED:
        case CTDB_EVENT_TAKE_IP:
        case CTDB_EVENT_RELEASE_IP:
-       case CTDB_EVENT_STOPPED:
-       case CTDB_EVENT_MONITOR:
-       case CTDB_EVENT_STATUS:
                state->scripts->scripts[state->current].status = 0;
                DEBUG(DEBUG_ERR,("Ignoring hung script for %s call %d\n", state->options, state->call));
                break;
         default:
                state->scripts->scripts[state->current].status = -ETIME;
-               debug_timeout(state);
        }
 
+       debug_state = talloc_zero(ctdb, struct debug_hung_script_state);
+       if (debug_state == NULL) {
+               talloc_free(state);
+               return;
+       }
+
+       /* Save information useful for running debug hung script, so
+        * eventscript state can be freed.
+        */
+       debug_state->ctdb = ctdb;
+       debug_state->child = state->child;
+       debug_state->call = state->call;
+
+       /* This destructor will actually kill the hung event script */
+       talloc_set_destructor(debug_state, debug_hung_script_state_destructor);
+
+       state->child = 0;
        talloc_free(state);
+
+       ctdb_run_debug_hung_script(ctdb, debug_state);
 }
 
 /*
@@ -583,7 +598,7 @@ static int event_script_destructor(struct ctdb_event_script_state *state)
        if (state->child) {
                DEBUG(DEBUG_ERR,(__location__ " Sending SIGTERM to child pid:%d\n", state->child));
 
-               if (kill(state->child, SIGTERM) != 0) {
+               if (ctdb_kill(state->ctdb, state->child, SIGTERM) != 0) {
                        DEBUG(DEBUG_ERR,("Failed to kill child process for eventscript, errno %s(%d)\n", strerror(errno), errno));
                }
        }
@@ -610,9 +625,24 @@ static int event_script_destructor(struct ctdb_event_script_state *state)
                status = 0;
        }
 
+       state->ctdb->active_events--;
+       if (state->ctdb->active_events < 0) {
+               ctdb_fatal(state->ctdb, "Active events < 0");
+       }
+
        /* This is allowed to free us; talloc will prevent double free anyway,
-        * but beware if you call this outside the destructor! */
-       callback = state->callback;
+        * but beware if you call this outside the destructor!
+        * The callback hangs off a different context, so we walk the list
+        * of "active" callbacks until we find the one state points to.
+        * If we can't find it, it means the callback has been removed.
+        */
+       for (callback = state->ctdb->script_callbacks; callback != NULL; callback = callback->next) {
+               if (callback == state->callback) {
+                       break;
+               }
+       }
+
+       state->callback = NULL;
 
        if (callback) {
                /* Make sure destructor doesn't free itself! */
@@ -646,11 +676,8 @@ static bool check_options(enum ctdb_eventscript_call call, const char *options)
        case CTDB_EVENT_STARTUP:
        case CTDB_EVENT_START_RECOVERY:
        case CTDB_EVENT_RECOVERED:
-       case CTDB_EVENT_STOPPED:
        case CTDB_EVENT_MONITOR:
-       case CTDB_EVENT_STATUS:
        case CTDB_EVENT_SHUTDOWN:
-       case CTDB_EVENT_RELOAD:
        case CTDB_EVENT_IPREALLOCATED:
                return count_words(options) == 0;
 
@@ -669,54 +696,23 @@ static bool check_options(enum ctdb_eventscript_call call, const char *options)
 
 static int remove_callback(struct event_script_callback *callback)
 {
-       /* Detach ourselves from the running script state */
-       callback->state->callback = NULL;
+       DLIST_REMOVE(callback->ctdb->script_callbacks, callback);
        return 0;
 }
 
 /*
-  run the event script in the background, calling the callback when 
+  run the event script in the background, calling the callback when
   finished
  */
 static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
                                        const void *mem_ctx,
                                        void (*callback)(struct ctdb_context *, int, void *),
                                        void *private_data,
-                                       bool from_user,
                                        enum ctdb_eventscript_call call,
                                        const char *fmt, va_list ap)
 {
        struct ctdb_event_script_state *state;
 
-       state = talloc(ctdb->event_script_ctx, struct ctdb_event_script_state);
-       CTDB_NO_MEMORY(ctdb, state);
-
-       /* The callback isn't done if the context is freed. */
-       state->callback = talloc(mem_ctx, struct event_script_callback);
-       CTDB_NO_MEMORY(ctdb, state->callback);
-       talloc_set_destructor(state->callback, remove_callback);
-       state->callback->state = state;
-       state->callback->fn = callback;
-       state->callback->private_data = private_data;
-
-       state->ctdb = ctdb;
-       state->from_user = from_user;
-       state->call = call;
-       state->options = talloc_vasprintf(state, fmt, ap);
-       state->timeout = timeval_set(ctdb->tunable.script_timeout, 0);
-       state->scripts = NULL;
-       if (state->options == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " could not allocate state->options\n"));
-               talloc_free(state);
-               return -1;
-       }
-       if (!check_options(state->call, state->options)) {
-               DEBUG(DEBUG_ERR, ("Bad eventscript options '%s' for %s\n",
-                                 ctdb_eventscript_call_names[state->call], state->options));
-               talloc_free(state);
-               return -1;
-       }
-
        if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
                /* we guarantee that only some specifically allowed event scripts are run
                   while in recovery */
@@ -726,7 +722,7 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
                        CTDB_EVENT_START_RECOVERY,
                        CTDB_EVENT_SHUTDOWN,
                        CTDB_EVENT_RELEASE_IP,
-                       CTDB_EVENT_STOPPED
+                       CTDB_EVENT_IPREALLOCATED,
                };
                int i;
                for (i=0;i<ARRAY_SIZE(allowed_calls);i++) {
@@ -735,17 +731,29 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
                if (i == ARRAY_SIZE(allowed_calls)) {
                        DEBUG(DEBUG_ERR,("Refusing to run event scripts call '%s' while in recovery\n",
                                 ctdb_eventscript_call_names[call]));
-                       talloc_free(state);
                        return -1;
                }
        }
 
+       /* Do not run new monitor events if some event is already
+        * running, unless the running event is a monitor event, in
+        * which case running a new one should cancel the old one. */
+       if (call == CTDB_EVENT_MONITOR &&
+           ctdb->active_events > 0 &&
+           ctdb->current_monitor == NULL) {
+               if (callback != NULL) {
+                       callback(ctdb, -ECANCELED, private_data);
+               }
+               return 0;
+       }
+
        /* Kill off any running monitor events to run this event. */
        if (ctdb->current_monitor) {
                struct ctdb_event_script_state *ms = talloc_get_type(ctdb->current_monitor, struct ctdb_event_script_state);
 
-               /* cancel it */
-               if (ms->callback != NULL) {
+               /* Cancel current monitor callback state only if monitoring
+                * context ctdb->monitor->monitor_context has not been freed */
+               if (ms->callback != NULL && !ctdb_stopped_monitoring(ctdb)) {
                        ms->callback->fn(ctdb, -ECANCELED, ms->callback->private_data);
                        talloc_free(ms->callback);
                }
@@ -757,6 +765,36 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
                ctdb->current_monitor = NULL;
        }
 
+       state = talloc(ctdb->event_script_ctx, struct ctdb_event_script_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       /* The callback isn't done if the context is freed. */
+       state->callback = talloc(mem_ctx, struct event_script_callback);
+       CTDB_NO_MEMORY(ctdb, state->callback);
+       DLIST_ADD(ctdb->script_callbacks, state->callback);
+       talloc_set_destructor(state->callback, remove_callback);
+       state->callback->ctdb         = ctdb;
+       state->callback->fn           = callback;
+       state->callback->private_data = private_data;
+
+       state->ctdb = ctdb;
+       state->call = call;
+       state->options = talloc_vasprintf(state, fmt, ap);
+       state->timeout = timeval_set(ctdb->tunable.script_timeout, 0);
+       state->scripts = NULL;
+       if (state->options == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " could not allocate state->options\n"));
+               talloc_free(state);
+               return -1;
+       }
+       if (!check_options(state->call, state->options)) {
+               DEBUG(DEBUG_ERR, ("Bad eventscript options '%s' for '%s'\n",
+                                 state->options,
+                                 ctdb_eventscript_call_names[state->call]));
+               talloc_free(state);
+               return -1;
+       }
+
        DEBUG(DEBUG_INFO,(__location__ " Starting eventscript %s %s\n",
                          ctdb_eventscript_call_names[state->call],
                          state->options));
@@ -770,14 +808,17 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
        state->current = 0;
        state->child = 0;
 
-       if (!from_user && (call == CTDB_EVENT_MONITOR || call == CTDB_EVENT_STATUS)) {
+       if (call == CTDB_EVENT_MONITOR) {
                ctdb->current_monitor = state;
        }
 
        talloc_set_destructor(state, event_script_destructor);
 
+       ctdb->active_events++;
+
        /* Nothing to do? */
        if (state->scripts->num_scripts == 0) {
+               callback(ctdb, 0, private_data);
                talloc_free(state);
                return 0;
        }
@@ -790,7 +831,10 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
        }
 
        if (!timeval_is_zero(&state->timeout)) {
-               event_add_timed(ctdb->ev, state, timeval_current_ofs(state->timeout.tv_sec, state->timeout.tv_usec), ctdb_event_script_timeout, state);
+               tevent_add_timer(ctdb->ev, state,
+                                timeval_current_ofs(state->timeout.tv_sec,
+                                                    state->timeout.tv_usec),
+                                ctdb_event_script_timeout, state);
        } else {
                DEBUG(DEBUG_ERR, (__location__ " eventscript %s %s called with no timeout\n",
                                  ctdb_eventscript_call_names[state->call],
@@ -802,14 +846,13 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
 
 
 /*
-  run the event script in the background, calling the callback when 
+  run the event script in the background, calling the callback when
   finished.  If mem_ctx is freed, callback will never be called.
  */
-int ctdb_event_script_callback(struct ctdb_context *ctdb, 
+int ctdb_event_script_callback(struct ctdb_context *ctdb,
                               TALLOC_CTX *mem_ctx,
                               void (*callback)(struct ctdb_context *, int, void *),
                               void *private_data,
-                              bool from_user,
                               enum ctdb_eventscript_call call,
                               const char *fmt, ...)
 {
@@ -817,7 +860,7 @@ int ctdb_event_script_callback(struct ctdb_context *ctdb,
        int ret;
 
        va_start(ap, fmt);
-       ret = ctdb_event_script_callback_v(ctdb, mem_ctx, callback, private_data, from_user, call, fmt, ap);
+       ret = ctdb_event_script_callback_v(ctdb, mem_ctx, callback, private_data, call, fmt, ap);
        va_end(ap);
 
        return ret;
@@ -848,27 +891,31 @@ int ctdb_event_script_args(struct ctdb_context *ctdb, enum ctdb_eventscript_call
 {
        va_list ap;
        int ret;
-       struct callback_status status;
+       struct callback_status status = {
+               .status = -1,
+               .done = false,
+       };
 
        va_start(ap, fmt);
        ret = ctdb_event_script_callback_v(ctdb, ctdb,
-                       event_script_callback, &status, false, call, fmt, ap);
+                       event_script_callback, &status, call, fmt, ap);
+       va_end(ap);
        if (ret != 0) {
                return ret;
        }
-       va_end(ap);
 
-       status.status = -1;
-       status.done = false;
-
-       while (status.done == false && event_loop_once(ctdb->ev) == 0) /* noop */;
+       while (status.done == false && tevent_loop_once(ctdb->ev) == 0) /* noop */;
 
        if (status.status == -ETIME) {
                DEBUG(DEBUG_ERR, (__location__ " eventscript for '%s' timedout."
                                  " Immediately banning ourself for %d seconds\n",
                                  ctdb_eventscript_call_names[call],
                                  ctdb->tunable.recovery_ban_period));
-               ctdb_ban_self(ctdb);
+
+               /* Don't ban self if CTDB is starting up or shutting down */
+               if (call != CTDB_EVENT_INIT && call != CTDB_EVENT_SHUTDOWN) {
+                       ctdb_ban_self(ctdb);
+               }
        }
 
        return status.status;
@@ -887,19 +934,26 @@ struct eventscript_callback_state {
 /*
   called when a forced eventscript run has finished
  */
-static void run_eventscripts_callback(struct ctdb_context *ctdb, int status, 
+static void run_eventscripts_callback(struct ctdb_context *ctdb, int status,
                                 void *private_data)
 {
-       struct eventscript_callback_state *state = 
-               talloc_get_type(private_data, struct eventscript_callback_state);
+       const char *errmsg = NULL;
 
-       ctdb_enable_monitoring(ctdb);
+       struct eventscript_callback_state *state =
+               talloc_get_type(private_data, struct eventscript_callback_state);
 
        if (status != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Failed to forcibly run eventscripts\n"));
+               if (status == -ECANCELED) {
+                       DEBUG(DEBUG_WARNING,
+                             (__location__ " Eventscript cancelled\n"));
+                       errmsg = "cancelled";
+               } else {
+                       DEBUG(DEBUG_ERR,
+                             (__location__ " Failed to run eventscripts\n"));
+               }
        }
 
-       ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+       ctdb_request_control_reply(ctdb, state->c, NULL, status, errmsg);
        /* This will free the struct ctdb_event_script_state we are in! */
        talloc_free(state);
        return;
@@ -942,7 +996,7 @@ int32_t ctdb_run_eventscripts(struct ctdb_context *ctdb,
        /* Figure out what call they want. */
        options = get_call((const char *)indata.dptr, &call);
        if (!options) {
-               DEBUG(DEBUG_ERR, (__location__ " Invalid forced \"%s\"\n", (const char *)indata.dptr));
+               DEBUG(DEBUG_ERR, (__location__ " Invalid event name \"%s\"\n", (const char *)indata.dptr));
                return -1;
        }
 
@@ -956,16 +1010,13 @@ int32_t ctdb_run_eventscripts(struct ctdb_context *ctdb,
 
        state->c = talloc_steal(state, c);
 
-       DEBUG(DEBUG_NOTICE,("Forced running of eventscripts with arguments %s\n", indata.dptr));
-
-       ctdb_disable_monitoring(ctdb);
+       DEBUG(DEBUG_NOTICE,("Running eventscripts with arguments %s\n", indata.dptr));
 
-       ret = ctdb_event_script_callback(ctdb, 
-                        state, run_eventscripts_callback, state,
-                        true, call, "%s", options);
+       ret = ctdb_event_script_callback(ctdb,
+                        ctdb, run_eventscripts_callback, state,
+                        call, "%s", options);
 
        if (ret != 0) {
-               ctdb_enable_monitoring(ctdb);
                DEBUG(DEBUG_ERR,(__location__ " Failed to run eventscripts with arguments %s\n", indata.dptr));
                talloc_free(state);
                return -1;
@@ -1004,7 +1055,7 @@ int32_t ctdb_control_enable_script(struct ctdb_context *ctdb, TDB_DATA indata)
        }
 
 
-       if (stat(ctdb->event_script_dir, &st) != 0 && 
+       if (stat(ctdb->event_script_dir, &st) != 0 &&
            errno == ENOENT) {
                DEBUG(DEBUG_CRIT,("No event script directory found at '%s'\n", ctdb->event_script_dir));
                talloc_free(tmp_ctx);
@@ -1060,7 +1111,7 @@ int32_t ctdb_control_disable_script(struct ctdb_context *ctdb, TDB_DATA indata)
        }
 
 
-       if (stat(ctdb->event_script_dir, &st) != 0 && 
+       if (stat(ctdb->event_script_dir, &st) != 0 &&
            errno == ENOENT) {
                DEBUG(DEBUG_CRIT,("No event script directory found at '%s'\n", ctdb->event_script_dir));
                talloc_free(tmp_ctx);