4 Copyright (C) Andrew Tridgell 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/wait.h"
23 #include "system/dir.h"
24 #include "system/locale.h"
25 #include "../include/ctdb_private.h"
26 #include "lib/events/events.h"
27 #include "../common/rb_tree.h"
/* Bookkeeping for the forked event script child: the script that is
 * currently being run and the time it was started.  The SIGTERM handler
 * reads this to report what timed out.
 */
static struct {
	struct timeval start;
	const char *script_running;
} child_state;
35 ctdbd sends us a SIGTERM when we should time out the current script
37 static void sigterm(int sig)
39 DEBUG(DEBUG_ERR,("Timed out running script '%s' after %.1f seconds\n",
40 child_state.script_running, timeval_elapsed(&child_state.start)));
41 /* all the child processes will be running in the same process group */
42 kill(-getpgrp(), SIGKILL);
/* State of one background event script run, owned by the parent daemon. */
struct ctdb_event_script_state {
	struct ctdb_context *ctdb;
	/* pid returned by fork(); signalled with SIGTERM by the destructor */
	pid_t child;
	/* invoked with the run's result (or -1/0 on timeout) */
	void (*callback)(struct ctdb_context *, int, void *);
	/* pipe: the child writes its result to fd[1], the parent reads fd[0] */
	int fd[2];
	/* opaque pointer handed back to callback */
	void *private_data;
	/* formatted argument string passed to every script */
	const char *options;
};
/* Status record for a single script within a monitor run.  Records are
 * kept in a singly linked list with the most recently started script at
 * the head.
 */
struct ctdb_monitor_script_status {
	struct ctdb_monitor_script_status *next;
	const char *name;
	struct timeval start;
	struct timeval finished;
	/* result reported when the script stopped */
	int32_t status;
	/* set to 1 when the run timed out while this script was the newest */
	int32_t timedout;
	/* accumulated STDERR output, NULL until the first output arrives */
	char *output;
};
/* Status of one complete monitor run: overall timing and result plus the
 * list of per-script records.
 */
struct ctdb_monitoring_status {
	struct timeval start;
	struct timeval finished;
	/* MONITOR_SCRIPT_OK or MONITOR_SCRIPT_TIMEOUT */
	int32_t status;
	struct ctdb_monitor_script_status *scripts;
};
74 /* called from ctdb_logging when we have received output on STDERR from
75 * one of the eventscripts
77 int ctdb_log_event_script_output(struct ctdb_context *ctdb, char *str, uint16_t len)
79 struct ctdb_monitoring_status *monitoring_status =
80 talloc_get_type(ctdb->script_monitoring_ctx,
81 struct ctdb_monitoring_status);
82 struct ctdb_monitor_script_status *script;
84 if (monitoring_status == NULL) {
88 script = monitoring_status->scripts;
93 if (script->output == NULL) {
94 script->output = talloc_asprintf(script, "%*.*s", len, len, str);
96 script->output = talloc_asprintf_append(script->output, "%*.*s", len, len, str);
102 /* called from the event script child process when we are starting a new
105 int32_t ctdb_control_event_script_init(struct ctdb_context *ctdb)
107 struct ctdb_monitoring_status *monitoring_status;
109 DEBUG(DEBUG_INFO, ("event script init called\n"));
110 if (ctdb->script_monitoring_ctx != NULL) {
111 talloc_free(ctdb->script_monitoring_ctx);
112 ctdb->script_monitoring_ctx = NULL;
115 monitoring_status = talloc_zero(ctdb, struct ctdb_monitoring_status);
116 if (monitoring_status == NULL) {
117 DEBUG(DEBUG_ERR, (__location__ " ERROR: Failed to talloc script_monitoring context\n"));
121 ctdb->script_monitoring_ctx = monitoring_status;
122 monitoring_status->start = timeval_current();
128 /* called from the event script child process when we are star running
131 int32_t ctdb_control_event_script_start(struct ctdb_context *ctdb, TDB_DATA indata)
133 const char *name = (const char *)indata.dptr;
134 struct ctdb_monitoring_status *monitoring_status =
135 talloc_get_type(ctdb->script_monitoring_ctx,
136 struct ctdb_monitoring_status);
137 struct ctdb_monitor_script_status *script;
139 DEBUG(DEBUG_INFO, ("event script start called : %s\n", name));
141 if (monitoring_status == NULL) {
142 DEBUG(DEBUG_ERR,(__location__ " script_status is NULL when starting to run script %s\n", name));
146 script = talloc_zero(monitoring_status, struct ctdb_monitor_script_status);
147 if (script == NULL) {
148 DEBUG(DEBUG_ERR,(__location__ " Failed to talloc ctdb_monitor_script_status for script %s\n", name));
152 script->next = monitoring_status->scripts;
153 script->name = talloc_strdup(script, name);
154 CTDB_NO_MEMORY(ctdb, script->name);
155 script->start = timeval_current();
156 monitoring_status->scripts = script;
161 /* called from the event script child process when we have finished running
164 int32_t ctdb_control_event_script_stop(struct ctdb_context *ctdb, TDB_DATA indata)
166 int32_t res = *((int32_t *)indata.dptr);
167 struct ctdb_monitoring_status *monitoring_status =
168 talloc_get_type(ctdb->script_monitoring_ctx,
169 struct ctdb_monitoring_status);
170 struct ctdb_monitor_script_status *script;
172 DEBUG(DEBUG_INFO, ("event script stop called : %d\n", (int)res));
174 if (monitoring_status == NULL) {
175 DEBUG(DEBUG_ERR,(__location__ " script_status is NULL when script finished.\n"));
179 script = monitoring_status->scripts;
180 if (script == NULL) {
181 DEBUG(DEBUG_ERR,(__location__ " script is NULL when the script had finished\n"));
185 script->finished = timeval_current();
186 script->status = res;
191 /* called from the event script child process when we have completed a
194 int32_t ctdb_control_event_script_finished(struct ctdb_context *ctdb)
196 struct ctdb_monitoring_status *monitoring_status =
197 talloc_get_type(ctdb->script_monitoring_ctx,
198 struct ctdb_monitoring_status);
200 DEBUG(DEBUG_INFO, ("event script finished called\n"));
202 if (monitoring_status == NULL) {
203 DEBUG(DEBUG_ERR,(__location__ " script_status is NULL when monitoring event finished\n"));
207 monitoring_status->finished = timeval_current();
208 monitoring_status->status = MONITOR_SCRIPT_OK;
209 if (ctdb->last_monitoring_ctx) {
210 talloc_free(ctdb->last_monitoring_ctx);
212 ctdb->last_monitoring_ctx = ctdb->script_monitoring_ctx;
213 ctdb->script_monitoring_ctx = NULL;
218 static struct ctdb_monitoring_wire *marshall_monitoring_scripts(TALLOC_CTX *mem_ctx, struct ctdb_monitoring_wire *monitoring_scripts, struct ctdb_monitor_script_status *script)
220 struct ctdb_monitoring_script_wire script_wire;
223 if (script == NULL) {
224 return monitoring_scripts;
226 monitoring_scripts = marshall_monitoring_scripts(mem_ctx, monitoring_scripts, script->next);
227 if (monitoring_scripts == NULL) {
231 bzero(&script_wire, sizeof(struct ctdb_monitoring_script_wire));
232 strncpy(script_wire.name, script->name, MAX_SCRIPT_NAME);
233 script_wire.start = script->start;
234 script_wire.finished = script->finished;
235 script_wire.status = script->status;
236 script_wire.timedout = script->timedout;
237 if (script->output != NULL) {
238 strncpy(script_wire.output, script->output, MAX_SCRIPT_OUTPUT);
241 size = talloc_get_size(monitoring_scripts);
242 monitoring_scripts = talloc_realloc_size(mem_ctx, monitoring_scripts, size + sizeof(struct ctdb_monitoring_script_wire));
243 if (monitoring_scripts == NULL) {
244 DEBUG(DEBUG_ERR,(__location__ " Failed to talloc_resize monitoring_scripts blob\n"));
248 memcpy(&monitoring_scripts->scripts[monitoring_scripts->num_scripts], &script_wire, sizeof(script_wire));
249 monitoring_scripts->num_scripts++;
251 return monitoring_scripts;
254 int32_t ctdb_control_get_event_script_status(struct ctdb_context *ctdb, TDB_DATA *outdata)
256 struct ctdb_monitoring_status *monitoring_status =
257 talloc_get_type(ctdb->last_monitoring_ctx,
258 struct ctdb_monitoring_status);
259 struct ctdb_monitoring_wire *monitoring_scripts;
261 if (monitoring_status == NULL) {
262 DEBUG(DEBUG_ERR,(__location__ " last_monitor_ctx is NULL when reading status\n"));
266 monitoring_scripts = talloc_size(outdata, offsetof(struct ctdb_monitoring_wire, scripts));
267 if (monitoring_scripts == NULL) {
268 DEBUG(DEBUG_ERR,(__location__ " failed to talloc monitoring_scripts structure\n"));
272 monitoring_scripts->num_scripts = 0;
273 monitoring_scripts = marshall_monitoring_scripts(outdata, monitoring_scripts, monitoring_status->scripts);
274 if (monitoring_scripts == NULL) {
275 DEBUG(DEBUG_ERR,(__location__ " Monitoring scritps is NULL. can not return data to client\n"));
279 outdata->dsize = talloc_get_size(monitoring_scripts);
280 outdata->dptr = (uint8_t *)monitoring_scripts;
286 run the event script - varargs version
287 this function is called and run in the context of a forked child
288 which allows it to do blocking calls such as system()
290 static int ctdb_event_script_v(struct ctdb_context *ctdb, const char *options)
295 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
304 if (!strcmp(options, "monitor")) {
308 if (is_monitor == 1) {
309 /* This is running in the forked child process. At this stage
310 * we want to switch from being a ctdb daemon into being a
311 * client and connect to the real local daemon.
313 if (switch_from_server_to_client(ctdb) != 0) {
314 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch eventscript child into client mode. shutting down.\n"));
318 if (ctdb_ctrl_event_script_init(ctdb) != 0) {
319 DEBUG(DEBUG_ERR,(__location__ " Failed to init event script monitoring\n"));
320 talloc_free(tmp_ctx);
325 if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
326 /* we guarantee that only some specifically allowed event scripts are run
328 const char *allowed_scripts[] = {"startrecovery", "shutdown", "releaseip" };
330 for (i=0;i<ARRAY_SIZE(allowed_scripts);i++) {
331 if (strncmp(options, allowed_scripts[i], strlen(allowed_scripts[i])) == 0) break;
333 if (i == ARRAY_SIZE(allowed_scripts)) {
334 DEBUG(DEBUG_ERR,("Refusing to run event scripts with option '%s' while in recovery\n",
336 talloc_free(tmp_ctx);
341 if (setpgid(0,0) != 0) {
342 DEBUG(DEBUG_ERR,("Failed to create process group for event scripts - %s\n",
344 talloc_free(tmp_ctx);
348 signal(SIGTERM, sigterm);
350 child_state.start = timeval_current();
351 child_state.script_running = "startup";
354 the service specific event scripts
356 if (stat(ctdb->event_script_dir, &st) != 0 &&
358 DEBUG(DEBUG_CRIT,("No event script directory found at '%s'\n", ctdb->event_script_dir));
359 talloc_free(tmp_ctx);
363 /* create a tree to store all the script names in */
364 tree = trbt_create(tmp_ctx, 0);
366 /* scan all directory entries and insert all valid scripts into the
369 dir = opendir(ctdb->event_script_dir);
371 DEBUG(DEBUG_CRIT,("Failed to open event script directory '%s'\n", ctdb->event_script_dir));
372 talloc_free(tmp_ctx);
377 while ((de=readdir(dir)) != NULL) {
382 namlen = strlen(de->d_name);
388 if (de->d_name[namlen-1] == '~') {
389 /* skip files emacs left behind */
393 if (de->d_name[2] != '.') {
397 if (sscanf(de->d_name, "%02u.", &num) != 1) {
401 /* Make sure the event script is executable */
402 str = talloc_asprintf(tree, "%s/%s", ctdb->event_script_dir, de->d_name);
403 if (stat(str, &st) != 0) {
404 DEBUG(DEBUG_ERR,("Could not stat event script %s. Ignoring this event script\n", str));
407 if (!(st.st_mode & S_IXUSR)) {
408 DEBUG(DEBUG_ERR,("Event script %s is not executable. Ignoring this event script\n", str));
413 /* store the event script in the tree */
414 d_name_dup = talloc_strdup(tree, de->d_name);
415 CTDB_NO_MEMORY(ctdb, d_name_dup);
416 trbt_insert32(tree, (num<<16)|count++, d_name_dup);
420 /* fetch the scripts from the tree one by one and execute
423 while ((script=trbt_findfirstarray32(tree, 1)) != NULL) {
424 cmdstr = talloc_asprintf(tmp_ctx, "%s/%s %s",
425 ctdb->event_script_dir,
427 CTDB_NO_MEMORY(ctdb, cmdstr);
429 DEBUG(DEBUG_INFO,("Executing event script %s\n",cmdstr));
431 child_state.start = timeval_current();
432 child_state.script_running = cmdstr;
434 if (is_monitor == 1) {
435 if (ctdb_ctrl_event_script_start(ctdb, script) != 0) {
436 DEBUG(DEBUG_ERR,(__location__ " Failed to start event script monitoring\n"));
437 talloc_free(tmp_ctx);
442 ret = system(cmdstr);
443 /* if the system() call was successful, translate ret into the
444 return code from the command
447 ret = WEXITSTATUS(ret);
449 if (is_monitor == 1) {
450 if (ctdb_ctrl_event_script_stop(ctdb, ret) != 0) {
451 DEBUG(DEBUG_ERR,(__location__ " Failed to stop event script monitoring\n"));
452 talloc_free(tmp_ctx);
457 /* return an error if the script failed */
459 DEBUG(DEBUG_ERR,("Event script %s failed with error %d\n", cmdstr, ret));
460 if (is_monitor == 1) {
461 if (ctdb_ctrl_event_script_finished(ctdb) != 0) {
462 DEBUG(DEBUG_ERR,(__location__ " Failed to finish event script monitoring\n"));
463 talloc_free(tmp_ctx);
468 talloc_free(tmp_ctx);
472 /* remove this script from the tree */
476 child_state.start = timeval_current();
477 child_state.script_running = "finished";
479 if (is_monitor == 1) {
480 if (ctdb_ctrl_event_script_finished(ctdb) != 0) {
481 DEBUG(DEBUG_ERR,(__location__ " Failed to finish event script monitoring\n"));
482 talloc_free(tmp_ctx);
487 talloc_free(tmp_ctx);
491 /* called when child is finished */
492 static void ctdb_event_script_handler(struct event_context *ev, struct fd_event *fde,
493 uint16_t flags, void *p)
495 struct ctdb_event_script_state *state =
496 talloc_get_type(p, struct ctdb_event_script_state);
497 void (*callback)(struct ctdb_context *, int, void *) = state->callback;
498 void *private_data = state->private_data;
499 struct ctdb_context *ctdb = state->ctdb;
502 read(state->fd[0], &rt, sizeof(rt));
504 talloc_set_destructor(state, NULL);
506 callback(ctdb, rt, private_data);
508 ctdb->event_script_timeouts = 0;
511 static void ctdb_ban_self(struct ctdb_context *ctdb, uint32_t ban_period)
514 struct ctdb_ban_info b;
518 b.ban_time = ban_period;
520 data.dptr = (uint8_t *)&b;
521 data.dsize = sizeof(b);
523 ret = ctdb_daemon_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
524 CTDB_SRVID_BAN_NODE, data);
526 DEBUG(DEBUG_ERR,(__location__ " Failed to send ban message\n"));
531 /* called when child times out */
532 static void ctdb_event_script_timeout(struct event_context *ev, struct timed_event *te,
533 struct timeval t, void *p)
535 struct ctdb_event_script_state *state = talloc_get_type(p, struct ctdb_event_script_state);
536 void (*callback)(struct ctdb_context *, int, void *) = state->callback;
537 void *private_data = state->private_data;
538 struct ctdb_context *ctdb = state->ctdb;
540 struct ctdb_monitoring_status *monitoring_status =
541 talloc_get_type(ctdb->script_monitoring_ctx,
542 struct ctdb_monitoring_status);
544 DEBUG(DEBUG_ERR,("Event script timed out : %s count : %u\n", state->options, ctdb->event_script_timeouts));
546 options = talloc_strdup(ctdb, state->options);
547 CTDB_NO_MEMORY_VOID(ctdb, options);
550 if (!strcmp(options, "monitor")) {
551 /* if it is a monitor event, we allow it to "hang" a few times
552 before we declare it a failure and ban ourself (and make
555 DEBUG(DEBUG_ERR, (__location__ " eventscript for monitor event timedout.\n"));
557 ctdb->event_script_timeouts++;
558 if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) {
559 ctdb->event_script_timeouts = 0;
560 DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Banning self for %d seconds\n", ctdb->tunable.script_ban_count, ctdb->tunable.recovery_ban_period));
561 ctdb_ban_self(ctdb, ctdb->tunable.recovery_ban_period);
562 callback(ctdb, -1, private_data);
564 callback(ctdb, 0, private_data);
566 } else if (!strcmp(options, "startup")) {
567 DEBUG(DEBUG_ERR, (__location__ " eventscript for startup event timedout.\n"));
568 callback(ctdb, -1, private_data);
570 /* if it is not a monitor event we ban ourself immediately */
571 DEBUG(DEBUG_ERR, (__location__ " eventscript for NON-monitor/NON-startup event timedout. Immediately banning ourself for %d seconds\n", ctdb->tunable.recovery_ban_period));
572 ctdb_ban_self(ctdb, ctdb->tunable.recovery_ban_period);
573 callback(ctdb, -1, private_data);
576 if (monitoring_status != NULL) {
577 struct ctdb_monitor_script_status *script;
579 script = monitoring_status->scripts;
580 if (script != NULL) {
581 script->timedout = 1;
583 monitoring_status->status = MONITOR_SCRIPT_TIMEOUT;
584 if (ctdb->last_monitoring_ctx) {
585 talloc_free(ctdb->last_monitoring_ctx);
586 ctdb->last_monitoring_ctx = ctdb->script_monitoring_ctx;
587 ctdb->script_monitoring_ctx = NULL;
591 talloc_free(options);
595 destroy a running event script
597 static int event_script_destructor(struct ctdb_event_script_state *state)
599 DEBUG(DEBUG_ERR,(__location__ " Sending SIGTERM to child pid:%d\n", state->child));
600 kill(state->child, SIGTERM);
605 run the event script in the background, calling the callback when
608 static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
609 struct timeval timeout,
611 void (*callback)(struct ctdb_context *, int, void *),
613 const char *fmt, va_list ap)
615 struct ctdb_event_script_state *state;
618 state = talloc(mem_ctx, struct ctdb_event_script_state);
619 CTDB_NO_MEMORY(ctdb, state);
622 state->callback = callback;
623 state->private_data = private_data;
624 state->options = talloc_vasprintf(state, fmt, ap);
625 CTDB_NO_MEMORY(ctdb, state->options);
627 ret = pipe(state->fd);
633 state->child = fork();
635 if (state->child == (pid_t)-1) {
642 if (state->child == 0) {
646 set_close_on_exec(state->fd[1]);
648 rt = ctdb_event_script_v(ctdb, state->options);
649 while ((ret = write(state->fd[1], &rt, sizeof(rt))) != sizeof(rt)) {
655 talloc_set_destructor(state, event_script_destructor);
659 event_add_fd(ctdb->ev, state, state->fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
660 ctdb_event_script_handler, state);
662 if (!timeval_is_zero(&timeout)) {
663 event_add_timed(ctdb->ev, state, timeout, ctdb_event_script_timeout, state);
665 DEBUG(DEBUG_ERR, (__location__ " eventscript %s called with no timeout\n", state->options));
673 run the event script in the background, calling the callback when
676 int ctdb_event_script_callback(struct ctdb_context *ctdb,
677 struct timeval timeout,
679 void (*callback)(struct ctdb_context *, int, void *),
681 const char *fmt, ...)
687 ret = ctdb_event_script_callback_v(ctdb, timeout, mem_ctx, callback, private_data, fmt, ap);
/* Result slot shared between ctdb_event_script() and its completion
 * callback.
 */
struct callback_status {
	/* result passed to the callback */
	int status;
	/* set once the callback has been invoked */
	bool done;
};
700 called when ctdb_event_script() finishes
702 static void event_script_callback(struct ctdb_context *ctdb, int status, void *private_data)
704 struct callback_status *s = (struct callback_status *)private_data;
710 run the event script, waiting for it to complete. Used when the caller doesn't want to
711 continue till the event script has finished.
713 int ctdb_event_script(struct ctdb_context *ctdb, const char *fmt, ...)
717 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
718 struct callback_status status;
721 ret = ctdb_event_script_callback_v(ctdb, timeval_zero(), tmp_ctx, event_script_callback, &status, fmt, ap);
725 talloc_free(tmp_ctx);
732 while (status.done == false && event_loop_once(ctdb->ev) == 0) /* noop */;
734 talloc_free(tmp_ctx);
736 return status.status;
/* State kept while a client-forced eventscript run is in flight. */
struct eventscript_callback_state {
	/* the control request to answer when the run completes */
	struct ctdb_req_control *c;
};
745 called when takeip event finishes
747 static void run_eventscripts_callback(struct ctdb_context *ctdb, int status,
750 struct eventscript_callback_state *state =
751 talloc_get_type(private_data, struct eventscript_callback_state);
753 ctdb_enable_monitoring(ctdb);
756 DEBUG(DEBUG_ERR,(__location__ " Failed to forcibly run eventscripts\n"));
757 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
762 /* the control succeeded */
763 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
769 A control to force running of the eventscripts from the ctdb client tool
771 int32_t ctdb_run_eventscripts(struct ctdb_context *ctdb,
772 struct ctdb_req_control *c,
773 TDB_DATA indata, bool *async_reply)
776 struct eventscript_callback_state *state;
778 /* kill off any previous invokations of forced eventscripts */
779 if (ctdb->eventscripts_ctx) {
780 talloc_free(ctdb->eventscripts_ctx);
782 ctdb->eventscripts_ctx = talloc_new(ctdb);
783 CTDB_NO_MEMORY(ctdb, ctdb->eventscripts_ctx);
785 state = talloc(ctdb->eventscripts_ctx, struct eventscript_callback_state);
786 CTDB_NO_MEMORY(ctdb, state);
788 state->c = talloc_steal(state, c);
790 DEBUG(DEBUG_NOTICE,("Forced running of eventscripts with arguments %s\n", indata.dptr));
792 if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
793 DEBUG(DEBUG_ERR, (__location__ " Aborted running eventscript \"%s\" while in RECOVERY mode\n", indata.dptr));
797 ctdb_disable_monitoring(ctdb);
799 ret = ctdb_event_script_callback(ctdb,
800 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
801 state, run_eventscripts_callback, state,
802 (const char *)indata.dptr);
805 ctdb_enable_monitoring(ctdb);
806 DEBUG(DEBUG_ERR,(__location__ " Failed to run eventscripts with arguments %s\n", indata.dptr));
811 /* tell ctdb_control.c that we will be replying asynchronously */