Break this debug and data collection out into an external script to make it easier to modify what data we need to collect.
For now we only collect a pstree so we can see which part of the script hung.
S1037271
if [ -f doc/ltdbtool.1 ]; then ${INSTALLCMD} -m 644 doc/ltdbtool.1 $(DESTDIR)$(mandir)/man1; fi
if [ -f doc/ping_pong.1 ];then ${INSTALLCMD} -m 644 doc/ping_pong.1 $(DESTDIR)$(mandir)/man1; fi
if [ ! -f $(DESTDIR)$(etcdir)/ctdb/notify.sh ];then ${INSTALLCMD} -m 755 config/notify.sh $(DESTDIR)$(etcdir)/ctdb; fi
+ ${INSTALLCMD} -m 755 config/debug-hung-script.sh $(DESTDIR)$(etcdir)/ctdb
if [ ! -f $(DESTDIR)$(etcdir)/ctdb/ctdb-crash-cleanup.sh ];then ${INSTALLCMD} -m 755 config/ctdb-crash-cleanup.sh $(DESTDIR)$(etcdir)/ctdb; fi
install_pmda:
maybe_set "--event-script-dir" "$CTDB_EVENT_SCRIPT_DIR"
maybe_set "--transport" "$CTDB_TRANSPORT"
maybe_set "-d" "$CTDB_DEBUGLEVEL"
+ maybe_set "--debug-hung-script" "$CTDB_DEBUG_HUNG_SCRIPT"
maybe_set "--notification-script" "$CTDB_NOTIFY_SCRIPT"
maybe_set "--start-as-disabled" "$CTDB_START_AS_DISABLED" "yes"
maybe_set "--start-as-stopped " "$CTDB_START_AS_STOPPED" "yes"
# a script to run when node health changes
# CTDB_NOTIFY_SCRIPT=/etc/ctdb/notify.sh
+# a script to collect data when an eventscript has hung
+# CTDB_DEBUG_HUNG_SCRIPT=/etc/ctdb/debug-hung-script.sh
+
# the directory to put the local ctdb database files in
# defaults to /var/ctdb
# CTDB_DBDIR=/var/ctdb
--- /dev/null
+#!/bin/sh
+
+. $CTDB_BASE/functions
+loadconfig
+
+echo "Pstree output for the hung script:"
+pstree -p -a $1
bool do_setsched;
void *saved_scheduler_param;
const char *event_script_dir;
+ const char *debug_hung_script;
const char *notification_script;
const char *default_public_interface;
pid_t ctdbd_pid;
const char *public_addresses_file;
struct trbt_tree *child_processes;
+ TALLOC_CTX *debug_hung_script_ctx;
};
struct ctdb_db_context {
const char *ip);
int ctdb_set_event_script(struct ctdb_context *ctdb, const char *script);
int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir);
+int ctdb_set_debug_hung_script(struct ctdb_context *ctdb, const char *script);
int ctdb_set_notification_script(struct ctdb_context *ctdb, const char *script);
void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn);
int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap);
%config(noreplace) %{_sysconfdir}/sysconfig/ctdb
%config(noreplace) %{_sysconfdir}/ctdb/notify.sh
+%config(noreplace) %{_sysconfdir}/ctdb/debug-hung-script.sh
%config(noreplace) %{_sysconfdir}/ctdb/ctdb-crash-cleanup.sh
%config(noreplace) %{_sysconfdir}/ctdb/functions
%attr(755,root,root) %{initdir}/ctdb
{ "MonitorInterval", 15, offsetof(struct ctdb_tunable, monitor_interval), false },
{ "TickleUpdateInterval",20, offsetof(struct ctdb_tunable, tickle_update_interval), false },
{ "EventScriptTimeout", 30, offsetof(struct ctdb_tunable, script_timeout), false },
- { "EventScriptTimeoutCount", 1, offsetof(struct ctdb_tunable, script_timeout_count), false },
+ { "EventScriptTimeoutCount", 20, offsetof(struct ctdb_tunable, script_timeout_count), false },
{ "EventScriptUnhealthyOnTimeout", 0, offsetof(struct ctdb_tunable, script_unhealthy_on_timeout), true },
{ "RecoveryGracePeriod", 120, offsetof(struct ctdb_tunable, recovery_grace_period), false },
{ "RecoveryBanPeriod", 300, offsetof(struct ctdb_tunable, recovery_ban_period), false },
const char *public_address_list;
const char *event_script_dir;
const char *notification_script;
+ const char *debug_hung_script;
const char *logfile;
const char *recovery_lock_file;
const char *db_dir;
.db_dir_persistent = VARDIR "/ctdb/persistent",
.db_dir_state = VARDIR "/ctdb/state",
.script_log_level = DEBUG_ERR,
+ .debug_hung_script = "/etc/ctdb/debug-hung-script.sh",
};
int script_log_level;
{ "nlist", 0, POPT_ARG_STRING, &options.nlist, 0, "node list file", "filename" },
{ "node-ip", 0, POPT_ARG_STRING, &options.node_ip, 0, "node ip", "ip-address"},
{ "notification-script", 0, POPT_ARG_STRING, &options.notification_script, 0, "notification script", "filename" },
+ { "debug-hung-script", 0, POPT_ARG_STRING, &options.debug_hung_script, 0, "debug script for hung eventscripts", "filename" },
{ "listen", 0, POPT_ARG_STRING, &options.myaddress, 0, "address to listen on", "address" },
{ "transport", 0, POPT_ARG_STRING, &options.transport, 0, "protocol transport", NULL },
{ "dbdir", 0, POPT_ARG_STRING, &options.db_dir, 0, "directory for the tdb files", NULL },
}
}
+ if (options.debug_hung_script != NULL) {
+ ret = ctdb_set_debug_hung_script(ctdb, options.debug_hung_script);
+ }
+ if (ret == -1) {
+ DEBUG(DEBUG_ALERT,("Unable to setup script to debug hung eventscripts\n"));
+ exit(1);
+ }
+
ctdb->valgrinding = options.valgrinding;
if (options.valgrinding || options.nosetsched) {
ctdb->do_setsched = 0;
}
}
-static void debug_timeout(struct ctdb_event_script_state *state)
+/*
+ setup the script to debug hung eventscripts
+
+ Stores a copy of the path "script", made with talloc_strdup() on the
+ ctdb context, in ctdb->debug_hung_script, and returns 0 once the copy
+ is stored.
+
+ NOTE(review): CTDB_NO_MEMORY is defined elsewhere; presumably it logs
+ and returns -1 when talloc_strdup() yields NULL - confirm. The caller
+ that passes options.debug_hung_script exits when this returns -1.
+*/
+int ctdb_set_debug_hung_script(struct ctdb_context *ctdb, const char *script)
+{
+ ctdb->debug_hung_script = talloc_strdup(ctdb, script);
+ CTDB_NO_MEMORY(ctdb, ctdb->debug_hung_script);
+ return 0;
+}
+
+static void ctdb_run_debug_hung_script(struct ctdb_context *ctdb, struct ctdb_event_script_state *state)
{
struct ctdb_script_wire *current = get_current_script(state);
char *cmd;
pid_t pid;
- time_t t;
- char tbuf[100], buf[200];
- cmd = child_command_string(state->ctdb, state,
+ cmd = child_command_string(ctdb, state,
state->from_user, current->name,
state->call, state->options);
CTDB_NO_MEMORY_VOID(state->ctdb, cmd);
cmd, timeval_elapsed(&current->start), state->child));
talloc_free(cmd);
- t = time(NULL);
- strftime(tbuf, sizeof(tbuf)-1, "%Y%m%d%H%M%S", localtime(&t));
- sprintf(buf, "{ pstree -p; cat /proc/locks; ls -li /var/ctdb/ /var/ctdb/persistent; }"
- " >/tmp/ctdb.event.%s.%d", tbuf, getpid());
-
- pid = ctdb_fork(state->ctdb);
- if (pid == 0) {
- system(buf);
- /* Now we can kill the child */
+ talloc_free(ctdb->debug_hung_script_ctx);
+ ctdb->debug_hung_script_ctx = talloc_new(ctdb);
+ if (!ctdb_fork_with_logging(ctdb->debug_hung_script_ctx, ctdb, "Hung script", NULL, NULL, &pid)) {
+ DEBUG(DEBUG_ERR,("Failed to fork a child process with logging to track hung event script\n"));
ctdb_kill(state->ctdb, state->child, SIGTERM);
- exit(0);
+ return;
}
if (pid == -1) {
DEBUG(DEBUG_ERR,("Fork for debug script failed : %s\n",
strerror(errno)));
- } else {
- DEBUG(DEBUG_ERR,("Logged timedout eventscript : %s\n", buf));
- /* Don't kill child until timeout done. */
- state->child = 0;
+ ctdb_kill(state->ctdb, state->child, SIGTERM);
+ return;
+ }
+ if (pid == 0) {
+ struct stat st;
+ char buf[200];
+
+ if (stat(ctdb->debug_hung_script, &st) != 0) {
+ DEBUG(DEBUG_ERR,("Failed to stat the script to debug hung eventscript. Is it not installed correctly? (script:%s)\n", ctdb->debug_hung_script));
+ ctdb_kill(state->ctdb, state->child, SIGTERM);
+ _exit(0);
+ }
+ if (!(st.st_mode & S_IXUSR)) {
+ DEBUG(DEBUG_DEBUG,("Debug script %s is not executable.\n", ctdb->debug_hung_script));
+ ctdb_kill(state->ctdb, state->child, SIGTERM);
+ _exit(0);
+ }
+
+ sprintf(buf, "%s %d", ctdb->debug_hung_script, state->child);
+ system(buf);
+
+ /* Now we can kill the child */
+ ctdb_kill(state->ctdb, state->child, SIGTERM);
+ _exit(0);
}
+
+ /* Don't kill child until timeout done. */
+ state->child = 0;
}
/* called when child times out */
case CTDB_EVENT_TAKE_IP:
case CTDB_EVENT_RELEASE_IP:
case CTDB_EVENT_STOPPED:
- case CTDB_EVENT_MONITOR:
case CTDB_EVENT_STATUS:
state->scripts->scripts[state->current].status = 0;
DEBUG(DEBUG_ERR,("Ignoring hung script for %s call %d\n", state->options, state->call));
+ ctdb_run_debug_hung_script(ctdb, state);
break;
default:
state->scripts->scripts[state->current].status = -ETIME;
- debug_timeout(state);
+ ctdb_run_debug_hung_script(ctdb, state);
}
talloc_free(state);