eventscript: handle banning within the callbacks

author Rusty Russell <rusty@rustcorp.com.au>

Mon, 7 Dec 2009 13:18:57 +0000 (23:48 +1030)

committer Rusty Russell <rusty@rustcorp.com.au>

Mon, 7 Dec 2009 13:18:57 +0000 (23:48 +1030)
author Rusty Russell <rusty@rustcorp.com.au>
Mon, 7 Dec 2009 13:18:57 +0000 (23:48 +1030)
committer Rusty Russell <rusty@rustcorp.com.au>
Mon, 7 Dec 2009 13:18:57 +0000 (23:48 +1030)
diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c

index f4223772b6c8cef06f8d8aff0d7111fe31b2f18a..c7bbb48a2558cad202cd6a03f08fc9291ba70d06 100644 (file)
--- a/ctdb/server/ctdb_monitor.c
+++ b/ctdb/server/ctdb_monitor.c
@@ -123,6 +123,17 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
         rddata.dptr = (uint8_t *)&rd;
         rddata.dsize = sizeof(rd);
  
+       if (status == -ETIME) {
+               ctdb->event_script_timeouts++;
+
+               if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) {
+                       DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Making node unhealthy\n", ctdb->tunable.script_ban_count));
+               } else {
+                       /* We pretend this is OK. */
+                       status = 0;
+               }
+       }
+
         if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
                 DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n"));
                 node->flags |= NODE_FLAGS_UNHEALTHY;
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c

index da11bdb8e0ecfc912eeb19e87d77fd9028863e82..8568e8bbe07e59984f8255ab0dd9b31f6b7aa4c6 100644 (file)
--- a/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@@ -935,6 +935,9 @@ static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, vo
  
         if (status != 0) {
                 DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
+               if (status == -ETIME) {
+                       ctdb_ban_self(ctdb);
+               }
         }
  
         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
@@ -1210,6 +1213,9 @@ static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void
         if (status != 0) {
                 DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
                 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
+               if (status == -ETIME) {
+                       ctdb_ban_self(ctdb);
+               }
         }
  
         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c

index 7cb8383114b590f101e969c5d208c3ef6b95338e..7357c0ca54184e17d9f95e4851d55d75a3cb9573 100644 (file)
--- a/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@@ -128,6 +128,9 @@ static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
         struct ctdb_tcp_array *tcparray;
  
         if (status != 0) {
+               if (status == -ETIME) {
+                       ctdb_ban_self(ctdb);
+               }
                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
                         ctdb_addr_to_str(state->addr),
                         state->vnn->iface));
@@ -323,6 +326,10 @@ static void release_ip_callback(struct ctdb_context *ctdb, int status,
                 talloc_get_type(private_data, struct takeover_callback_state);
         TDB_DATA data;
  
+       if (status == -ETIME) {
+               ctdb_ban_self(ctdb);
+       }
+
         /* send a message to all clients of this node telling them
            that the cluster has been reconfigured and they should
            release any sockets on this IP */
diff --git a/ctdb/server/eventscript.c b/ctdb/server/eventscript.c

index a6d3950bada07b9b454c3b6e24527697eff914cb..5dca0cea5782db24f846ef44e1bf0bbe89e004cf 100644 (file)
--- a/ctdb/server/eventscript.c
+++ b/ctdb/server/eventscript.c
@@ -633,57 +633,24 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve
         DEBUG(DEBUG_ERR,("Event script timed out : %s %s count : %u  pid : %d\n",
                          call_names[state->call], state->options, ctdb->event_script_timeouts, state->child));
  
+       state->cb_status = -ETIME;
+
         if (kill(state->child, 0) != 0) {
                 DEBUG(DEBUG_ERR,("Event script child process already dead, errno %s(%d)\n", strerror(errno), errno));
                 state->child = 0;
-               state->cb_status = -ETIME;
-               talloc_free(state);
-               return;
-       }
-
-       if (state->call == CTDB_EVENT_MONITOR) {
-               /* if it is a monitor event, we allow it to "hang" a few times
-                  before we declare it a failure and ban ourself (and make
-                  ourself unhealthy)
-               */
-               DEBUG(DEBUG_ERR, (__location__ " eventscript for monitor event timedout.\n"));
-
-               ctdb->event_script_timeouts++;
-
-               if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) {
-                       DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Making node unhealthy\n", ctdb->tunable.script_ban_count));
-                       state->cb_status = -ETIME;
-               } else {
-                       state->cb_status = 0;
-               }
-       } else if (state->call == CTDB_EVENT_STARTUP) {
-               DEBUG(DEBUG_ERR, (__location__ " eventscript for startup event timedout.\n"));
-               state->cb_status = -ETIME;
-       } else {
-               /* if it is not a monitor or a startup event we ban ourself
-                  immediately
-               */
-               DEBUG(DEBUG_ERR, (__location__ " eventscript for NON-monitor/NON-startup event timedout. Immediately banning ourself for %d seconds\n", ctdb->tunable.recovery_ban_period));
-
-               ctdb_ban_self(ctdb);
-
-               state->cb_status = -ETIME;
         }
  
         if (state->call == CTDB_EVENT_MONITOR || state->call == CTDB_EVENT_STATUS) {
                 struct ctdb_monitor_script_status *script;
  
-               if (ctdb->current_monitor_status_ctx == NULL) {
-                       talloc_free(state);
-                       return;
-               }
+               if (ctdb->current_monitor_status_ctx != NULL) {
+                       script = ctdb->current_monitor_status_ctx->scripts;
+                       if (script != NULL) {
+                               script->status = state->cb_status;
+                       }
  
-               script = ctdb->current_monitor_status_ctx->scripts;
-               if (script != NULL) {
-                       script->status = state->cb_status;
+                       ctdb_control_event_script_finished(ctdb);
                 }
-
-               ctdb_control_event_script_finished(ctdb);
         }
  
         talloc_free(state);
@@ -930,6 +897,14 @@ int ctdb_event_script_args(struct ctdb_context *ctdb, enum ctdb_eventscript_call
  
         while (status.done == false && event_loop_once(ctdb->ev) == 0) /* noop */;
  
+       if (status.status == -ETIME) {
+               DEBUG(DEBUG_ERR, (__location__ " eventscript for '%s' timedout."
+                                 " Immediately banning ourself for %d seconds\n",
+                                 call_names[call],
+                                 ctdb->tunable.recovery_ban_period));
+               ctdb_ban_self(ctdb);
+       }
+
         return status.status;
  }
author	Rusty Russell <rusty@rustcorp.com.au>
	Mon, 7 Dec 2009 13:18:57 +0000 (23:48 +1030)
committer	Rusty Russell <rusty@rustcorp.com.au>
	Mon, 7 Dec 2009 13:18:57 +0000 (23:48 +1030)
ctdb/server/ctdb_monitor.c		patch \| blob \| history
ctdb/server/ctdb_recover.c		patch \| blob \| history
ctdb/server/ctdb_takeover.c		patch \| blob \| history
ctdb/server/eventscript.c		patch \| blob \| history