}
}
- /* stop any monitoring */
+ /*
+ (Temporarily) Disabling monitoring will stop the monitor event scripts
+ from running but node health checks will still occur.
+
+ Sets ctdb->monitoring_mode to CTDB_MONITORING_DISABLED and logs the
+ change through DEBUG().
+ */
+ void ctdb_disable_monitoring(struct ctdb_context *ctdb)
+ {
+ ctdb->monitoring_mode = CTDB_MONITORING_DISABLED;
- DEBUG(1,("Monitoring has been disabled\n"));
++ DEBUG(2,("Monitoring has been disabled\n"));
+ }
+
+ /*
+ Re-enable running monitor events after they have been disabled.
+
+ Sets ctdb->monitoring_mode to CTDB_MONITORING_ACTIVE and logs the
+ change through DEBUG().
+ */
+ void ctdb_enable_monitoring(struct ctdb_context *ctdb)
+ {
+ ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE;
- DEBUG(1,("Monitoring has been enabled\n"));
++ DEBUG(2,("Monitoring has been enabled\n"));
+ }
+
+ /* stop any monitoring
+ this should only be done when shutting down the daemon
+ */
void ctdb_stop_monitoring(struct ctdb_context *ctdb)
{
talloc_free(ctdb->monitor_context);
node->flags &= ~m->clear;
if (node->flags == old_flags) {
- /* no change */
- DEBUG(0, ("Control modflags on node %u - Unchanged - flags 0x%x\n", ctdb->pnn, node->flags));
++ DEBUG(2, ("Control modflags on node %u - Unchanged - flags 0x%x\n", ctdb->pnn, node->flags));
return 0;
}
{
struct ctdb_set_recmode_state *state = talloc_get_type(p, struct ctdb_set_recmode_state);
- ctdb_start_monitoring(ctdb);
++ ctdb_enable_monitoring(state->ctdb);
+
if (status == 0) {
ctdb->recovery_mode = state->recmode;
} else {
state,
ctdb_recovered_callback,
state, "recovered");
- ctdb_enable_monitoring(state->ctdb);
-
+
if (ret != 0) {
++ ctdb_enable_monitoring(state->ctdb);
++
ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "failed to run eventscript from set_recmode");
talloc_free(state);
return;
}
}
- /* either a new node is the culprit, or we've decide to forgive them */
+ /*
+ remember the trouble maker
+
+ If culprit differs from rec->last_culprit, or
+ timeval_elapsed(&rec->first_recover_time) exceeds
+ ctdb->tunable.recovery_grace_period, the culprit is recorded afresh:
+ last_culprit is set to culprit, first_recover_time is reset to the
+ current time and culprit_counter restarts at 0.
+ In every case culprit_counter is then incremented by one.
+ */
+ static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
+ {
+ struct ctdb_context *ctdb = rec->ctdb;
+
+ if (rec->last_culprit != culprit ||
+ timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
+ DEBUG(0,("New recovery culprit %u\n", culprit));
++ /* either a new node is the culprit, or we've decided to forgive them */
+ rec->last_culprit = culprit;
+ rec->first_recover_time = timeval_current();
+ rec->culprit_counter = 0;
+ }
+ rec->culprit_counter++;
+ }
/*
- update our local flags from all remote connected nodes.
+ Update our local flags from all remote connected nodes.
+ This is only run when we are, or we believe we are, the recovery master
*/
- static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+ static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
{
int j;
+ struct ctdb_context *ctdb = rec->ctdb;
TALLOC_CTX *mem_ctx = talloc_new(ctdb);
/* get the nodemap for all active remote nodes and verify
char *ip = inet_ntoa(state->sin->sin_addr);
struct ctdb_tcp_array *tcparray;
- ctdb_start_monitoring(ctdb);
++ ctdb_enable_monitoring(ctdb);
+
if (status != 0) {
DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
ip, state->vnn->iface));
vnn->iface,
inet_ntoa(pip->sin.sin_addr),
vnn->public_netmask_bits);
- ctdb_enable_monitoring(ctdb);
+
if (ret != 0) {
++ ctdb_enable_monitoring(ctdb);
DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
inet_ntoa(pip->sin.sin_addr), vnn->iface));
talloc_free(state);
char *ip = inet_ntoa(state->sin->sin_addr);
TDB_DATA data;
- ctdb_start_monitoring(ctdb);
++ ctdb_enable_monitoring(ctdb);
+
/* send a message to all clients of this node telling them
that the cluster has been reconfigured and they should
release any sockets on this IP */
vnn->iface,
inet_ntoa(pip->sin.sin_addr),
vnn->public_netmask_bits);
- ctdb_enable_monitoring(ctdb);
-
if (ret != 0) {
++ ctdb_enable_monitoring(ctdb);
++
DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n",
inet_ntoa(pip->sin.sin_addr), vnn->iface));
talloc_free(state);