*/
struct ctdb_recoverd {
struct ctdb_context *ctdb;
- int rec_file_fd;
uint32_t recmaster;
uint32_t num_active;
uint32_t num_connected;
/*
run the "recovered" eventscript on all nodes
*/
-static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
{
TALLOC_CTX *tmp_ctx;
+ uint32_t *nodes;
tmp_ctx = talloc_new(ctdb);
CTDB_NO_MEMORY(ctdb, tmp_ctx);
+ nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
- list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
- CONTROL_TIMEOUT(), false, tdb_null, NULL) != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event. Recovery failed.\n"));
+ nodes,
+ CONTROL_TIMEOUT(), false, tdb_null,
+ NULL, NULL,
+ NULL) != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
+
talloc_free(tmp_ctx);
return -1;
}
return 0;
}
+/*
+ remember the trouble maker: record which node caused this recovery.
+ A new culprit, or one whose last offence is older than the
+ recovery_grace_period tunable, resets the counter; every call then
+ bumps culprit_counter, which do_recovery() compares against
+ 2*nodemap->num to decide when a node should be banned.
+ */
+static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
+{
+ struct ctdb_context *ctdb = rec->ctdb;
+
+ if (rec->last_culprit != culprit ||
+ timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
+ DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
+ /* either a new node is the culprit, or we've decided to forgive them */
+ rec->last_culprit = culprit;
+ rec->first_recover_time = timeval_current();
+ rec->culprit_counter = 0;
+ }
+ rec->culprit_counter++;
+}
+
+/*
+ remember the trouble maker, as ctdb_set_culprit() does, but add
+ "count" to the culprit counter instead of incrementing it by one.
+ Used when one failure should weigh as heavily as several (e.g. a
+ failed database pull is charged nodemap->num times).
+ */
+static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
+{
+ struct ctdb_context *ctdb = rec->ctdb;
+
+ if (rec->last_culprit != culprit ||
+ timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
+ DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
+ /* either a new node is the culprit, or we've decided to forgive them */
+ rec->last_culprit = culprit;
+ rec->first_recover_time = timeval_current();
+ rec->culprit_counter = 0;
+ }
+ rec->culprit_counter += count;
+}
+
+/* this callback is called for every node that failed to execute the
+ start recovery event; it charges that node as the recovery culprit,
+ so a node that repeatedly breaks the startrecovery event accumulates
+ culprit counts and can eventually be banned
+*/
+static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+ DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
+
+ ctdb_set_culprit(rec, node_pnn);
+}
+
/*
run the "startrecovery" eventscript on all nodes
*/
-static int run_startrecovery_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
{
TALLOC_CTX *tmp_ctx;
+ uint32_t *nodes;
+ struct ctdb_context *ctdb = rec->ctdb;
tmp_ctx = talloc_new(ctdb);
CTDB_NO_MEMORY(ctdb, tmp_ctx);
+ nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
- list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
- CONTROL_TIMEOUT(), false, tdb_null, NULL) != 0) {
+ nodes,
+ CONTROL_TIMEOUT(), false, tdb_null,
+ NULL,
+ startrecovery_fail_callback,
+ rec) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
talloc_free(tmp_ctx);
return -1;
return 0;
}
-static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata)
+static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
- DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %d %p\n", outdata.dsize, outdata.dptr));
+ DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
return;
}
- ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
+ if (node_pnn < ctdb->num_nodes) {
+ ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
+ }
}
/*
CTDB_NO_MEMORY(ctdb, tmp_ctx);
nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
-
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
nodes, CONTROL_TIMEOUT(),
- false, tdb_null, async_getcap_callback) != 0) {
+ false, tdb_null,
+ async_getcap_callback, NULL,
+ NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
talloc_free(tmp_ctx);
return -1;
tmp_ctx = talloc_new(ctdb);
CTDB_NO_MEMORY(ctdb, tmp_ctx);
- nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
-
/* freeze all nodes */
+ nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
if (rec_mode == CTDB_RECOVERY_ACTIVE) {
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
nodes, CONTROL_TIMEOUT(),
- false, tdb_null, NULL) != 0) {
+ false, tdb_null,
+ NULL, NULL,
+ NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
talloc_free(tmp_ctx);
return -1;
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
nodes, CONTROL_TIMEOUT(),
- false, data, NULL) != 0) {
+ false, data,
+ NULL, NULL,
+ NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
talloc_free(tmp_ctx);
return -1;
}
- if (rec_mode == CTDB_RECOVERY_NORMAL) {
- if (ctdb_client_async_control(ctdb, CTDB_CONTROL_THAW,
- nodes, CONTROL_TIMEOUT(),
- false, tdb_null, NULL) != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to thaw nodes. Recovery failed.\n"));
- talloc_free(tmp_ctx);
- return -1;
- }
- }
-
talloc_free(tmp_ctx);
return 0;
}
{
TDB_DATA data;
TALLOC_CTX *tmp_ctx;
+ uint32_t *nodes;
tmp_ctx = talloc_new(ctdb);
CTDB_NO_MEMORY(ctdb, tmp_ctx);
data.dsize = sizeof(uint32_t);
data.dptr = (unsigned char *)&pnn;
+ nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
- list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
- CONTROL_TIMEOUT(), false, data, NULL) != 0) {
+ nodes,
+ CONTROL_TIMEOUT(), false, data,
+ NULL, NULL,
+ NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
talloc_free(tmp_ctx);
return -1;
{
int ret;
TDB_DATA outdata;
- struct ctdb_control_pulldb_reply *reply;
+ struct ctdb_marshall_buffer *reply;
struct ctdb_rec_data *rec;
int i;
TALLOC_CTX *tmp_ctx = talloc_new(recdb);
return -1;
}
- reply = (struct ctdb_control_pulldb_reply *)outdata.dptr;
+ reply = (struct ctdb_marshall_buffer *)outdata.dptr;
- if (outdata.dsize < offsetof(struct ctdb_control_pulldb_reply, data)) {
+ if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
talloc_free(tmp_ctx);
return -1;
/*
pull all the remote database contents into the recdb
*/
-static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
+static int pull_remote_database(struct ctdb_context *ctdb,
+ struct ctdb_recoverd *rec,
+ struct ctdb_node_map *nodemap,
struct tdb_wrap *recdb, uint32_t dbid)
{
int j;
if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
nodemap->nodes[j].pnn));
+ ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
return -1;
}
}
/*
update flags on all active nodes
*/
-static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
{
- int i;
- for (i=0;i<nodemap->num;i++) {
- struct ctdb_node_flag_change c;
- TDB_DATA data;
-
- c.pnn = nodemap->nodes[i].pnn;
- c.old_flags = nodemap->nodes[i].flags;
- c.new_flags = nodemap->nodes[i].flags;
-
- data.dptr = (uint8_t *)&c;
- data.dsize = sizeof(c);
-
- ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
- CTDB_SRVID_NODE_FLAGS_CHANGED, data);
+ int ret;
+ ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+ return -1;
}
+
return 0;
}
-
/*
ensure all nodes have the same vnnmap we do
*/
struct ctdb_recoverd *rec;
uint32_t srcnode;
struct ctdb_db_context *ctdb_db;
- struct ctdb_control_pulldb_reply *recs;
+ struct ctdb_marshall_buffer *recs;
struct ctdb_rec_data *r;
};
*/
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
- struct vacuum_info *v = talloc_get_type(state->async.private, struct vacuum_info);
+ struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
talloc_free(state);
vacuum_fetch_next(v);
}
return;
}
state->async.fn = vacuum_fetch_callback;
- state->async.private = v;
+ state->async.private_data = v;
return;
}
TDB_DATA data, void *private_data)
{
struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
- struct ctdb_control_pulldb_reply *recs;
+ struct ctdb_marshall_buffer *recs;
int ret, i;
TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
const char *name;
uint32_t srcnode;
struct vacuum_info *v;
- recs = (struct ctdb_control_pulldb_reply *)data.dptr;
+ recs = (struct ctdb_marshall_buffer *)data.dptr;
r = (struct ctdb_rec_data *)&recs->data[0];
if (recs->count == 0) {
+ talloc_free(tmp_ctx);
return;
}
for (v=rec->vacuum_info;v;v=v->next) {
if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
/* we're already working on records from this node */
+ talloc_free(tmp_ctx);
return;
}
}
}
/* attach to it */
- ctdb_db = ctdb_attach(ctdb, name, persistent);
+ ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
if (ctdb_db == NULL) {
DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
talloc_free(tmp_ctx);
v = talloc_zero(rec, struct vacuum_info);
if (v == NULL) {
DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
+ talloc_free(tmp_ctx);
return;
}
if (v->recs == NULL) {
DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
talloc_free(v);
+ talloc_free(tmp_ctx);
return;
}
v->r = (struct ctdb_rec_data *)&v->recs->data[0];
talloc_set_destructor(v, vacuum_info_destructor);
vacuum_fetch_next(v);
+ talloc_free(tmp_ctx);
}
}
}
-/*
- remember the trouble maker
- */
-static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
-{
- struct ctdb_context *ctdb = rec->ctdb;
-
- if (rec->last_culprit != culprit ||
- timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
- DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
- /* either a new node is the culprit, or we've decided to forgive them */
- rec->last_culprit = culprit;
- rec->first_recover_time = timeval_current();
- rec->culprit_counter = 0;
- }
- rec->culprit_counter++;
-}
-
/*
Update our local flags from all remote connected nodes.
This is only run when we are or we belive we are the recovery master
return MONITOR_FAILED;
}
if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
- struct ctdb_node_flag_change c;
- TDB_DATA data;
+ int ban_changed = (nodemap->nodes[j].flags ^ remote_nodemap->nodes[j].flags) & NODE_FLAGS_BANNED;
+
+ if (ban_changed) {
+ DEBUG(DEBUG_NOTICE,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n",
+ nodemap->nodes[j].pnn,
+ remote_nodemap->nodes[j].flags,
+ nodemap->nodes[j].flags));
+ }
/* We should tell our daemon about this so it
updates its flags or else we will log the same
Since we are the recovery master we can just as
well update the flags on all nodes.
*/
- c.pnn = nodemap->nodes[j].pnn;
- c.old_flags = nodemap->nodes[j].flags;
- c.new_flags = remote_nodemap->nodes[j].flags;
-
- data.dptr = (uint8_t *)&c;
- data.dsize = sizeof(c);
-
- ctdb_send_message(ctdb, ctdb->pnn,
- CTDB_SRVID_NODE_FLAGS_CHANGED,
- data);
+ ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+ return -1;
+ }
/* Update our local copy of the flags in the recovery
daemon.
/* If the BANNED flag has changed for the node
this is a good reason to do a new election.
*/
- if ((c.old_flags ^ c.new_flags) & NODE_FLAGS_BANNED) {
- DEBUG(DEBUG_NOTICE,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n",
- nodemap->nodes[j].pnn, c.new_flags,
- c.old_flags));
+ if (ban_changed) {
talloc_free(mem_ctx);
return MONITOR_ELECTION_NEEDED;
}
{
char *name;
struct tdb_wrap *recdb;
+ unsigned tdb_flags;
/* open up the temporary recovery database */
name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
return NULL;
}
unlink(name);
+
+ tdb_flags = TDB_NOLOCK;
+ if (!ctdb->do_setsched) {
+ tdb_flags |= TDB_NOMMAP;
+ }
+
recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
- TDB_NOLOCK, O_RDWR|O_CREAT|O_EXCL, 0600);
+ tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
if (recdb == NULL) {
DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
}
*/
struct recdb_data {
struct ctdb_context *ctdb;
- struct ctdb_control_pulldb_reply *recdata;
+ struct ctdb_marshall_buffer *recdata;
uint32_t len;
bool failed;
};
struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
{
struct recdb_data params;
- struct ctdb_control_pulldb_reply *recdata;
+ struct ctdb_marshall_buffer *recdata;
TDB_DATA outdata;
TALLOC_CTX *tmp_ctx;
+ uint32_t *nodes;
tmp_ctx = talloc_new(ctdb);
CTDB_NO_MEMORY(ctdb, tmp_ctx);
- recdata = talloc_zero(recdb, struct ctdb_control_pulldb_reply);
+ recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
CTDB_NO_MEMORY(ctdb, recdata);
recdata->db_id = dbid;
params.ctdb = ctdb;
params.recdata = recdata;
- params.len = offsetof(struct ctdb_control_pulldb_reply, data);
+ params.len = offsetof(struct ctdb_marshall_buffer, data);
params.failed = false;
if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
outdata.dptr = (void *)recdata;
outdata.dsize = params.len;
+ nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
- list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
- CONTROL_TIMEOUT(), false, outdata, NULL) != 0) {
+ nodes,
+ CONTROL_TIMEOUT(), false, outdata,
+ NULL, NULL,
+ NULL) != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
talloc_free(recdata);
talloc_free(tmp_ctx);
struct ctdb_context *ctdb = rec->ctdb;
TDB_DATA data;
struct ctdb_control_wipe_database w;
+ uint32_t *nodes;
recdb = create_recdb(ctdb, mem_ctx);
if (recdb == NULL) {
}
/* pull all remote databases onto the recdb */
- ret = pull_remote_database(ctdb, nodemap, recdb, dbid);
+ ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
return -1;
data.dptr = (void *)&w;
data.dsize = sizeof(w);
+ nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
- list_of_active_nodes(ctdb, nodemap, recdb, true),
- CONTROL_TIMEOUT(), false, data, NULL) != 0) {
+ nodes,
+ CONTROL_TIMEOUT(), false, data,
+ NULL, NULL,
+ NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
talloc_free(recdb);
return -1;
return 0;
}
-
+/*
+ reload the nodes file, discarding the in-memory node list first.
+ Called from do_recovery() when ctdb->num_nodes no longer matches
+ nodemap->num, i.e. the nodes file appears to have changed on disk.
+ NOTE(review): the old ctdb->nodes array is only unlinked here, not
+ explicitly freed - presumably it is talloc-parented to ctdb; confirm
+ this does not leak across repeated reloads.
+*/
+static void reload_nodes_file(struct ctdb_context *ctdb)
+{
+ ctdb->nodes = NULL;
+ ctdb_load_nodes_file(ctdb);
+}
+
+
/*
we are the recmaster, and recovery is needed - start a recovery run
*/
static int do_recovery(struct ctdb_recoverd *rec,
TALLOC_CTX *mem_ctx, uint32_t pnn,
struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
- uint32_t culprit)
+ int32_t culprit)
{
struct ctdb_context *ctdb = rec->ctdb;
int i, j, ret;
uint32_t generation;
struct ctdb_dbid_map *dbmap;
TDB_DATA data;
+ uint32_t *nodes;
DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
+ if (ctdb->num_nodes != nodemap->num) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
+ reload_nodes_file(ctdb);
+ return -1;
+ }
+
/* if recovery fails, force it again */
rec->need_recovery = true;
- ctdb_set_culprit(rec, culprit);
+ if (culprit != -1) {
+ ctdb_set_culprit(rec, culprit);
+ }
if (rec->culprit_counter > 2*nodemap->num) {
DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
/* set recovery mode to active on all nodes */
ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
- if (ret!=0) {
+ if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
return -1;
}
/* execute the "startrecovery" event script on all nodes */
- ret = run_startrecovery_eventscript(ctdb, nodemap);
+ ret = run_startrecovery_eventscript(rec, nodemap);
if (ret!=0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
return -1;
data.dptr = (void *)&generation;
data.dsize = sizeof(uint32_t);
+ nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
- list_of_active_nodes(ctdb, nodemap, mem_ctx, true),
- CONTROL_TIMEOUT(), false, data, NULL) != 0) {
+ nodes,
+ CONTROL_TIMEOUT(), false, data,
+ NULL, NULL,
+ NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
return -1;
}
/* commit all the changes */
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
- list_of_active_nodes(ctdb, nodemap, mem_ctx, true),
- CONTROL_TIMEOUT(), false, data, NULL) != 0) {
+ nodes,
+ CONTROL_TIMEOUT(), false, data,
+ NULL, NULL,
+ NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
return -1;
}
/*
update all nodes to have the same flags that we have
*/
- ret = update_flags_on_all_nodes(ctdb, nodemap);
+ for (i=0;i<nodemap->num;i++) {
+ if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+ continue;
+ }
+
+ ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
+ return -1;
+ }
+ }
+
+ DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
+
+ /* disable recovery mode */
+ ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
if (ret != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes\n"));
+ DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
return -1;
}
-
- DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
+
+ DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
/*
tell nodes to takeover their public IPs
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
/* execute the "recovered" event script on all nodes */
- ret = run_recovered_eventscript(ctdb, nodemap);
+ ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
if (ret!=0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster\n"));
+ DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
return -1;
}
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
- /* disable recovery mode */
- ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
- if (ret!=0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
- return -1;
- }
-
- DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
-
/* send a message to all clients telling them that the cluster
has been reconfigured */
ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
/*
send out an election request
*/
-static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
+static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
{
int ret;
TDB_DATA election_data;
election_data.dptr = (unsigned char *)&emsg;
- /* first we assume we will win the election and set
- recoverymaster to be ourself on the current node
- */
- ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
- if (ret != 0) {
- DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
- return -1;
- }
-
-
/* send an election message to all active nodes */
ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
+
+ /* A new node that is already frozen has entered the cluster.
+ The existing nodes are not frozen and don't need to be frozen
+ until the election has ended and we start the actual recovery
+ */
+ if (update_recmaster == true) {
+ /* first we assume we will win the election and set
+ recoverymaster to be ourself on the current node
+ */
+ ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
+ return -1;
+ }
+ }
+
+
return 0;
}
struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
int ret;
- ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
+ ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
if (ret != 0) {
DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
}
if (data.dsize != sizeof(struct rd_memdump_reply)) {
DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
+ talloc_free(tmp_ctx);
return;
}
rd = (struct rd_memdump_reply *)data.dptr;
ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
if (ret != 0) {
DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
+ talloc_free(tmp_ctx);
return;
}
/* set all nodes to recovery mode to stop all internode traffic */
ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
- if (ret!=0) {
+ if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
return;
}
timeval_current_ofs(ctdb->tunable.election_timeout, 0),
ctdb_election_timeout, rec);
- ret = send_election_request(rec, pnn);
+ ret = send_election_request(rec, pnn, true);
if (ret!=0) {
DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
return;
changed_flags = c->old_flags ^ c->new_flags;
- /* Dont let messages from remote nodes change the DISCONNECTED flag.
- This flag is handled locally based on whether the local node
- can communicate with the node or not.
- */
- c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
- if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) {
- c->new_flags |= NODE_FLAGS_DISCONNECTED;
- }
-
if (nodemap->nodes[i].flags != c->new_flags) {
DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
}
if (ret == 0 &&
ctdb->recovery_master == ctdb->pnn &&
- ctdb->recovery_mode == CTDB_RECOVERY_NORMAL &&
- ctdb->vnn) {
+ ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
/* Only do the takeover run if the perm disabled or unhealthy
flags changed since these will cause an ip failover but not
a recovery.
talloc_free(tmp_ctx);
}
+/*
+ handler for when we need to push out flag changes to all other nodes;
+ registered on the CTDB_SRVID_PUSH_NODE_FLAGS message port.  The
+ message payload is a struct ctdb_node_flag_change; the node's new
+ flags are pushed via ctdb_ctrl_modflags().  Failure is only logged -
+ this is best effort.
+*/
+static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
+ TDB_DATA data, void *private_data)
+{
+ int ret;
+ struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
+
+ ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+ }
+}
struct verify_recmode_normal_data {
return status;
}
-/*
- this function writes the number of connected nodes we have for this pnn
- to the pnn slot in the reclock file
+
+/* called to check that the allocation of public ip addresses is ok.
*/
-static void
-ctdb_recoverd_write_pnn_connect_count(struct ctdb_recoverd *rec)
+static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
{
- const char count = rec->num_connected;
- struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
-
- if (rec->rec_file_fd == -1) {
- DEBUG(DEBUG_CRIT,(__location__ " Unable to write pnn count. pnnfile is not open.\n"));
- return;
- }
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ struct ctdb_all_public_ips *ips = NULL;
+ struct ctdb_uptime *uptime1 = NULL;
+ struct ctdb_uptime *uptime2 = NULL;
+ int ret, j;
+
+ ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
+ CTDB_CURRENT_NODE, &uptime1);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
+ talloc_free(mem_ctx);
+ return -1;
+ }
- if (pwrite(rec->rec_file_fd, &count, 1, ctdb->pnn) == -1) {
- DEBUG(DEBUG_CRIT, (__location__ " Failed to write pnn count\n"));
- close(rec->rec_file_fd);
- rec->rec_file_fd = -1;
+ /* read the ip allocation from the local node */
+ ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
+ talloc_free(mem_ctx);
+ return -1;
}
-}
-/*
- this function opens the reclock file and sets a byterage lock for the single
- byte at position pnn+1.
- the existence/non-existence of such a lock provides an alternative mechanism
- to know whether a remote node(recovery daemon) is running or not.
-*/
-static void
-ctdb_recoverd_get_pnn_lock(struct ctdb_recoverd *rec)
-{
- struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
- struct flock lock;
- char *pnnfile = NULL;
+ ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
+ CTDB_CURRENT_NODE, &uptime2);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
+ talloc_free(mem_ctx);
+ return -1;
+ }
- DEBUG(DEBUG_INFO, ("Setting PNN lock for pnn:%d\n", ctdb->pnn));
+ /* skip the check if the startrecovery time has changed */
+ if (timeval_compare(&uptime1->last_recovery_started,
+ &uptime2->last_recovery_started) != 0) {
+ DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
+ talloc_free(mem_ctx);
+ return 0;
+ }
- if (rec->rec_file_fd != -1) {
- close(rec->rec_file_fd);
- rec->rec_file_fd = -1;
+ /* skip the check if the endrecovery time has changed */
+ if (timeval_compare(&uptime1->last_recovery_finished,
+ &uptime2->last_recovery_finished) != 0) {
+ DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
+ talloc_free(mem_ctx);
+ return 0;
}
- pnnfile = talloc_asprintf(rec, "%s.pnn", ctdb->recovery_lock_file);
- CTDB_NO_MEMORY_FATAL(ctdb, pnnfile);
+ /* skip the check if we have started but not finished recovery */
+ if (timeval_compare(&uptime1->last_recovery_finished,
+ &uptime1->last_recovery_started) != 1) {
+ DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
+ talloc_free(mem_ctx);
- rec->rec_file_fd = open(pnnfile, O_RDWR|O_CREAT, 0600);
- if (rec->rec_file_fd == -1) {
- DEBUG(DEBUG_CRIT,(__location__ " Unable to open %s - (%s)\n",
- pnnfile, strerror(errno)));
- talloc_free(pnnfile);
- return;
+ return 0;
}
- set_close_on_exec(rec->rec_file_fd);
- lock.l_type = F_WRLCK;
- lock.l_whence = SEEK_SET;
- lock.l_start = ctdb->pnn;
- lock.l_len = 1;
- lock.l_pid = 0;
+ /* verify that we have the ip addresses we should have
+ and we don't have ones we shouldn't have.
+ if we find an inconsistency we set recmode to
+ active on the local node and wait for the recmaster
+ to do a full blown recovery
+ */
+ for (j=0; j<ips->num; j++) {
+ if (ips->ips[j].pnn == pnn) {
+ if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
+ DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
+
+ talloc_free(mem_ctx);
+ return -1;
+ }
+ ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
- if (fcntl(rec->rec_file_fd, F_SETLK, &lock) != 0) {
- close(rec->rec_file_fd);
- rec->rec_file_fd = -1;
- DEBUG(DEBUG_CRIT,(__location__ " Failed to get pnn lock on '%s'\n", pnnfile));
- talloc_free(pnnfile);
- return;
- }
+ talloc_free(mem_ctx);
+ return -1;
+ }
+ }
+ } else {
+ if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
+ DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
- DEBUG(DEBUG_NOTICE,(__location__ " Got pnn lock on '%s'\n", pnnfile));
- talloc_free(pnnfile);
+ talloc_free(mem_ctx);
+ return -1;
+ }
+ ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
- /* we start out with 0 connected nodes */
- ctdb_recoverd_write_pnn_connect_count(rec);
+ talloc_free(mem_ctx);
+ return -1;
+ }
+ }
+ }
+ }
+
+ talloc_free(mem_ctx);
+ return 0;
}
-/*
- called when we need to do the periodical reclock pnn count update
- */
-static void ctdb_update_pnn_count(struct event_context *ev, struct timed_event *te,
- struct timeval t, void *p)
+
+static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
- int i, count;
- struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
- struct ctdb_context *ctdb = rec->ctdb;
- struct ctdb_node_map *nodemap = rec->nodemap;
+ struct ctdb_node_map **remote_nodemaps = callback_data;
- /* close and reopen the pnn lock file */
- ctdb_recoverd_get_pnn_lock(rec);
+ if (node_pnn >= ctdb->num_nodes) {
+ DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
+ return;
+ }
- ctdb_recoverd_write_pnn_connect_count(rec);
+ remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
- event_add_timed(rec->ctdb->ev, rec->ctdb,
- timeval_current_ofs(ctdb->tunable.reclock_ping_period, 0),
- ctdb_update_pnn_count, rec);
+}
- /* check if there is a split cluster and yeld the recmaster role
- it the other half of the cluster is larger
- */
- DEBUG(DEBUG_DEBUG, ("CHECK FOR SPLIT CLUSTER\n"));
- if (rec->nodemap == NULL) {
- return;
- }
- if (rec->rec_file_fd == -1) {
- return;
- }
- /* only test this if we think we are the recmaster */
- if (ctdb->pnn != rec->recmaster) {
- DEBUG(DEBUG_DEBUG, ("We are not recmaster, skip test\n"));
- return;
- }
- if (ctdb->recovery_lock_fd == -1) {
- DEBUG(DEBUG_ERR, (__location__ " Lost reclock pnn file. Yielding recmaster role\n"));
- close(ctdb->recovery_lock_fd);
- ctdb->recovery_lock_fd = -1;
- force_election(rec, ctdb->pnn, rec->nodemap);
- return;
- }
- for (i=0; i<nodemap->num; i++) {
- /* we dont need to check ourself */
- if (nodemap->nodes[i].pnn == ctdb->pnn) {
- continue;
- }
- /* dont check nodes that are connected to us */
- if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
- continue;
- }
- /* check if the node is "connected" and how connected it it */
- count = ctdb_read_pnn_lock(rec->rec_file_fd, nodemap->nodes[i].pnn);
- if (count < 0) {
- continue;
- }
- /* check if that node is more connected that us */
- if (count > rec->num_connected) {
- DEBUG(DEBUG_ERR, ("DISCONNECTED Node %u is more connected than we are, yielding recmaster role\n", nodemap->nodes[i].pnn));
- close(ctdb->recovery_lock_fd);
- ctdb->recovery_lock_fd = -1;
- force_election(rec, ctdb->pnn, rec->nodemap);
- return;
- }
+static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+ struct ctdb_node_map *nodemap,
+ struct ctdb_node_map **remote_nodemaps)
+{
+ uint32_t *nodes;
+
+ nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
+ if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
+ nodes,
+ CONTROL_TIMEOUT(), false, tdb_null,
+ async_getnodemap_callback,
+ NULL,
+ remote_nodemaps) != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
+
+ return -1;
}
+
+ return 0;
}
/*
uint32_t pnn;
TALLOC_CTX *mem_ctx=NULL;
struct ctdb_node_map *nodemap=NULL;
- struct ctdb_node_map *remote_nodemap=NULL;
+ struct ctdb_node_map *recmaster_nodemap=NULL;
+ struct ctdb_node_map **remote_nodemaps=NULL;
struct ctdb_vnn_map *vnnmap=NULL;
struct ctdb_vnn_map *remote_vnnmap=NULL;
int32_t debug_level;
int i, j, ret;
struct ctdb_recoverd *rec;
- struct ctdb_all_public_ips *ips;
char c;
DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
rec->priority_time = timeval_current();
- /* open the rec file fd and lock our slot */
- rec->rec_file_fd = -1;
- ctdb_recoverd_get_pnn_lock(rec);
-
/* register a message port for sending memory dumps */
ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
/* register a message port for recovery elections */
ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
- /* and one for when nodes are disabled/enabled */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, monitor_handler, rec);
+ /* when nodes are disabled/enabled */
+ ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
+
+ /* when we are asked to push out a flag change */
+ ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
- /* and one for when nodes are banned */
+ /* when nodes are banned */
ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
/* and one for when nodes are unbanned */
/* register a message port for vacuum fetch */
ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
- /* update the reclock pnn file connected count on a regular basis */
- event_add_timed(ctdb->ev, ctdb,
- timeval_current_ofs(ctdb->tunable.reclock_ping_period, 0),
- ctdb_update_pnn_count, rec);
-
again:
if (mem_ctx) {
talloc_free(mem_ctx);
exit(-1);
}
+ /* ping the local daemon to tell it we are alive */
+ ctdb_ctrl_recd_ping(ctdb);
+
if (rec->election_timeout) {
/* an election is in progress */
goto again;
/* grap the nodemap from the recovery master to check if it is banned */
ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
- mem_ctx, &remote_nodemap);
+ mem_ctx, &recmaster_nodemap);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
nodemap->nodes[j].pnn));
}
- if (remote_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+ if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
force_election(rec, pnn, nodemap);
goto again;
}
- /* verify that the public ip address allocation is consistent */
- if (ctdb->vnn != NULL) {
- ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
- if (ret != 0) {
- DEBUG(DEBUG_ERR, ("Unable to get public ips from node %u\n", i));
+
+ /* verify that we have all ip addresses we should have and we dont
+ * have addresses we shouldnt have.
+ */
+ if (ctdb->do_checkpublicip) {
+ if (verify_ip_allocation(ctdb, pnn) != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
goto again;
}
- for (j=0; j<ips->num; j++) {
- /* verify that we have the ip addresses we should have
- and we dont have ones we shouldnt have.
- if we find an inconsistency we set recmode to
- active on the local node and wait for the recmaster
- to do a full blown recovery
- */
- if (ips->ips[j].pnn == pnn) {
- if (!ctdb_sys_have_ip(ips->ips[j].sin)) {
- DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
- ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
- goto again;
- }
- ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
- goto again;
- }
- }
- } else {
- if (ctdb_sys_have_ip(ips->ips[j].sin)) {
- DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
- ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
- goto again;
- }
- ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
- goto again;
- }
- }
- }
- }
}
+
/* if we are not the recmaster then we do not need to check
if recovery is needed
*/
/* update the list of public ips that a node can handle for
all connected nodes
*/
+ if (ctdb->num_nodes != nodemap->num) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
+ reload_nodes_file(ctdb);
+ goto again;
+ }
for (j=0; j<nodemap->num; j++) {
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
continue;
if (rec->need_recovery) {
/* a previous recovery didn't finish */
- do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
+ do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, -1);
goto again;
}
goto again;
}
- /* get the nodemap for all active remote nodes and verify
- they are the same as for this node
+
+ /* get the nodemap for all active remote nodes
*/
+ remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
+ if (remote_nodemaps == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
+ goto again;
+ }
+ for(i=0; i<nodemap->num; i++) {
+ remote_nodemaps[i] = NULL;
+ }
+ if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
+ goto again;
+ }
+
+ /* verify that all other nodes have the same nodemap as we have
+ */
for (j=0; j<nodemap->num; j++) {
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
continue;
}
- if (nodemap->nodes[j].pnn == pnn) {
- continue;
- }
- ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
- mem_ctx, &remote_nodemap);
- if (ret != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
- nodemap->nodes[j].pnn));
+ if (remote_nodemaps[j] == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
+ ctdb_set_culprit(rec, j);
+
goto again;
}
- /* if the nodes disagree on how many nodes there are
+ /* if the nodes disagree on how many nodes there are
then this is a good reason to try recovery
*/
- if (remote_nodemap->num != nodemap->num) {
+ if (remote_nodemaps[j]->num != nodemap->num) {
DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
- nodemap->nodes[j].pnn, remote_nodemap->num, nodemap->num));
+ nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
goto again;
}
active, then that is also a good reason to do recovery
*/
for (i=0;i<nodemap->num;i++) {
- if (remote_nodemap->nodes[i].pnn != nodemap->nodes[i].pnn) {
+ if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
nodemap->nodes[j].pnn, i,
- remote_nodemap->nodes[i].pnn, nodemap->nodes[i].pnn));
- do_recovery(rec, mem_ctx, pnn, nodemap,
- vnnmap, nodemap->nodes[j].pnn);
- goto again;
- }
- if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) !=
- (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
- DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n",
- nodemap->nodes[j].pnn, i,
- remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags));
+ remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap, nodemap->nodes[j].pnn);
goto again;
}
}
+ /* verify the flags are consistent
+ */
+ for (i=0; i<nodemap->num; i++) {
+ if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+ continue;
+ }
+
+ if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
+ nodemap->nodes[j].pnn,
+ nodemap->nodes[i].pnn,
+ remote_nodemaps[j]->nodes[i].flags,
+ nodemap->nodes[j].flags));
+ if (i == j) {
+ DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
+ update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
+ do_recovery(rec, mem_ctx, pnn, nodemap,
+ vnnmap, nodemap->nodes[j].pnn);
+ goto again;
+ } else {
+ DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
+ update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
+ do_recovery(rec, mem_ctx, pnn, nodemap,
+ vnnmap, nodemap->nodes[j].pnn);
+ goto again;
+ }
+ }
+ }
}
rec->need_takeover_run = false;
/* execute the "startrecovery" event script on all nodes */
- ret = run_startrecovery_eventscript(ctdb, nodemap);
+ ret = run_startrecovery_eventscript(rec, nodemap);
if (ret!=0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
do_recovery(rec, mem_ctx, pnn, nodemap,
}
/* execute the "recovered" event script on all nodes */
- ret = run_recovered_eventscript(ctdb, nodemap);
+ ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
+#if 0
+// we can't check whether the event completed successfully
+// since this script WILL fail if the node is in recovery mode,
+// and if that race happens, the code here would just cause a second
+// cascading recovery.
if (ret!=0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster\n"));
+ DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap, ctdb->pnn);
}
+#endif
}
+
goto again;
}
{
struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
- /* make sure we harvest the child if signals are blocked for some
- reason
- */
- waitpid(ctdb->recoverd_pid, 0, WNOHANG);
-
if (kill(ctdb->recoverd_pid, 0) != 0) {
DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
ctdb_check_recd, ctdb);
}
+/*
+  SIGCHLD handler for the recovery daemon: reap every exited child so
+  no zombies accumulate.  Replaces the old polled waitpid() in the
+  main daemon's recoverd check.
+
+  Loops on waitpid(-1, WNOHANG) because one SIGCHLD delivery can stand
+  for several exited children; stops when waitpid returns 0 (children
+  remain but none have exited yet).
+
+  NOTE(review): waitpid also returns -1 with errno==ECHILD when there
+  are simply no children left, which this logs at DEBUG_ERR — consider
+  whether that common case should be logged as an error.
+ */
+static void recd_sig_child_handler(struct event_context *ev,
+ struct signal_event *se, int signum, int count,
+ void *dont_care,
+ void *private_data)
+{
+// struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+ int status;
+ pid_t pid = -1;
+
+ while (pid != 0) {
+ pid = waitpid(-1, &status, WNOHANG);
+ if (pid == -1) {
+ DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%d\n", errno));
+ return;
+ }
+ if (pid > 0) {
+ DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
+ }
+ }
+}
+
/*
startup the recovery daemon as a child of the main ctdb daemon
*/
{
int ret;
int fd[2];
+ struct signal_event *se;
if (pipe(fd) != 0) {
return -1;
exit(1);
}
+ /* set up a handler to pick up sigchld */
+ se = event_add_signal(ctdb->ev, ctdb,
+ SIGCHLD, 0,
+ recd_sig_child_handler,
+ ctdb);
+ if (se == NULL) {
+ DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
+ exit(1);
+ }
+
monitor_cluster(ctdb);
DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));