*/
#include "includes.h"
-#include "lib/events/events.h"
+#include "lib/tevent/tevent.h"
#include "system/filesys.h"
#include "system/time.h"
#include "system/network.h"
#include "system/wait.h"
#include "popt.h"
#include "cmdline.h"
-#include "../include/ctdb.h"
+#include "../include/ctdb_client.h"
#include "../include/ctdb_private.h"
#include "db_wrap.h"
#include "dlinklist.h"
TALLOC_CTX *ip_reallocate_ctx;
struct ip_reallocate_list *reallocate_callers;
TALLOC_CTX *ip_check_disable_ctx;
+ struct ctdb_control_get_ifaces *ifaces;
};
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
- DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
+ DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
return;
}
if (node_pnn < ctdb->num_nodes) {
/*
wait for a given number of seconds
*/
-static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
+static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
{
uint32_t timed_out = 0;
- event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
+ time_t usecs = (secs - (time_t)secs) * 1000000;
+ event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
while (!timed_out) {
event_loop_once(ctdb->ev);
}
{
struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
rec->election_timeout = NULL;
+ fast_start = false;
DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
}
ctdb_load_nodes_file(ctdb);
}
-
+/*
+  Refresh the cached public IP lists for every node in the nodemap by
+  querying each active node directly: both the "known" list (all IPs the
+  node is configured for) and the "available" list (IPs it can currently
+  host).  Inactive nodes only have their stale cached lists dropped.
+
+  Returns 0 on success.  On failure returns -1 and, if culprit is
+  non-NULL, sets *culprit to the pnn of the node that could not be
+  queried (or our own pnn on a nodemap/num_nodes mismatch).
+ */
+static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
+					 struct ctdb_recoverd *rec,
+					 struct ctdb_node_map *nodemap,
+					 uint32_t *culprit)
+{
+	int j;
+	int ret;
+
+	/* sanity check: the nodemap must describe the same node count we
+	   track locally, otherwise the per-index accesses below are wrong */
+	if (ctdb->num_nodes != nodemap->num) {
+		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
+				  ctdb->num_nodes, nodemap->num));
+		if (culprit) {
+			*culprit = ctdb->pnn;
+		}
+		return -1;
+	}
+
+	for (j=0; j<nodemap->num; j++) {
+		/* release any existing data */
+		if (ctdb->nodes[j]->known_public_ips) {
+			talloc_free(ctdb->nodes[j]->known_public_ips);
+			ctdb->nodes[j]->known_public_ips = NULL;
+		}
+		if (ctdb->nodes[j]->available_public_ips) {
+			talloc_free(ctdb->nodes[j]->available_public_ips);
+			ctdb->nodes[j]->available_public_ips = NULL;
+		}
+
+		/* inactive nodes cannot be queried; leave their lists empty */
+		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+			continue;
+		}
+
+		/* grab a new shiny list of known public ips from the node */
+		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
+					CONTROL_TIMEOUT(),
+					ctdb->nodes[j]->pnn,
+					ctdb->nodes,
+					0,
+					&ctdb->nodes[j]->known_public_ips);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
+				ctdb->nodes[j]->pnn));
+			if (culprit) {
+				*culprit = ctdb->nodes[j]->pnn;
+			}
+			return -1;
+		}
+
+		/* while we are here, cross-check the node's assignment
+		   against what the recovery master thinks it should be;
+		   a mismatch schedules a takeover run (skipped while the
+		   ip check is temporarily disabled) */
+		if (ctdb->tunable.disable_ip_failover == 0) {
+			if (rec->ip_check_disable_ctx == NULL) {
+				if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
+					DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
+					rec->need_takeover_run = true;
+				}
+			}
+		}
+
+		/* grab a new shiny list of available public ips from the node */
+		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
+					CONTROL_TIMEOUT(),
+					ctdb->nodes[j]->pnn,
+					ctdb->nodes,
+					CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
+					&ctdb->nodes[j]->available_public_ips);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
+				ctdb->nodes[j]->pnn));
+			if (culprit) {
+				*culprit = ctdb->nodes[j]->pnn;
+			}
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* when we start a recovery, make sure all nodes use the same reclock file
+   setting as the local recovery master.
+
+   Broadcasts CTDB_CONTROL_SET_RECLOCK_FILE to all active nodes.  When no
+   reclock file is configured locally, an empty payload is sent, which
+   clears the setting on the remote nodes.  Returns 0 on success, -1 if
+   any node failed to apply the setting.
+*/
+static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	TDB_DATA data;
+	uint32_t *nodes;
+
+	if (ctdb->recovery_lock_file == NULL) {
+		data.dptr  = NULL;
+		data.dsize = 0;
+	} else {
+		/* +1 so the NUL terminator travels with the path */
+		data.dsize = strlen(ctdb->recovery_lock_file) + 1;
+		data.dptr  = (uint8_t *)ctdb->recovery_lock_file;
+	}
+
+	nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
+					nodes, 0,
+					CONTROL_TIMEOUT(),
+					false, data,
+					NULL, NULL,
+					rec) != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	talloc_free(tmp_ctx);
+	return 0;
+}
+
+
/*
we are the recmaster, and recovery is needed - start a recovery run
*/
TDB_DATA data;
uint32_t *nodes;
struct timeval start_time;
+ uint32_t culprit = (uint32_t)-1;
DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
return -1;
}
ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
- DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
+ DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
}
DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
+ /* update all other nodes to use the same setting for reclock files
+ as the local recovery master.
+ */
+ sync_recovery_lock_file_across_cluster(rec);
+
/* set recovery mode to active on all nodes */
ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
if (ret != 0) {
/*
tell nodes to takeover their public IPs
*/
+ ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+ culprit));
+ return -1;
+ }
rec->need_takeover_run = false;
ret = ctdb_takeover_run(ctdb, nodemap);
if (ret != 0) {
/* send a message to all clients telling them that the cluster
has been reconfigured */
- ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
+ ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
We now wait for rerecovery_timeout before we allow
another recovery to take place.
*/
- DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
+ DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
- DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
+ DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
return 0;
}
/* send an election message to all active nodes */
DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
- ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
+ ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
/* A new node that is already frozen has entered the cluster.
DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
- ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
+ ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
if (ret != 0) {
DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
talloc_free(tmp_ctx);
rec->ip_check_disable_ctx = NULL;
}
+
+/*
+  Message handler for CTDB_SRVID_RECD_UPDATE_IP: a node reports a changed
+  public IP assignment.  Only acted upon when this recovery daemon is the
+  recovery master; the payload must be exactly one struct ctdb_public_ip,
+  which is folded into the recmaster's ip assignment tree.
+ */
+static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
+				   TDB_DATA data, void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+	struct ctdb_public_ip *ip;
+
+	/* only the recovery master tracks ip assignments */
+	if (rec->recmaster != rec->ctdb->pnn) {
+		DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
+		return;
+	}
+
+	if (data.dsize != sizeof(struct ctdb_public_ip)) {
+		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
+		return;
+	}
+
+	ip = (struct ctdb_public_ip *)data.dptr;
+
+	update_ip_assignment_tree(rec->ctdb, ip);
+}
+
+
static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
TDB_DATA data, void *private_data)
{
TDB_DATA result;
int32_t ret;
struct ip_reallocate_list *callers;
+ uint32_t culprit;
DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
- ret = ctdb_takeover_run(ctdb, rec->nodemap);
+
+ /* update the list of public ips that a node can handle for
+ all connected nodes
+ */
+ ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+ culprit));
+ rec->need_takeover_run = true;
+ }
+ if (ret == 0) {
+ ret = ctdb_takeover_run(ctdb, rec->nodemap);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+ culprit));
+ rec->need_takeover_run = true;
+ }
+ }
+
result.dsize = sizeof(int32_t);
result.dptr = (uint8_t *)&ret;
DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
"%u:%llu\n", (unsigned)callers->rd->pnn,
(unsigned long long)callers->rd->srvid));
- ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
+ ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
if (ret != 0) {
DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
"message to %u:%llu\n",
/* we got an election packet - update the timeout for the election */
talloc_free(rec->election_timeout);
rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
+ fast_start ?
+ timeval_current_ofs(0, 500000) :
timeval_current_ofs(ctdb->tunable.election_timeout, 0),
ctdb_election_timeout, rec);
talloc_free(rec->election_timeout);
rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
+ fast_start ?
+ timeval_current_ofs(0, 500000) :
timeval_current_ofs(ctdb->tunable.election_timeout, 0),
ctdb_election_timeout, rec);
}
-/* called to check that the allocation of public ip addresses is ok.
+/* called to check that the local allocation of public ip addresses is ok.
*/
-static int verify_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn)
+static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
{
TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ struct ctdb_control_get_ifaces *ifaces = NULL;
struct ctdb_all_public_ips *ips = NULL;
struct ctdb_uptime *uptime1 = NULL;
struct ctdb_uptime *uptime2 = NULL;
int ret, j;
+ bool need_iface_check = false;
+ bool need_takeover_run = false;
ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
CTDB_CURRENT_NODE, &uptime1);
return -1;
}
+
+ /* read the interfaces from the local node */
+ ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
+ talloc_free(mem_ctx);
+ return -1;
+ }
+
+ if (!rec->ifaces) {
+ need_iface_check = true;
+ } else if (rec->ifaces->num != ifaces->num) {
+ need_iface_check = true;
+ } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
+ need_iface_check = true;
+ }
+
+ if (need_iface_check) {
+ DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
+ "local node %u - force takeover run\n",
+ pnn));
+ need_takeover_run = true;
+ }
+
/* read the ip allocation from the local node */
ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
if (ret != 0) {
return 0;
}
+ talloc_free(rec->ifaces);
+ rec->ifaces = talloc_steal(rec, ifaces);
+
/* verify that we have the ip addresses we should have
and we dont have ones we shouldnt have.
if we find an inconsistency we set recmode to
active on the local node and wait for the recmaster
- to do a full blown recovery
+ to do a full blown recovery.
+ also if the pnn is -1 and we are healthy and can host the ip
+	   we also request an ip reallocation.
*/
- for (j=0; j<ips->num; j++) {
- if (ips->ips[j].pnn == pnn) {
- if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
- struct takeover_run_reply rd;
- TDB_DATA data;
-
- DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
- ctdb_addr_to_str(&ips->ips[j].addr)));
-
- rd.pnn = ctdb->pnn;
- rd.srvid = 0;
- data.dptr = (uint8_t *)&rd;
- data.dsize = sizeof(rd);
-
- ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
+ if (ctdb->tunable.disable_ip_failover == 0) {
+ for (j=0; j<ips->num; j++) {
+ if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
+ DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ need_takeover_run = true;
+ } else if (ips->ips[j].pnn == pnn) {
+ if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
+ DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ need_takeover_run = true;
}
- }
- } else {
- if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
- struct takeover_run_reply rd;
- TDB_DATA data;
-
- DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
- ctdb_addr_to_str(&ips->ips[j].addr)));
-
- rd.pnn = ctdb->pnn;
- rd.srvid = 0;
- data.dptr = (uint8_t *)&rd;
- data.dsize = sizeof(rd);
-
- ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
+ } else {
+ if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
+ DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ need_takeover_run = true;
}
}
}
}
+ if (need_takeover_run) {
+ struct takeover_run_reply rd;
+ TDB_DATA data;
+
+ DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
+
+ rd.pnn = ctdb->pnn;
+ rd.srvid = 0;
+ data.dptr = (uint8_t *)&rd;
+ data.dsize = sizeof(rd);
+
+ ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
+ }
+ }
talloc_free(mem_ctx);
return 0;
}
close(state->fd[0]);
state->fd[0] = -1;
+ debug_extra = talloc_asprintf(NULL, "recovery-lock:");
if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
cc = RECLOCK_FAILED;
}
state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
- EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
+ EVENT_FD_READ,
reclock_child_handler,
(void *)state);
talloc_free(state);
return -1;
}
+ tevent_fd_set_auto_close(state->fde);
while (state->status == RECLOCK_CHECKING) {
event_loop_once(ctdb->ev);
talloc_free(tmp_ctx);
return 0;
}
-
-/*
- the main monitoring loop
- */
-static void monitor_cluster(struct ctdb_context *ctdb)
+
+static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
+ TALLOC_CTX *mem_ctx)
{
uint32_t pnn;
- TALLOC_CTX *mem_ctx=NULL;
struct ctdb_node_map *nodemap=NULL;
struct ctdb_node_map *recmaster_nodemap=NULL;
struct ctdb_node_map **remote_nodemaps=NULL;
struct ctdb_vnn_map *remote_vnnmap=NULL;
int32_t debug_level;
int i, j, ret;
- struct ctdb_recoverd *rec;
-
- DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
- rec = talloc_zero(ctdb, struct ctdb_recoverd);
- CTDB_NO_MEMORY_FATAL(ctdb, rec);
-
- rec->ctdb = ctdb;
-
- rec->priority_time = timeval_current();
-
- /* register a message port for sending memory dumps */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
-
- /* register a message port for recovery elections */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
-
- /* when nodes are disabled/enabled */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
-
- /* when we are asked to puch out a flag change */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
-
- /* register a message port for vacuum fetch */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
- /* register a message port for reloadnodes */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
-
- /* register a message port for performing a takeover run */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
-
- /* register a message port for disabling the ip check for a short while */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
-
-again:
- if (mem_ctx) {
- talloc_free(mem_ctx);
- mem_ctx = NULL;
- }
- mem_ctx = talloc_new(ctdb);
- if (!mem_ctx) {
- DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
- exit(-1);
- }
-
- /* we only check for recovery once every second */
- ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
/* verify that the main daemon is still running */
if (kill(ctdb->ctdbd_pid, 0) != 0) {
if (rec->election_timeout) {
/* an election is in progress */
- goto again;
+ return;
}
/* read the debug level from the parent and update locally */
ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
if (ret !=0) {
DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
- goto again;
+ return;
}
LogLevel = debug_level;
ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
if (ret != 0) {
DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
- goto again;
+ return;
}
/* get the current recovery lock file from the server */
if (update_recovery_lock_file(ctdb) != 0) {
DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
- goto again;
+ return;
}
/* Make sure that if recovery lock verification becomes disabled when
pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
if (pnn == (uint32_t)-1) {
DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
- goto again;
+ return;
}
/* get the vnnmap */
ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
- goto again;
+ return;
}
ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
- goto again;
+ return;
}
nodemap = rec->nodemap;
ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
- goto again;
+ return;
}
/* if we are not the recmaster we can safely ignore any ip reallocate requests */
if (rec->recmaster == (uint32_t)-1) {
DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
if (ret != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
- goto again;
+ return;
}
ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
if (ret != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
- goto again;
+ return;
}
- goto again;
+ return;
}
}
/* If the local node is stopped, verify we are not the recmaster
if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
/* check that we (recovery daemon) and the local ctdb daemon
if (j == nodemap->num) {
DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
/* if recovery master is disconnected we must elect a new recmaster */
if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
/* grap the nodemap from the recovery master to check if it is banned */
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
nodemap->nodes[j].pnn));
- goto again;
+ return;
}
if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
/* verify that we have all ip addresses we should have and we dont
* have addresses we shouldnt have.
*/
- if (ctdb->do_checkpublicip) {
+ if (ctdb->tunable.disable_ip_failover == 0) {
if (rec->ip_check_disable_ctx == NULL) {
- if (verify_ip_allocation(ctdb, rec, pnn) != 0) {
+ if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
}
}
if recovery is needed
*/
if (pnn != rec->recmaster) {
- goto again;
+ return;
}
if (ret == MONITOR_ELECTION_NEEDED) {
DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
if (ret != MONITOR_OK) {
DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
- goto again;
+ return;
}
- /* update the list of public ips that a node can handle for
- all connected nodes
- */
if (ctdb->num_nodes != nodemap->num) {
DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
reload_nodes_file(ctdb);
- goto again;
- }
- for (j=0; j<nodemap->num; j++) {
- /* release any existing data */
- if (ctdb->nodes[j]->public_ips) {
- talloc_free(ctdb->nodes[j]->public_ips);
- ctdb->nodes[j]->public_ips = NULL;
- }
-
- if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
- continue;
- }
-
- /* grab a new shiny list of public ips from the node */
- if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
- ctdb->nodes[j]->pnn,
- ctdb->nodes,
- &ctdb->nodes[j]->public_ips)) {
- DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
- ctdb->nodes[j]->pnn));
- goto again;
- }
+ return;
}
-
/* verify that all active nodes agree that we are the recmaster */
switch (verify_recmaster(rec, nodemap, pnn)) {
case MONITOR_RECOVERY_NEEDED:
/* can not happen */
- goto again;
+ return;
case MONITOR_ELECTION_NEEDED:
force_election(rec, pnn, nodemap);
- goto again;
+ return;
case MONITOR_OK:
break;
case MONITOR_FAILED:
- goto again;
+ return;
}
if (rec->need_recovery) {
/* a previous recovery didn't finish */
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* verify that all active nodes are in normal mode
switch (verify_recmode(ctdb, nodemap)) {
case MONITOR_RECOVERY_NEEDED:
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
case MONITOR_FAILED:
- goto again;
+ return;
case MONITOR_ELECTION_NEEDED:
/* can not happen */
case MONITOR_OK:
DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
ctdb_set_culprit(rec, ctdb->pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
}
remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
if (remote_nodemaps == NULL) {
DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
- goto again;
+ return;
}
for(i=0; i<nodemap->num; i++) {
remote_nodemaps[i] = NULL;
}
if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
- goto again;
+ return;
}
/* verify that all other nodes have the same nodemap as we have
DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
ctdb_set_culprit(rec, j);
- goto again;
+ return;
}
/* if the nodes disagree on how many nodes there are
nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* if the nodes disagree on which nodes exist and are
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap);
- goto again;
+ return;
}
}
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap);
- goto again;
+ return;
} else {
DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap);
- goto again;
+ return;
}
}
}
vnnmap->size, rec->num_active));
ctdb_set_culprit(rec, ctdb->pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* verify that all active nodes in the nodemap also exist in
nodemap->nodes[j].pnn));
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
}
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
nodemap->nodes[j].pnn));
- goto again;
+ return;
}
/* verify the vnnmap generation is the same */
nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* verify the vnnmap size is the same */
nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* verify the vnnmap is the same */
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap);
- goto again;
+ return;
}
}
}
/* we might need to change who has what IP assigned */
if (rec->need_takeover_run) {
+ uint32_t culprit = (uint32_t)-1;
+
rec->need_takeover_run = false;
+ /* update the list of public ips that a node can handle for
+ all connected nodes
+ */
+ ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+ culprit));
+ ctdb_set_culprit(rec, culprit);
+ do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+ return;
+ }
+
/* execute the "startrecovery" event script on all nodes */
ret = run_startrecovery_eventscript(rec, nodemap);
if (ret!=0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
ctdb_set_culprit(rec, ctdb->pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
ret = ctdb_takeover_run(ctdb, nodemap);
DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
ctdb_set_culprit(rec, ctdb->pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* execute the "recovered" event script on all nodes */
}
#endif
}
+}
+/*
+ the main monitoring loop
+ */
+static void monitor_cluster(struct ctdb_context *ctdb)
+{
+ struct ctdb_recoverd *rec;
- goto again;
+ DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
+ rec = talloc_zero(ctdb, struct ctdb_recoverd);
+ CTDB_NO_MEMORY_FATAL(ctdb, rec);
+
+ rec->ctdb = ctdb;
+
+ rec->priority_time = timeval_current();
+
+ /* register a message port for sending memory dumps */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
+
+ /* register a message port for recovery elections */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
+
+ /* when nodes are disabled/enabled */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
+
+	/* when we are asked to push out a flag change */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
+
+ /* register a message port for vacuum fetch */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
+
+ /* register a message port for reloadnodes */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
+
+ /* register a message port for performing a takeover run */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
+
+ /* register a message port for disabling the ip check for a short while */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
+
+ /* register a message port for updating the recovery daemons node assignment for an ip */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
+
+ for (;;) {
+ TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+ struct timeval start;
+ double elapsed;
+
+ if (!mem_ctx) {
+ DEBUG(DEBUG_CRIT,(__location__
+ " Failed to create temp context\n"));
+ exit(-1);
+ }
+
+ start = timeval_current();
+ main_loop(ctdb, rec, mem_ctx);
+ talloc_free(mem_ctx);
+
+ /* we only check for recovery once every second */
+ elapsed = timeval_elapsed(&start);
+ if (elapsed < ctdb->tunable.recover_interval) {
+ ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
+ - elapsed);
+ }
+ }
}
/*
{
int fd[2];
struct signal_event *se;
+ struct tevent_fd *fde;
if (pipe(fd) != 0) {
return -1;
srandom(getpid() ^ time(NULL));
- if (switch_from_server_to_client(ctdb) != 0) {
+ if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
exit(1);
}
- DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
+ DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
- event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
+ fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
ctdb_recoverd_parent, &fd[0]);
+ tevent_fd_set_auto_close(fde);
/* set up a handler to pick up sigchld */
se = event_add_signal(ctdb->ev, ctdb,