*/
#include "includes.h"
-#include "lib/events/events.h"
+#include "lib/tevent/tevent.h"
#include "system/filesys.h"
#include "system/time.h"
#include "system/network.h"
#include "system/wait.h"
#include "popt.h"
#include "cmdline.h"
-#include "../include/ctdb.h"
+#include "../include/ctdb_client.h"
#include "../include/ctdb_private.h"
#include "db_wrap.h"
#include "dlinklist.h"
TALLOC_CTX *ip_reallocate_ctx;
struct ip_reallocate_list *reallocate_callers;
TALLOC_CTX *ip_check_disable_ctx;
+ struct ctdb_control_get_ifaces *ifaces;
};
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
- DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
+ DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
return;
}
if (node_pnn < ctdb->num_nodes) {
pull the remote database contents from one node into the recdb
*/
static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
- struct tdb_wrap *recdb, uint32_t dbid)
+ struct tdb_wrap *recdb, uint32_t dbid,
+ bool persistent)
{
int ret;
TDB_DATA outdata;
static int pull_remote_database(struct ctdb_context *ctdb,
struct ctdb_recoverd *rec,
struct ctdb_node_map *nodemap,
- struct tdb_wrap *recdb, uint32_t dbid)
+ struct tdb_wrap *recdb, uint32_t dbid,
+ bool persistent)
{
int j;
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
continue;
}
- if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
+ if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
nodemap->nodes[j].pnn));
ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
/*
wait for a given number of seconds
*/
-static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
+static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
{
uint32_t timed_out = 0;
- event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
+ time_t usecs = (secs - (time_t)secs) * 1000000;
+ event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
while (!timed_out) {
event_loop_once(ctdb->ev);
}
{
struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
rec->election_timeout = NULL;
+ fast_start = false;
DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
}
unsigned tdb_flags;
/* open up the temporary recovery database */
- name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
+ name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
+ ctdb->db_directory_state,
+ ctdb->pnn);
if (name == NULL) {
return NULL;
}
unlink(name);
tdb_flags = TDB_NOLOCK;
- if (!ctdb->do_setsched) {
+ if (ctdb->valgrinding) {
tdb_flags |= TDB_NOMMAP;
}
+ tdb_flags |= TDB_DISALLOW_NESTING;
recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
struct ctdb_marshall_buffer *recdata;
uint32_t len;
bool failed;
+ bool persistent;
};
static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
/* update the dmaster field to point to us */
hdr = (struct ctdb_ltdb_header *)data.dptr;
- hdr->dmaster = params->ctdb->pnn;
+ if (!params->persistent) {
+ hdr->dmaster = params->ctdb->pnn;
+ }
/* add the record to the blob ready to send to the nodes */
rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
push the recdb database out to all nodes
*/
static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
+ bool persistent,
struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
{
struct recdb_data params;
params.recdata = recdata;
params.len = offsetof(struct ctdb_marshall_buffer, data);
params.failed = false;
+ params.persistent = persistent;
if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
static int recover_database(struct ctdb_recoverd *rec,
TALLOC_CTX *mem_ctx,
uint32_t dbid,
+ bool persistent,
uint32_t pnn,
struct ctdb_node_map *nodemap,
uint32_t transaction_id)
}
/* pull all remote databases onto the recdb */
- ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
+ ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
return -1;
/* push out the correct database. This sets the dmaster and skips
the empty records */
- ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
+ ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
if (ret != 0) {
talloc_free(recdb);
return -1;
ctdb_load_nodes_file(ctdb);
}
-
+/*
+  Refresh the cached public ip lists of every node in the nodemap.
+
+  For each node the previously cached known_public_ips and
+  available_public_ips are released first.  Nodes flagged
+  NODE_FLAGS_INACTIVE are then skipped; for every other node both
+  lists are fetched again with ctdb_ctrl_get_public_ips_flags().
+
+  When the disable_ip_failover tunable is 0 and
+  rec->ip_check_disable_ctx is NULL, the freshly read "known" list is
+  handed to verify_remote_ip_allocation(); a non-zero result sets
+  rec->need_takeover_run.
+
+  Returns 0 on success and -1 on failure.  On failure, and only if
+  culprit is not NULL, *culprit is set to the pnn of the node whose
+  list could not be read, or to ctdb->pnn when ctdb->num_nodes and
+  nodemap->num disagree.
+*/
+static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
+					 struct ctdb_recoverd *rec,
+					 struct ctdb_node_map *nodemap,
+					 uint32_t *culprit)
+{
+	int j;
+	int ret;
+
+	/* ctdb->nodes[] is indexed with the nodemap index below, so both
+	   views must describe the same number of nodes */
+	if (ctdb->num_nodes != nodemap->num) {
+		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
+				  ctdb->num_nodes, nodemap->num));
+		if (culprit) {
+			*culprit = ctdb->pnn;
+		}
+		return -1;
+	}
+
+	for (j=0; j<nodemap->num; j++) {
+		/* release any existing data */
+		if (ctdb->nodes[j]->known_public_ips) {
+			talloc_free(ctdb->nodes[j]->known_public_ips);
+			ctdb->nodes[j]->known_public_ips = NULL;
+		}
+		if (ctdb->nodes[j]->available_public_ips) {
+			talloc_free(ctdb->nodes[j]->available_public_ips);
+			ctdb->nodes[j]->available_public_ips = NULL;
+		}
+
+		/* inactive nodes keep both lists NULL */
+		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+			continue;
+		}
+
+		/* grab a new shiny list of known public ips from the node
+		   (flags == 0) */
+		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
+					CONTROL_TIMEOUT(),
+					ctdb->nodes[j]->pnn,
+					ctdb->nodes,
+					0,
+					&ctdb->nodes[j]->known_public_ips);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
+				ctdb->nodes[j]->pnn));
+			if (culprit) {
+				*culprit = ctdb->nodes[j]->pnn;
+			}
+			return -1;
+		}
+
+		if (ctdb->tunable.disable_ip_failover == 0) {
+			if (rec->ip_check_disable_ctx == NULL) {
+				if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
+					/* pnn is unsigned, print it with %u like the other messages here */
+					DEBUG(DEBUG_ERR,("Node %u has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
+					rec->need_takeover_run = true;
+				}
+			}
+		}
+
+		/* grab a new shiny list of available public ips from the node
+		   (CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) */
+		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
+					CONTROL_TIMEOUT(),
+					ctdb->nodes[j]->pnn,
+					ctdb->nodes,
+					CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
+					&ctdb->nodes[j]->available_public_ips);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
+				ctdb->nodes[j]->pnn));
+			if (culprit) {
+				*culprit = ctdb->nodes[j]->pnn;
+			}
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* At the start of a recovery, send the local recovery lock file setting
+   to the nodes returned by list_of_active_nodes() so that all of them
+   use the same reclock file.
+
+   Returns 0 on success, -1 if the CTDB_CONTROL_SET_RECLOCK_FILE control
+   failed.
+*/
+static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	TDB_DATA data;
+	uint32_t *nodes;
+	int status;
+
+	/* an unset lock file travels as an empty blob, otherwise the path
+	   is sent including its terminating NUL byte */
+	data.dptr  = (uint8_t *)ctdb->recovery_lock_file;
+	data.dsize = (ctdb->recovery_lock_file != NULL)
+			? strlen(ctdb->recovery_lock_file) + 1
+			: 0;
+
+	/* the node list lives on tmp_ctx and is released with it below */
+	nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
+
+	status = ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
+					nodes, 0,
+					CONTROL_TIMEOUT(),
+					false, data,
+					NULL, NULL,
+					rec);
+	if (status != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
+		status = -1;
+	}
+
+	talloc_free(tmp_ctx);
+	return status;
+}
+
+
/*
we are the recmaster, and recovery is needed - start a recovery run
*/
TDB_DATA data;
uint32_t *nodes;
struct timeval start_time;
+ uint32_t culprit = (uint32_t)-1;
DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
return -1;
}
ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
- DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
+ DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
}
DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
+ /* update all other nodes to use the same setting for reclock files
+ as the local recovery master.
+ */
+ sync_recovery_lock_file_across_cluster(rec);
+
/* set recovery mode to active on all nodes */
ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
if (ret != 0) {
return -1;
}
+ /*
+ update all nodes to have the same flags that we have
+ */
+ for (i=0;i<nodemap->num;i++) {
+ if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+ continue;
+ }
+
+ ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
+ return -1;
+ }
+ }
+
+ DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
+
/* pick a new generation number */
generation = new_generation();
DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
for (i=0;i<dbmap->num;i++) {
- if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
+ ret = recover_database(rec, mem_ctx,
+ dbmap->dbs[i].dbid,
+ dbmap->dbs[i].persistent,
+ pnn, nodemap, generation);
+ if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
return -1;
}
/*
tell nodes to takeover their public IPs
*/
+ ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+ culprit));
+ return -1;
+ }
rec->need_takeover_run = false;
ret = ctdb_takeover_run(ctdb, nodemap);
if (ret != 0) {
/* send a message to all clients telling them that the cluster
has been reconfigured */
- ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
+ ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
We now wait for rerecovery_timeout before we allow
another recovery to take place.
*/
- DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
+ DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
- DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
+ DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
return 0;
}
/* send an election message to all active nodes */
DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
- ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
+ ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
/* A new node that is already frozen has entered the cluster.
DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
- ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
+ ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
if (ret != 0) {
DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
talloc_free(tmp_ctx);
rec->ip_check_disable_ctx = NULL;
}
+
+/*
+  Message handler that records an updated node assignment for a
+  public ip.
+
+  private_data is the struct ctdb_recoverd; the ctdb and srvid
+  arguments are not used.  The message is ignored unless this node is
+  the recmaster (rec->recmaster == rec->ctdb->pnn) and the payload is
+  exactly one struct ctdb_public_ip, which is then passed to
+  update_ip_assignment_tree().
+*/
+static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
+			     TDB_DATA data, void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+	struct ctdb_public_ip *ip;
+
+	if (rec->recmaster != rec->ctdb->pnn) {
+		DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
+		return;
+	}
+
+	/* reject anything that is not exactly one ctdb_public_ip before
+	   the payload is reinterpreted below */
+	if (data.dsize != sizeof(struct ctdb_public_ip)) {
+		/* both values are size_t, hence %zu rather than %zd */
+		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zu but expected %zu bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
+		return;
+	}
+
+	ip = (struct ctdb_public_ip *)data.dptr;
+
+	update_ip_assignment_tree(rec->ctdb, ip);
+}
+
+
static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
TDB_DATA data, void *private_data)
{
TDB_DATA result;
int32_t ret;
struct ip_reallocate_list *callers;
+ uint32_t culprit;
DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
- ret = ctdb_takeover_run(ctdb, rec->nodemap);
+
+ /* update the list of public ips that a node can handle for
+ all connected nodes
+ */
+ ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+ culprit));
+ rec->need_takeover_run = true;
+ }
+ if (ret == 0) {
+ ret = ctdb_takeover_run(ctdb, rec->nodemap);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+ culprit));
+ rec->need_takeover_run = true;
+ }
+ }
+
result.dsize = sizeof(int32_t);
result.dptr = (uint8_t *)&ret;
for (callers=rec->reallocate_callers; callers; callers=callers->next) {
+
+ /* Someone that sent srvid==0 does not want a reply */
+ if (callers->rd->srvid == 0) {
+ continue;
+ }
DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
"%u:%llu\n", (unsigned)callers->rd->pnn,
(unsigned long long)callers->rd->srvid));
- ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
+ ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
if (ret != 0) {
DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
"message to %u:%llu\n",
/* we got an election packet - update the timeout for the election */
talloc_free(rec->election_timeout);
rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
+ fast_start ?
+ timeval_current_ofs(0, 500000) :
timeval_current_ofs(ctdb->tunable.election_timeout, 0),
ctdb_election_timeout, rec);
talloc_free(rec->election_timeout);
rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
+ fast_start ?
+ timeval_current_ofs(0, 500000) :
timeval_current_ofs(ctdb->tunable.election_timeout, 0),
ctdb_election_timeout, rec);
{
int ret;
struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
+ struct ctdb_node_map *nodemap=NULL;
+ TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+ uint32_t recmaster;
+ uint32_t *nodes;
- ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
+ /* find the recovery master */
+ ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
if (ret != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+ DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
+ talloc_free(tmp_ctx);
+ return;
+ }
+
+ /* read the node flags from the recmaster */
+ ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
+ talloc_free(tmp_ctx);
+ return;
}
+ if (c->pnn >= nodemap->num) {
+ DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
+ talloc_free(tmp_ctx);
+ return;
+ }
+
+ /* send the flags update to all connected nodes */
+ nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+
+ if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
+ nodes, 0, CONTROL_TIMEOUT(),
+ false, data,
+ NULL, NULL,
+ NULL) != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
+
+ talloc_free(tmp_ctx);
+ return;
+ }
+
+ talloc_free(tmp_ctx);
}
}
-/* called to check that the allocation of public ip addresses is ok.
+/* called to check that the local allocation of public ip addresses is ok.
*/
-static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
+static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
{
TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ struct ctdb_control_get_ifaces *ifaces = NULL;
struct ctdb_all_public_ips *ips = NULL;
struct ctdb_uptime *uptime1 = NULL;
struct ctdb_uptime *uptime2 = NULL;
int ret, j;
+ bool need_iface_check = false;
+ bool need_takeover_run = false;
ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
CTDB_CURRENT_NODE, &uptime1);
return -1;
}
+
+ /* read the interfaces from the local node */
+ ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
+ talloc_free(mem_ctx);
+ return -1;
+ }
+
+ if (!rec->ifaces) {
+ need_iface_check = true;
+ } else if (rec->ifaces->num != ifaces->num) {
+ need_iface_check = true;
+ } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
+ need_iface_check = true;
+ }
+
+ if (need_iface_check) {
+ DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
+ "local node %u - force takeover run\n",
+ pnn));
+ need_takeover_run = true;
+ }
+
/* read the ip allocation from the local node */
ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
if (ret != 0) {
return 0;
}
+ talloc_free(rec->ifaces);
+ rec->ifaces = talloc_steal(rec, ifaces);
+
/* verify that we have the ip addresses we should have
and we dont have ones we shouldnt have.
if we find an inconsistency we set recmode to
active on the local node and wait for the recmaster
- to do a full blown recovery
+ to do a full blown recovery.
+ also if the pnn is -1 and we are healthy and can host the ip
+ we also request a ip reallocation.
*/
- for (j=0; j<ips->num; j++) {
- if (ips->ips[j].pnn == pnn) {
- if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
- DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
- ctdb_addr_to_str(&ips->ips[j].addr)));
- ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
-
- talloc_free(mem_ctx);
- return -1;
+ if (ctdb->tunable.disable_ip_failover == 0) {
+ for (j=0; j<ips->num; j++) {
+ if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
+ DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ need_takeover_run = true;
+ } else if (ips->ips[j].pnn == pnn) {
+ if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
+ DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ need_takeover_run = true;
}
- ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
-
- talloc_free(mem_ctx);
- return -1;
+ } else {
+ if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
+ DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ need_takeover_run = true;
}
}
- } else {
- if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
- DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
- ctdb_addr_to_str(&ips->ips[j].addr)));
+ }
+ }
- ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
+ if (need_takeover_run) {
+ struct takeover_run_reply rd;
+ TDB_DATA data;
- talloc_free(mem_ctx);
- return -1;
- }
- ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
+ DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
- talloc_free(mem_ctx);
- return -1;
- }
- }
+ rd.pnn = ctdb->pnn;
+ rd.srvid = 0;
+ data.dptr = (uint8_t *)&rd;
+ data.dsize = sizeof(rd);
+
+ ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
}
}
-
talloc_free(mem_ctx);
return 0;
}
close(state->fd[0]);
state->fd[0] = -1;
+ debug_extra = talloc_asprintf(NULL, "recovery-lock:");
if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
cc = RECLOCK_FAILED;
}
state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
- EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
+ EVENT_FD_READ,
reclock_child_handler,
(void *)state);
talloc_free(state);
return -1;
}
+ tevent_fd_set_auto_close(state->fde);
while (state->status == RECLOCK_CHECKING) {
event_loop_once(ctdb->ev);
talloc_free(tmp_ctx);
return 0;
}
-
-/*
- the main monitoring loop
- */
-static void monitor_cluster(struct ctdb_context *ctdb)
+
+static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
+ TALLOC_CTX *mem_ctx)
{
uint32_t pnn;
- TALLOC_CTX *mem_ctx=NULL;
struct ctdb_node_map *nodemap=NULL;
struct ctdb_node_map *recmaster_nodemap=NULL;
struct ctdb_node_map **remote_nodemaps=NULL;
struct ctdb_vnn_map *remote_vnnmap=NULL;
int32_t debug_level;
int i, j, ret;
- struct ctdb_recoverd *rec;
-
- DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
-
- rec = talloc_zero(ctdb, struct ctdb_recoverd);
- CTDB_NO_MEMORY_FATAL(ctdb, rec);
- rec->ctdb = ctdb;
-
- rec->priority_time = timeval_current();
- /* register a message port for sending memory dumps */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
-
- /* register a message port for recovery elections */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
-
- /* when nodes are disabled/enabled */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
-
- /* when we are asked to puch out a flag change */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
-
- /* register a message port for vacuum fetch */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
-
- /* register a message port for reloadnodes */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
-
- /* register a message port for performing a takeover run */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
-
- /* register a message port for disabling the ip check for a short while */
- ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
-
-again:
- if (mem_ctx) {
- talloc_free(mem_ctx);
- mem_ctx = NULL;
- }
- mem_ctx = talloc_new(ctdb);
- if (!mem_ctx) {
- DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
- exit(-1);
- }
-
- /* we only check for recovery once every second */
- ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
/* verify that the main daemon is still running */
if (kill(ctdb->ctdbd_pid, 0) != 0) {
if (rec->election_timeout) {
/* an election is in progress */
- goto again;
+ return;
}
/* read the debug level from the parent and update locally */
ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
if (ret !=0) {
DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
- goto again;
+ return;
}
LogLevel = debug_level;
ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
if (ret != 0) {
DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
- goto again;
+ return;
}
/* get the current recovery lock file from the server */
if (update_recovery_lock_file(ctdb) != 0) {
DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
- goto again;
+ return;
}
/* Make sure that if recovery lock verification becomes disabled when
pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
if (pnn == (uint32_t)-1) {
DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
- goto again;
+ return;
}
/* get the vnnmap */
ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
- goto again;
+ return;
}
ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
- goto again;
+ return;
}
nodemap = rec->nodemap;
ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
- goto again;
+ return;
}
/* if we are not the recmaster we can safely ignore any ip reallocate requests */
if (rec->recmaster == (uint32_t)-1) {
DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
if (ret != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
- goto again;
+ return;
}
ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
if (ret != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
- goto again;
+ return;
}
- goto again;
+ return;
}
}
/* If the local node is stopped, verify we are not the recmaster
if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
/* check that we (recovery daemon) and the local ctdb daemon
if (j == nodemap->num) {
DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
/* if recovery master is disconnected we must elect a new recmaster */
if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
/* grap the nodemap from the recovery master to check if it is banned */
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
nodemap->nodes[j].pnn));
- goto again;
+ return;
}
if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
/* verify that we have all ip addresses we should have and we dont
* have addresses we shouldnt have.
*/
- if (ctdb->do_checkpublicip) {
+ if (ctdb->tunable.disable_ip_failover == 0) {
if (rec->ip_check_disable_ctx == NULL) {
- if (verify_ip_allocation(ctdb, pnn) != 0) {
+ if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
- goto again;
}
}
}
if recovery is needed
*/
if (pnn != rec->recmaster) {
- goto again;
+ return;
}
if (ret == MONITOR_ELECTION_NEEDED) {
DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
force_election(rec, pnn, nodemap);
- goto again;
+ return;
}
if (ret != MONITOR_OK) {
DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
- goto again;
+ return;
}
- /* update the list of public ips that a node can handle for
- all connected nodes
- */
if (ctdb->num_nodes != nodemap->num) {
DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
reload_nodes_file(ctdb);
- goto again;
- }
- for (j=0; j<nodemap->num; j++) {
- /* release any existing data */
- if (ctdb->nodes[j]->public_ips) {
- talloc_free(ctdb->nodes[j]->public_ips);
- ctdb->nodes[j]->public_ips = NULL;
- }
-
- if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
- continue;
- }
-
- /* grab a new shiny list of public ips from the node */
- if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
- ctdb->nodes[j]->pnn,
- ctdb->nodes,
- &ctdb->nodes[j]->public_ips)) {
- DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
- ctdb->nodes[j]->pnn));
- goto again;
- }
+ return;
}
-
/* verify that all active nodes agree that we are the recmaster */
switch (verify_recmaster(rec, nodemap, pnn)) {
case MONITOR_RECOVERY_NEEDED:
/* can not happen */
- goto again;
+ return;
case MONITOR_ELECTION_NEEDED:
force_election(rec, pnn, nodemap);
- goto again;
+ return;
case MONITOR_OK:
break;
case MONITOR_FAILED:
- goto again;
+ return;
}
if (rec->need_recovery) {
/* a previous recovery didn't finish */
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* verify that all active nodes are in normal mode
switch (verify_recmode(ctdb, nodemap)) {
case MONITOR_RECOVERY_NEEDED:
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
case MONITOR_FAILED:
- goto again;
+ return;
case MONITOR_ELECTION_NEEDED:
/* can not happen */
case MONITOR_OK:
DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
ctdb_set_culprit(rec, ctdb->pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
}
remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
if (remote_nodemaps == NULL) {
DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
- goto again;
+ return;
}
for(i=0; i<nodemap->num; i++) {
remote_nodemaps[i] = NULL;
}
if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
- goto again;
+ return;
}
/* verify that all other nodes have the same nodemap as we have
DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
ctdb_set_culprit(rec, j);
- goto again;
+ return;
}
/* if the nodes disagree on how many nodes there are
nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* if the nodes disagree on which nodes exist and are
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap);
- goto again;
+ return;
}
}
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap);
- goto again;
+ return;
} else {
DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap);
- goto again;
+ return;
}
}
}
vnnmap->size, rec->num_active));
ctdb_set_culprit(rec, ctdb->pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* verify that all active nodes in the nodemap also exist in
nodemap->nodes[j].pnn));
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
}
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
nodemap->nodes[j].pnn));
- goto again;
+ return;
}
/* verify the vnnmap generation is the same */
nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* verify the vnnmap size is the same */
nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
- goto again;
+ return;
}
/* verify the vnnmap is the same */
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap);
- goto again;
+ return;
}
}
}
/* we might need to change who has what IP assigned */
if (rec->need_takeover_run) {
+ uint32_t culprit = (uint32_t)-1;
+
rec->need_takeover_run = false;
+ /* update the list of public ips that a node can handle for
+ all connected nodes
+ */
+ ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+ culprit));
+ ctdb_set_culprit(rec, culprit);
+ do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+ return;
+ }
+
/* execute the "startrecovery" event script on all nodes */
ret = run_startrecovery_eventscript(rec, nodemap);
if (ret!=0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
ctdb_set_culprit(rec, ctdb->pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+ return;
}
ret = ctdb_takeover_run(ctdb, nodemap);
DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
ctdb_set_culprit(rec, ctdb->pnn);
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+ return;
}
/* execute the "recovered" event script on all nodes */
}
#endif
}
+}
+/*
+ the main monitoring loop
+
+ Allocates the recovery daemon state (struct ctdb_recoverd) as a talloc
+ child of ctdb, registers the recovery daemon's message handlers (each
+ is passed the state as its private data), and then loops forever:
+ every pass runs main_loop() with a fresh temporary talloc context and
+ then waits out whatever is left of ctdb->tunable.recover_interval.
+
+ There is no exit from the for(;;) loop; the only way out visible here
+ is exit(-1) when the temporary context cannot be created.
+ */
+static void monitor_cluster(struct ctdb_context *ctdb)
+{
+ struct ctdb_recoverd *rec;
+
+ DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
+
+ rec = talloc_zero(ctdb, struct ctdb_recoverd);
+ CTDB_NO_MEMORY_FATAL(ctdb, rec); /* NOTE(review): presumably aborts on allocation failure - confirm against the macro */
- goto again;
+ rec->ctdb = ctdb;
+
+ rec->priority_time = timeval_current();
+
+ /* register a message port for sending memory dumps */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
+
+ /* register a message port for recovery elections */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
+
+ /* register a message port for when nodes are disabled/enabled */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
+ /* register a message port for when we are asked to push out a flag change */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
+
+ /* register a message port for vacuum fetch */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
+
+ /* register a message port for reloadnodes */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
+
+ /* register a message port for performing a takeover run */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
+
+ /* register a message port for disabling the ip check for a short while */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
+
+ /* register a message port for updating the recovery daemon's node assignment for an ip */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
+
+ for (;;) {
+ TALLOC_CTX *mem_ctx = talloc_new(ctdb); /* scratch context for this pass only; freed right after main_loop() */
+ struct timeval start;
+ double elapsed;
+
+ if (!mem_ctx) {
+ DEBUG(DEBUG_CRIT,(__location__
+ " Failed to create temp context\n"));
+ exit(-1);
+ }
+
+ start = timeval_current();
+ main_loop(ctdb, rec, mem_ctx);
+ talloc_free(mem_ctx);
+
+ /* we only check for recovery once every
+    ctdb->tunable.recover_interval (presumably seconds - confirm);
+    if this pass finished sooner, wait for the rest of the interval */
+ elapsed = timeval_elapsed(&start);
+ if (elapsed < ctdb->tunable.recover_interval) {
+ ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
+ - elapsed);
+ }
+ }
}
/*
if (ctdb->methods != NULL) {
ctdb->methods->shutdown(ctdb);
}
- ctdb_event_script(ctdb, "shutdown");
+ ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
exit(10);
}
{
int fd[2];
struct signal_event *se;
+ struct tevent_fd *fde;
if (pipe(fd) != 0) {
return -1;
srandom(getpid() ^ time(NULL));
- if (switch_from_server_to_client(ctdb) != 0) {
+ if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
exit(1);
}
- DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
+ DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
- event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
+ fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
ctdb_recoverd_parent, &fd[0]);
+ tevent_fd_set_auto_close(fde);
/* set up a handler to pick up sigchld */
se = event_add_signal(ctdb->ev, ctdb,