4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
36 #include "ctdb_private.h"
37 #include "ctdb_client.h"
38 #include "ctdb_logging.h"
40 #include "common/system.h"
41 #include "common/cmdline.h"
42 #include "common/common.h"
45 /* List of SRVID requests that need to be processed */
47 struct srvid_list *next, *prev;
48 struct srvid_request *request;
51 struct srvid_requests {
52 struct srvid_list *requests;
/* Send a result message back to the node/SRVID pair that issued a
 * request.  NOTE(review): the third parameter (the TDB_DATA result,
 * used below) is on a line missing from this excerpt. */
55 static void srvid_request_reply(struct ctdb_context *ctdb,
56 struct srvid_request *request,
59 /* Someone that sent srvid==0 does not want a reply */
60 if (request->srvid == 0) {
/* Delivery is best-effort: success and failure are only logged,
 * there is no retry. */
65 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
67 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
68 (unsigned)request->pnn,
69 (unsigned long long)request->srvid));
71 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
/* Reply to every queued request with the same result, then free the
 * whole queue.  TALLOC_FREE also resets *requests to NULL, so the
 * caller's pointer is safe to reuse. */
79 static void srvid_requests_reply(struct ctdb_context *ctdb,
80 struct srvid_requests **requests,
85 for (r = (*requests)->requests; r != NULL; r = r->next) {
86 srvid_request_reply(ctdb, r->request, result);
89 /* Free the list structure... */
90 TALLOC_FREE(*requests);
/* Queue a request for a deferred reply.  The container is allocated
 * lazily on first use; on any allocation failure an immediate failure
 * reply is sent instead of queueing (the "Out of memory" path below). */
93 static void srvid_request_add(struct ctdb_context *ctdb,
94 struct srvid_requests **requests,
95 struct srvid_request *request)
101 if (*requests == NULL) {
102 *requests = talloc_zero(ctdb, struct srvid_requests);
103 if (*requests == NULL) {
/* New list entry is a talloc child of the container so both are
 * released together by srvid_requests_reply(). */
108 t = talloc_zero(*requests, struct srvid_list);
110 /* If *requests was just allocated above then free it */
111 if ((*requests)->requests == NULL) {
112 TALLOC_FREE(*requests);
/* Take ownership of the request so it lives as long as the queue. */
117 t->request = (struct srvid_request *)talloc_steal(t, request);
118 DLIST_ADD((*requests)->requests, t);
123 /* Failed to add the request to the list. Send a fail. */
124 DEBUG(DEBUG_ERR, (__location__
125 " Out of memory, failed to queue SRVID request\n"));
127 result.dsize = sizeof(ret);
128 result.dptr = (uint8_t *)&ret;
129 srvid_request_reply(ctdb, request, result);
132 /* An abstraction to allow an operation (takeover runs, recoveries,
133 * ...) to be disabled for a given timeout */
/* "Disabled" is represented purely by a pending timer: timer != NULL
 * means disabled, and freeing the timer re-enables the operation.
 * NOTE(review): further struct members (e.g. in_progress, name - both
 * used below) are on lines missing from this excerpt. */
134 struct ctdb_op_state {
135 struct tevent_timer *timer;
/* Allocate a zeroed op-state under mem_ctx.  The name is used only in
 * log messages.  NOTE(review): the NULL-check/name-assignment lines are
 * missing from this excerpt. */
140 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
142 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
145 state->in_progress = false;
/* An operation is disabled exactly while its re-enable timer exists. */
152 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
154 return state->timer != NULL;
/* Mark the operation as started; refused (with a log message) while
 * the operation is disabled. */
157 static bool ctdb_op_begin(struct ctdb_op_state *state)
159 if (ctdb_op_is_disabled(state)) {
161 ("Unable to begin - %s are disabled\n", state->name));
165 state->in_progress = true;
/* Mark the operation as finished.
 * NOTE(review): this returns the value of the assignment, i.e. always
 * false - callers appear to ignore the return value, but confirm
 * before relying on it. */
169 static bool ctdb_op_end(struct ctdb_op_state *state)
171 return state->in_progress = false;
174 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
176 return state->in_progress;
/* Re-enable the operation by cancelling the disable timer. */
179 static void ctdb_op_enable(struct ctdb_op_state *state)
181 TALLOC_FREE(state->timer);
/* tevent callback fired when the disable timeout expires. */
184 static void ctdb_op_timeout_handler(struct tevent_context *ev,
185 struct tevent_timer *te,
186 struct timeval yt, void *p)
188 struct ctdb_op_state *state =
189 talloc_get_type(p, struct ctdb_op_state);
191 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
192 ctdb_op_enable(state);
/* Disable the operation for "timeout" seconds (a timeout parameter is
 * on a line missing from this excerpt).  A timeout of 0 presumably
 * re-enables immediately - see the early ctdb_op_enable() call.
 * Refused while the operation is in progress. */
195 static int ctdb_op_disable(struct ctdb_op_state *state,
196 struct tevent_context *ev,
200 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
201 ctdb_op_enable(state);
205 if (state->in_progress) {
207 ("Unable to disable %s - in progress\n", state->name));
211 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
212 state->name, timeout));
214 /* Clear any old timers */
215 talloc_free(state->timer);
217 /* Arrange for the timeout to occur */
/* Timer is parented to state so it dies with the op-state. */
218 state->timer = tevent_add_timer(ev, state,
219 timeval_current_ofs(timeout, 0),
220 ctdb_op_timeout_handler, state);
221 if (state->timer == NULL) {
222 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
/* Per-node misbehaviour accounting used to decide on banning.
 * NOTE(review): the count member (used by ctdb_set_culprit_count) is
 * on a line missing from this excerpt. */
229 struct ctdb_banning_state {
231 struct timeval last_reported_time;
235 private state of recovery daemon
237 struct ctdb_recoverd {
238 struct ctdb_context *ctdb;
240 uint32_t last_culprit_node;
241 struct ctdb_node_map_old *nodemap;
242 struct timeval priority_time;
243 bool need_takeover_run;
246 struct tevent_timer *send_election_te;
247 struct tevent_timer *election_timeout;
248 struct srvid_requests *reallocate_requests;
249 struct ctdb_op_state *takeover_run;
250 struct ctdb_op_state *recovery;
251 struct ctdb_control_get_ifaces *ifaces;
252 uint32_t *force_rebalance_nodes;
253 struct ctdb_node_capabilities *caps;
/* Both macros expand against a "ctdb" variable in the calling scope
 * and derive their durations from runtime tunables. */
256 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
257 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration; defined later in the file (not in this excerpt). */
259 static void ctdb_restart_recd(struct tevent_context *ev,
260 struct tevent_timer *te, struct timeval t,
264 ban a node for a period of time
/* Ask the daemon to ban "pnn" for ban_time seconds via the SET_BAN
 * control; failure is logged but not propagated (void return). */
266 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
269 struct ctdb_context *ctdb = rec->ctdb;
270 struct ctdb_ban_time bantime;
272 if (!ctdb_validate_pnn(ctdb, pnn)) {
273 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
277 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
280 bantime.time = ban_time;
282 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
284 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Result codes for the monitoring loop elsewhere in this file. */
290 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
294 remember the trouble maker
/* Charge "count" misbehaviour credits to node "culprit".  Old
 * transgressions are forgiven after recovery_grace_period seconds of
 * good behaviour.  Inactive local nodes never blame others. */
296 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
298 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
299 struct ctdb_banning_state *ban_state;
301 if (culprit > ctdb->num_nodes) {
302 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
306 /* If we are banned or stopped, do not set other nodes as culprits */
307 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
308 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
/* Lazily allocate per-node ban state, owned by the node object. */
312 if (ctdb->nodes[culprit]->ban_state == NULL) {
313 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
314 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
318 ban_state = ctdb->nodes[culprit]->ban_state;
319 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
320 /* this was the first time in a long while this node
321 misbehaved so we will forgive any old transgressions.
323 ban_state->count = 0;
326 ban_state->count += count;
327 ban_state->last_reported_time = timeval_current();
328 rec->last_culprit_node = culprit;
332 remember the trouble maker
/* Convenience wrapper: charge a single credit. */
334 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
336 ctdb_set_culprit_count(rec, culprit, 1);
340 /* this callback is called for every node that failed to execute the
/* ("recovered" event).  Each failing node is blamed with one credit. */
343 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
345 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
347 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
349 ctdb_set_culprit(rec, node_pnn);
353 run the "recovered" eventscript on all nodes
/* Broadcast CTDB_CONTROL_END_RECOVERY to all active nodes; "caller" is
 * only used for the error log message. */
355 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, const char *caller)
359 struct ctdb_context *ctdb = rec->ctdb;
361 tmp_ctx = talloc_new(ctdb);
362 CTDB_NO_MEMORY(ctdb, tmp_ctx);
364 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
365 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
367 CONTROL_TIMEOUT(), false, tdb_null,
368 NULL, recovered_fail_callback,
370 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
372 talloc_free(tmp_ctx);
376 talloc_free(tmp_ctx);
380 /* this callback is called for every node that failed to execute the
/* ("startrecovery" event).  Each failing node is blamed with one credit. */
383 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
385 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
387 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
389 ctdb_set_culprit(rec, node_pnn);
393 run the "startrecovery" eventscript on all nodes
/* Broadcast CTDB_CONTROL_START_RECOVERY to all active nodes; failure
 * aborts the recovery with an error return. */
395 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
399 struct ctdb_context *ctdb = rec->ctdb;
401 tmp_ctx = talloc_new(ctdb);
402 CTDB_NO_MEMORY(ctdb, tmp_ctx);
404 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
405 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
407 CONTROL_TIMEOUT(), false, tdb_null,
409 startrecovery_fail_callback,
411 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
412 talloc_free(tmp_ctx);
416 talloc_free(tmp_ctx);
421 update the node capabilities for all connected nodes
/* Fetch fresh capabilities for every node in the nodemap, refresh the
 * local daemon's own capability word, and cache the full set on rec
 * (replacing any previous cache). */
423 static int update_capabilities(struct ctdb_recoverd *rec,
424 struct ctdb_node_map_old *nodemap,
428 struct ctdb_node_capabilities *caps;
429 struct ctdb_context *ctdb = rec->ctdb;
431 tmp_ctx = talloc_new(rec);
432 CTDB_NO_MEMORY(ctdb, tmp_ctx);
434 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
435 CONTROL_TIMEOUT(), nodemap);
439 (__location__ " Failed to get node capabilities\n"));
440 talloc_free(tmp_ctx);
/* Look up this node's entry; it must be present in the result. */
444 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
448 " Capabilities don't include current node.\n"));
449 talloc_free(tmp_ctx);
452 ctdb->capabilities = *capp;
/* Replace the cached set; talloc_steal moves ownership to rec so the
 * cache survives the tmp_ctx free below. */
454 TALLOC_FREE(rec->caps);
455 rec->caps = talloc_steal(rec, caps);
457 talloc_free(tmp_ctx);
/* Blame a node that failed to freeze during recovery; the charge is
 * one full nodemap's worth of credits, enough to escalate quickly. */
461 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
463 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
465 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
466 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Same heavy charge for a node that failed to start the recovery
 * transaction. */
469 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
471 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
473 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
474 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
478 change recovery mode on all nodes
/* Push rec_mode to all active nodes via SET_RECMODE; when entering
 * active recovery with freeze==true, additionally freeze all database
 * priority bands on every node. */
480 static int set_recovery_mode(struct ctdb_context *ctdb,
481 struct ctdb_recoverd *rec,
482 struct ctdb_node_map_old *nodemap,
483 uint32_t rec_mode, bool freeze)
489 tmp_ctx = talloc_new(ctdb);
490 CTDB_NO_MEMORY(ctdb, tmp_ctx);
492 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
/* rec_mode is marshalled in-place; safe because the async control
 * call completes before this function returns. */
494 data.dsize = sizeof(uint32_t);
495 data.dptr = (unsigned char *)&rec_mode;
497 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
503 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
504 talloc_free(tmp_ctx);
508 /* freeze all nodes */
509 if (freeze && rec_mode == CTDB_RECOVERY_ACTIVE) {
/* Freeze each DB priority band separately (bands are 1-based). */
512 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
513 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
518 set_recmode_fail_callback,
520 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
521 talloc_free(tmp_ctx);
527 talloc_free(tmp_ctx);
532 change recovery master on all nodes
/* Broadcast SET_RECMASTER so every active node records "pnn" as the
 * current recovery master. */
534 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn)
540 tmp_ctx = talloc_new(ctdb);
541 CTDB_NO_MEMORY(ctdb, tmp_ctx);
543 data.dsize = sizeof(uint32_t);
544 data.dptr = (unsigned char *)&pnn;
546 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
547 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
549 CONTROL_TIMEOUT(), false, data,
552 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
553 talloc_free(tmp_ctx);
557 talloc_free(tmp_ctx);
561 /* update all remote nodes to use the same db priority that we have
562 this can fail if the remote node has not yet been upgraded to
563 support this function, so we always return success and never fail
564 a recovery if this call fails.
566 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
567 struct ctdb_node_map_old *nodemap,
568 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
572 /* step through all local databases */
573 for (db=0; db<dbmap->num;db++) {
574 struct ctdb_db_priority db_prio;
/* Read the local priority for this DB, then push it out. */
577 db_prio.db_id = dbmap->dbs[db].db_id;
578 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].db_id, &db_prio.priority);
580 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].db_id));
584 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].db_id, db_prio.priority));
/* Failures here are logged only - see the header comment above. */
586 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
587 CTDB_CURRENT_NODE, &db_prio);
589 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
598 ensure all other nodes have attached to any databases that we have
/* For every other active node, compare its dbmap with ours and create
 * (attach) any database it is missing, preserving the PERSISTENT flag. */
600 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
601 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
604 struct ctdb_dbid_map_old *remote_dbmap;
606 /* verify that all other nodes have all our databases */
607 for (j=0; j<nodemap->num; j++) {
608 /* we don't need to check ourselves */
609 if (nodemap->nodes[j].pnn == pnn) {
612 /* dont check nodes that are unavailable */
613 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
617 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
618 mem_ctx, &remote_dbmap);
620 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
624 /* step through all local databases */
625 for (db=0; db<dbmap->num;db++) {
/* Linear scan of the remote dbmap for our db_id. */
629 for (i=0;i<remote_dbmap->num;i++) {
630 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
634 /* the remote node already have this database */
635 if (i!=remote_dbmap->num) {
638 /* ok so we need to create this database */
/* Get the database name from the local node first... */
639 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
640 dbmap->dbs[db].db_id, mem_ctx,
643 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
/* ...then ask the remote node to attach to it. */
646 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
647 nodemap->nodes[j].pnn,
649 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
651 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
662 ensure we are attached to any databases that anyone else is attached to
/* Mirror image of create_missing_remote_databases(): attach locally to
 * every database any other active node has, then re-read our own dbmap
 * (through the in/out **dbmap parameter) so it reflects the additions. */
664 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
665 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
668 struct ctdb_dbid_map_old *remote_dbmap;
670 /* verify that we have all database any other node has */
671 for (j=0; j<nodemap->num; j++) {
672 /* we don't need to check ourselves */
673 if (nodemap->nodes[j].pnn == pnn) {
676 /* dont check nodes that are unavailable */
677 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
681 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
682 mem_ctx, &remote_dbmap);
684 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
688 /* step through all databases on the remote node */
689 for (db=0; db<remote_dbmap->num;db++) {
/* Linear scan of our dbmap for the remote db_id. */
692 for (i=0;i<(*dbmap)->num;i++) {
693 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
697 /* we already have this db locally */
698 if (i!=(*dbmap)->num) {
701 /* ok so we need to create this database and
704 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
705 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
707 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
708 nodemap->nodes[j].pnn));
/* Attach locally with the same persistence flag as the remote copy. */
711 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
712 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
714 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* Refresh our dbmap so later iterations see the new database. */
717 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
719 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
730 pull the remote database contents from one node into the recdb
/* Pull all records of database "dbid" from srcnode and merge them into
 * the temporary recovery tdb.  Records are merged by RSN: an incoming
 * record only replaces an existing one when it is strictly newer (or
 * equal-RSN but we are not the dmaster of the existing copy). */
732 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
733 struct tdb_wrap *recdb, uint32_t dbid)
737 struct ctdb_marshall_buffer *reply;
738 struct ctdb_rec_data_old *recdata;
740 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
742 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
743 CONTROL_TIMEOUT(), &outdata);
745 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
746 talloc_free(tmp_ctx);
750 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* Sanity-check the marshalled buffer before touching its payload. */
752 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
753 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
754 talloc_free(tmp_ctx);
/* Walk the packed record stream; each record is advanced by its own
 * self-describing length field. */
758 recdata = (struct ctdb_rec_data_old *)&reply->data[0];
762 recdata = (struct ctdb_rec_data_old *)(recdata->length + (uint8_t *)recdata), i++) {
764 struct ctdb_ltdb_header *hdr;
/* Key bytes are followed immediately by the data bytes. */
767 key.dptr = &recdata->data[0];
768 key.dsize = recdata->keylen;
769 data.dptr = &recdata->data[key.dsize];
770 data.dsize = recdata->datalen;
/* Every record's data starts with an ltdb header carrying the RSN. */
772 hdr = (struct ctdb_ltdb_header *)data.dptr;
774 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
775 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
776 talloc_free(tmp_ctx);
780 /* fetch the existing record, if any */
781 existing = tdb_fetch(recdb->tdb, key);
783 if (existing.dptr != NULL) {
784 struct ctdb_ltdb_header header;
785 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
786 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
787 (unsigned)existing.dsize, srcnode));
789 talloc_free(tmp_ctx);
792 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* Keep the existing copy unless the incoming record wins the RSN
 * comparison (strictly higher RSN, or equal RSN when the existing
 * copy's dmaster is not this node). */
794 if (!(header.rsn < hdr->rsn ||
795 (header.dmaster != ctdb_get_pnn(ctdb) &&
796 header.rsn == hdr->rsn))) {
801 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
802 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
803 talloc_free(tmp_ctx);
808 talloc_free(tmp_ctx);
/* Accumulator shared by the GET_DB_SEQNUM callbacks below: tracks the
 * highest sequence number seen and which node (pnn) reported it.
 * NOTE(review): the failed/pnn/seqnum members are on lines missing
 * from this excerpt. */
814 struct pull_seqnum_cbdata {
/* Per-node success callback: record the node with the highest seqnum
 * (or any first node when seqnums are all zero - see the pnn==-1 case). */
820 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
822 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
/* Once one node has failed the whole scan is abandoned; late replies
 * are ignored. */
825 if (cb_data->failed != 0) {
826 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
831 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
836 if (outdata.dsize != sizeof(uint64_t)) {
837 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
838 cb_data->failed = -1;
842 seqnum = *((uint64_t *)outdata.dptr);
844 if (seqnum > cb_data->seqnum ||
845 (cb_data->pnn == -1 && seqnum == 0)) {
846 cb_data->seqnum = seqnum;
847 cb_data->pnn = node_pnn;
/* Per-node failure callback: mark the whole scan as failed. */
851 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
853 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
855 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
/* For a persistent database: query every active node's sequence number
 * via GET_DB_SEQNUM and pull the whole database from the single node
 * holding the highest one, instead of a record-by-record RSN merge. */
859 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
860 struct ctdb_recoverd *rec,
861 struct ctdb_node_map_old *nodemap,
862 struct tdb_wrap *recdb, uint32_t dbid)
864 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
868 struct pull_seqnum_cbdata *cb_data;
870 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
/* Control payload is marshalled into a local buffer (outdata);
 * presumably it carries the dbid - the packing lines are missing
 * from this excerpt. */
875 data.dsize = sizeof(outdata);
876 data.dptr = (uint8_t *)&outdata[0];
878 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
879 if (cb_data == NULL) {
880 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
881 talloc_free(tmp_ctx);
889 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
890 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
892 CONTROL_TIMEOUT(), false, data,
896 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
898 talloc_free(tmp_ctx);
902 if (cb_data->failed != 0) {
903 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
904 talloc_free(tmp_ctx);
/* pnn == -1 means no node ever won the comparison in pull_seqnum_cb. */
908 if (cb_data->pnn == -1) {
909 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
910 talloc_free(tmp_ctx);
914 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
916 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
917 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
918 talloc_free(tmp_ctx);
922 talloc_free(tmp_ctx);
928 pull all the remote database contents into the recdb
/* Top-level pull: for persistent DBs (when the recover_pdb_by_seqnum
 * tunable is set) try the whole-db-by-seqnum strategy first; otherwise
 * merge records from every active node by RSN.  A node that fails to
 * deliver its copy is blamed with a full nodemap's worth of credits. */
930 static int pull_remote_database(struct ctdb_context *ctdb,
931 struct ctdb_recoverd *rec,
932 struct ctdb_node_map_old *nodemap,
933 struct tdb_wrap *recdb, uint32_t dbid,
938 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
940 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
946 /* pull all records from all other nodes across onto this node
947 (this merges based on rsn)
949 for (j=0; j<nodemap->num; j++) {
950 /* dont merge from nodes that are unavailable */
951 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
954 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
955 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
956 nodemap->nodes[j].pnn));
957 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
967 update flags on all active nodes
/* Broadcast a node's flags cluster-wide via the MODFLAGS control. */
969 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
973 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
975 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
983 ensure all nodes have the same vnnmap we do
/* Push our vnnmap to every active node, one SETVNNMAP control each. */
985 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
986 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
990 /* push the new vnn map out to all the nodes */
991 for (j=0; j<nodemap->num; j++) {
992 /* dont push to nodes that are unavailable */
993 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
997 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
999 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1009 called when a vacuum fetch has completed - just free it and do the next one
1011 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
1018 * Process one elements of the vacuum fetch list:
1019 * Migrate it over to us with the special flag
1020 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
/* Returns whether processing may continue (the failure return lines are
 * missing from this excerpt).  Deliberately non-blocking: a record whose
 * chain lock cannot be taken immediately is simply skipped. */
1022 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
1024 struct ctdb_rec_data_old *r)
1026 struct ctdb_client_call_state *state;
1028 struct ctdb_ltdb_header *hdr;
1029 struct ctdb_call call;
/* Null call: we only want the migration side effect, not a real call. */
1032 call.call_id = CTDB_NULL_FUNC;
1033 call.flags = CTDB_IMMEDIATE_MIGRATION;
1034 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
1036 call.key.dptr = &r->data[0];
1037 call.key.dsize = r->keylen;
1039 /* ensure we don't block this daemon - just skip a record if we can't get
1041 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
1045 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
1046 if (data.dptr == NULL) {
1047 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1051 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1053 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1057 hdr = (struct ctdb_ltdb_header *)data.dptr;
1058 if (hdr->dmaster == pnn) {
1059 /* its already local */
1061 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
/* Fire-and-forget migration; completion just frees the state via
 * vacuum_fetch_callback above. */
1067 state = ctdb_call_send(ctdb_db, &call);
1068 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1069 if (state == NULL) {
1070 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
1073 state->async.fn = vacuum_fetch_callback;
1074 state->async.private_data = NULL;
1081 handler for vacuum fetch
/* SRVID message handler: receives a marshalled buffer of records to
 * vacuum-migrate, resolves the database (and whether it is persistent),
 * attaches to it, and processes each record in turn. */
1083 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
1086 struct ctdb_recoverd *rec = talloc_get_type(
1087 private_data, struct ctdb_recoverd);
1088 struct ctdb_context *ctdb = rec->ctdb;
1089 struct ctdb_marshall_buffer *recs;
1091 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1093 struct ctdb_dbid_map_old *dbmap=NULL;
1094 bool persistent = false;
1095 struct ctdb_db_context *ctdb_db;
1096 struct ctdb_rec_data_old *r;
1098 recs = (struct ctdb_marshall_buffer *)data.dptr;
1100 if (recs->count == 0) {
1104 /* work out if the database is persistent */
1105 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1107 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1111 for (i=0;i<dbmap->num;i++) {
1112 if (dbmap->dbs[i].db_id == recs->db_id) {
1113 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
/* i == dbmap->num means the loop above found no match. */
1117 if (i == dbmap->num) {
1118 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1122 /* find the name of this database */
1123 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1124 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1129 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1130 if (ctdb_db == NULL) {
1131 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
/* Walk the packed record stream, one self-sized record at a time. */
1135 r = (struct ctdb_rec_data_old *)&recs->data[0];
1136 while (recs->count) {
1139 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
1144 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
1149 talloc_free(tmp_ctx);
1154 * handler for database detach
/* SRVID message handler: detach the recovery daemon's client-side
 * database context for the db_id carried in the message.  Silently
 * ignores databases we are not attached to. */
1156 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
1159 struct ctdb_recoverd *rec = talloc_get_type(
1160 private_data, struct ctdb_recoverd);
1161 struct ctdb_context *ctdb = rec->ctdb;
1163 struct ctdb_db_context *ctdb_db;
/* Message payload must be exactly one uint32_t db_id. */
1165 if (data.dsize != sizeof(db_id)) {
1168 db_id = *(uint32_t *)data.dptr;
1170 ctdb_db = find_ctdb_db(ctdb, db_id);
1171 if (ctdb_db == NULL) {
1172 /* database is not attached */
1176 DLIST_REMOVE(ctdb->db_list, ctdb_db);
1178 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1180 talloc_free(ctdb_db);
1184 called when ctdb_wait_timeout should finish
/* Timer callback: sets the caller's flag so the wait loop below exits. */
1186 static void ctdb_wait_handler(struct tevent_context *ev,
1187 struct tevent_timer *te,
1188 struct timeval yt, void *p)
1190 uint32_t *timed_out = (uint32_t *)p;
1195 wait for a given number of seconds
/* Blocking wait implemented by spinning the tevent loop until the
 * one-shot timer above fires; other events keep being serviced. */
1197 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1199 uint32_t timed_out = 0;
/* Split the fractional part of secs into microseconds. */
1200 time_t usecs = (secs - (time_t)secs) * 1000000;
1201 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
1202 ctdb_wait_handler, &timed_out);
1203 while (!timed_out) {
1204 tevent_loop_once(ctdb->ev);
1209 called when an election times out (ends)
/* Timer callback: clearing election_timeout is what ends the wait loop
 * in ctdb_wait_election() below. */
1211 static void ctdb_election_timeout(struct tevent_context *ev,
1212 struct tevent_timer *te,
1213 struct timeval t, void *p)
1215 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1216 rec->election_timeout = NULL;
1219 DEBUG(DEBUG_WARNING,("Election period ended\n"));
1224 wait for an election to finish. It finished election_timeout seconds after
1225 the last election packet is received
/* Spin the event loop until the election timeout timer has fired
 * (i.e. rec->election_timeout has been reset to NULL). */
1227 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1229 struct ctdb_context *ctdb = rec->ctdb;
1230 while (rec->election_timeout) {
1231 tevent_loop_once(ctdb->ev);
1236 Update our local flags from all remote connected nodes.
1237 This is only run when we are or we believe we are the recovery master
/* Compare every connected remote node's view of the flags with ours;
 * on mismatch, push the remote node's own view of itself cluster-wide
 * and adopt it locally.  Returns a monitor_result code (the function's
 * int return carries MONITOR_* values - see the MONITOR_FAILED path). */
1239 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
1242 struct ctdb_context *ctdb = rec->ctdb;
1243 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1245 /* get the nodemap for all active remote nodes and verify
1246 they are the same as for this node
1248 for (j=0; j<nodemap->num; j++) {
1249 struct ctdb_node_map_old *remote_nodemap=NULL;
/* Skip disconnected nodes and ourselves. */
1252 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1255 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1259 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1260 mem_ctx, &remote_nodemap);
1262 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1263 nodemap->nodes[j].pnn));
1264 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1265 talloc_free(mem_ctx);
1266 return MONITOR_FAILED;
/* Compare the remote node's flags for ITSELF (index j in both maps,
 * since both maps are presumed to use the same node ordering). */
1268 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1269 /* We should tell our daemon about this so it
1270 updates its flags or else we will log the same
1271 message again in the next iteration of recovery.
1272 Since we are the recovery master we can just as
1273 well update the flags on all nodes.
1275 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1277 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1281 /* Update our local copy of the flags in the recovery
1284 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1285 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1286 nodemap->nodes[j].flags));
1287 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1289 talloc_free(remote_nodemap);
1291 talloc_free(mem_ctx);
1296 /* Create a new random generation id.
1297 The generation id can not be the INVALID_GENERATION id
/* Loops (loop lines missing from this excerpt) drawing random()
 * values until one differs from INVALID_GENERATION. */
1299 static uint32_t new_generation(void)
1301 uint32_t generation;
1304 generation = random();
1306 if (generation != INVALID_GENERATION) {
1316 create a temporary working database
/* Open a scratch tdb (recdb.tdb.<pnn>, presumably - the asprintf
 * argument line is missing from this excerpt) used to assemble the
 * merged database contents during recovery.  O_EXCL ensures a stale
 * file from a previous run is not silently reused. */
1318 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1321 struct tdb_wrap *recdb;
1324 /* open up the temporary recovery database */
1325 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1326 ctdb->db_directory_state,
/* TDB_NOLOCK: single-process scratch DB, no locking needed.
 * TDB_NOMMAP under valgrind makes accesses visible to the checker. */
1333 tdb_flags = TDB_NOLOCK;
1334 if (ctdb->valgrinding) {
1335 tdb_flags |= TDB_NOMMAP;
1337 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1339 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1340 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1341 if (recdb == NULL) {
1342 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1352 a traverse function for pulling all relevant records from recdb
/* Parameter/accumulator struct for traverse_recdb() below: collects
 * records into a marshall buffer grown in allocated_len steps.
 * NOTE(review): the struct's opening line and several members (len,
 * failed, persistent - all used by traverse_recdb) are on lines
 * missing from this excerpt. */
1355 struct ctdb_context *ctdb;
1356 struct ctdb_marshall_buffer *recdata;
1358 uint32_t allocated_len;
/*
 * tdb_traverse_read() callback: append one record from the scratch
 * recovery db to the marshall buffer being built for PUSH_DB.
 * For non-persistent dbs it skips empty (header-only) records and
 * stamps this node as the new dmaster.  Errors are reported through
 * params->failed (return statements are elided in this view).
 */
1363 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1365 struct recdb_data *params = (struct recdb_data *)p;
1366 struct ctdb_rec_data_old *recdata;
1367 struct ctdb_ltdb_header *hdr;
1370 * skip empty records - but NOT for persistent databases:
1372 * The record-by-record mode of recovery deletes empty records.
1373 * For persistent databases, this can lead to data corruption
1374 * by deleting records that should be there:
1376 * - Assume the cluster has been running for a while.
1378 * - A record R in a persistent database has been created and
1379 * deleted a couple of times, the last operation being deletion,
1380 * leaving an empty record with a high RSN, say 10.
1382 * - Now a node N is turned off.
1384 * - This leaves the local database copy of D on N with the empty
1385 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1386 * the copy of record R.
1388 * - Now the record is created again while node N is turned off.
1389 * This creates R with RSN = 1 on all nodes except for N.
1391 * - Now node N is turned on again. The following recovery will chose
1392 * the older empty copy of R due to RSN 10 > RSN 1.
1394 * ==> Hence the record is gone after the recovery.
1396 * On databases like Samba's registry, this can damage the higher-level
1397 * data structures built from the various tdb-level records.
1399 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1403 /* update the dmaster field to point to us */
1404 hdr = (struct ctdb_ltdb_header *)data.dptr;
1405 if (!params->persistent) {
1406 hdr->dmaster = params->ctdb->pnn;
1407 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1410 /* add the record to the blob ready to send to the nodes */
1411 recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1412 if (recdata == NULL) {
1413 params->failed = true;
/* Grow the marshall buffer in pulldb_preallocation_size chunks to
 * avoid a realloc per record. */
1416 if (params->len + recdata->length >= params->allocated_len) {
1417 params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1418 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1420 if (params->recdata == NULL) {
1421 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1422 recdata->length + params->len));
1423 params->failed = true;
1426 params->recdata->count++;
/* Append the marshalled record at the current write offset. */
1427 memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
1428 params->len += recdata->length;
1429 talloc_free(recdata);
/*
 * Marshal every record in the scratch recovery db and push the
 * resulting blob to all active nodes via CTDB_CONTROL_PUSH_DB.
 * Returns 0 on success (error-path returns are elided in this view).
 */
1435 push the recdb database out to all nodes
1437 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1439 struct tdb_wrap *recdb, struct ctdb_node_map_old *nodemap)
1441 struct recdb_data params;
1442 struct ctdb_marshall_buffer *recdata;
1444 TALLOC_CTX *tmp_ctx;
1447 tmp_ctx = talloc_new(ctdb);
1448 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1450 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1451 CTDB_NO_MEMORY(ctdb, recdata);
1453 recdata->db_id = dbid;
1456 params.recdata = recdata;
/* Start the running length at the marshall header size. */
1457 params.len = offsetof(struct ctdb_marshall_buffer, data);
1458 params.allocated_len = params.len;
1459 params.failed = false;
1460 params.persistent = persistent;
/* NOTE(review): "¶ms" below is mojibake for "&params" — restore
 * the literal ampersand when editing the real source. */
1462 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1463 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1464 talloc_free(params.recdata);
1465 talloc_free(tmp_ctx);
1469 if (params.failed) {
1470 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1471 talloc_free(params.recdata);
1472 talloc_free(tmp_ctx);
/* The traverse may have realloc'ed the buffer; pick up the new pointer. */
1476 recdata = params.recdata;
1478 outdata.dptr = (void *)recdata;
1479 outdata.dsize = params.len;
1481 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1482 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1484 CONTROL_TIMEOUT(), false, outdata,
1487 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1488 talloc_free(recdata);
1489 talloc_free(tmp_ctx);
1493 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1494 dbid, recdata->count));
1496 talloc_free(recdata);
1497 talloc_free(tmp_ctx);
/*
 * Run a full recovery for one database: pull every node's copy into a
 * scratch db, wipe the database cluster-wide (safe inside the recovery
 * transaction identified by transaction_id), then push the merged
 * contents back to all active nodes.
 */
1504 go through a full recovery on one database
1506 static int recover_database(struct ctdb_recoverd *rec,
1507 TALLOC_CTX *mem_ctx,
1511 struct ctdb_node_map_old *nodemap,
1512 uint32_t transaction_id)
1514 struct tdb_wrap *recdb;
1516 struct ctdb_context *ctdb = rec->ctdb;
1518 struct ctdb_control_transdb w;
1521 recdb = create_recdb(ctdb, mem_ctx);
1522 if (recdb == NULL) {
1526 /* pull all remote databases onto the recdb */
1527 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1529 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1533 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1535 /* wipe all the remote databases. This is safe as we are in a transaction */
1537 w.transaction_id = transaction_id;
1539 data.dptr = (void *)&w;
1540 data.dsize = sizeof(w);
/* recdb is used as the talloc parent so the node list dies with it. */
1542 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1543 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1545 CONTROL_TIMEOUT(), false, data,
1548 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1553 /* push out the correct database. This sets the dmaster and skips
1554 the empty records */
1555 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1561 /* all done with this database */
/*
 * Refresh the cached known/available public IP lists for every active
 * node.  On any failure *culprit is set to the node to blame so the
 * caller can assign banning credits.  Also flags a takeover run if a
 * node's IP allocation looks wrong.
 */
1567 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1568 struct ctdb_recoverd *rec,
1569 struct ctdb_node_map_old *nodemap,
/* Sanity check: the nodemap must describe exactly our node list. */
1575 if (ctdb->num_nodes != nodemap->num) {
1576 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1577 ctdb->num_nodes, nodemap->num));
1579 *culprit = ctdb->pnn;
1584 for (j=0; j<nodemap->num; j++) {
1585 /* For readability */
1586 struct ctdb_node *node = ctdb->nodes[j];
1588 /* release any existing data */
1589 if (node->known_public_ips) {
1590 talloc_free(node->known_public_ips);
1591 node->known_public_ips = NULL;
1593 if (node->available_public_ips) {
1594 talloc_free(node->available_public_ips);
1595 node->available_public_ips = NULL;
/* Inactive nodes are skipped entirely — their lists stay NULL. */
1598 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1602 /* Retrieve the list of known public IPs from the node */
1603 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1608 &node->known_public_ips);
1611 ("Failed to read known public IPs from node: %u\n",
1614 *culprit = node->pnn;
/* Only verify allocation when IP checks are enabled and takeover
 * runs are not administratively disabled. */
1619 if (ctdb->do_checkpublicip &&
1620 !ctdb_op_is_disabled(rec->takeover_run) &&
1621 verify_remote_ip_allocation(ctdb,
1622 node->known_public_ips,
1624 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1625 rec->need_takeover_run = true;
1628 /* Retrieve the list of available public IPs from the node */
1629 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1633 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1634 &node->available_public_ips);
1637 ("Failed to read available public IPs from node: %u\n",
1640 *culprit = node->pnn;
/*
 * Broadcast the local recovery-lock file path to every active node via
 * CTDB_CONTROL_SET_RECLOCK_FILE so the whole cluster serialises on the
 * same lock file.  No-op (lines elided) when no reclock is configured.
 */
1649 /* when we start a recovery, make sure all nodes use the same reclock file
1652 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1654 struct ctdb_context *ctdb = rec->ctdb;
1655 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1659 if (ctdb->recovery_lock_file == NULL) {
/* Include the terminating NUL in the payload. */
1663 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1664 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1667 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1668 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1674 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1675 talloc_free(tmp_ctx);
1679 talloc_free(tmp_ctx);
/*
 * Per-node failure callback for ctdb_takeover_run(): logs the failure
 * and, when callback_data carries the recoverd context, marks the node
 * as recovery culprit (earning it banning credits).
 */
1685 * this callback is called for every node that failed to execute ctdb_takeover_run()
1686 * and set flag to re-run takeover run.
1688 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1690 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn))
1692 if (callback_data != NULL) {
1693 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1695 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1697 ctdb_set_culprit(rec, node_pnn);
/*
 * Ban any node whose accumulated banning credits reach the threshold
 * (2 * num_nodes) for recovery_ban_period seconds, then reset its
 * counter.  *self_ban is presumably set when we ban ourselves (the
 * assignment is elided in this view) so the caller can abort recovery.
 */
1702 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1704 struct ctdb_context *ctdb = rec->ctdb;
1706 struct ctdb_banning_state *ban_state;
1709 for (i=0; i<ctdb->num_nodes; i++) {
1710 if (ctdb->nodes[i]->ban_state == NULL) {
1713 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* Below the credit threshold: leave the node alone. */
1714 if (ban_state->count < 2*ctdb->num_nodes) {
1718 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1719 ctdb->nodes[i]->pnn, ban_state->count,
1720 ctdb->tunable.recovery_ban_period));
1721 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1722 ban_state->count = 0;
1724 /* Banning ourself? */
1725 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/*
 * Perform a public-IP takeover run across the cluster.  Serialised via
 * rec->takeover_run; other nodes' own takeover triggers are suppressed
 * for the duration.  Returns true on success; on failure schedules a
 * retry through rec->need_takeover_run.
 */
1731 static bool do_takeover_run(struct ctdb_recoverd *rec,
1732 struct ctdb_node_map_old *nodemap,
1733 bool banning_credits_on_fail)
1735 uint32_t *nodes = NULL;
1736 struct srvid_request_data dtr;
1739 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1743 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
/* Refuse to nest takeover runs. */
1745 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1746 DEBUG(DEBUG_ERR, (__location__
1747 " takeover run already in progress \n"));
1752 if (!ctdb_op_begin(rec->takeover_run)) {
1757 /* Disable IP checks (takeover runs, really) on other nodes
1758 * while doing this takeover run. This will stop those other
1759 * nodes from triggering takeover runs when think they should
1760 * be hosting an IP but it isn't yet on an interface. Don't
1761 * wait for replies since a failure here might cause some
1762 * noise in the logs but will not actually cause a problem.
1764 dtr.srvid = 0; /* No reply */
1767 data.dptr = (uint8_t*)&dtr;
1768 data.dsize = sizeof(dtr);
1770 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1772 /* Disable for 60 seconds. This can be a tunable later if
1776 for (i = 0; i < talloc_array_length(nodes); i++) {
1777 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1778 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1780 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
/* Only attach the fail callback's culprit accounting when the
 * caller asked for banning credits on failure. */
1784 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1785 rec->force_rebalance_nodes,
1786 takeover_fail_callback,
1787 banning_credits_on_fail ? rec : NULL);
1789 /* Reenable takeover runs and IP checks on other nodes */
1791 for (i = 0; i < talloc_array_length(nodes); i++) {
1792 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1793 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1795 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1800 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1806 /* Takeover run was successful so clear force rebalance targets */
/* Only clear if no new rebalance request arrived mid-run. */
1807 if (rebalance_nodes == rec->force_rebalance_nodes) {
1808 TALLOC_FREE(rec->force_rebalance_nodes);
1810 DEBUG(DEBUG_WARNING,
1811 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1814 rec->need_takeover_run = !ok;
1816 ctdb_op_end(rec->takeover_run);
1818 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
/* State shared between db_recovery_parallel() and its pipe-read event
 * handler; members (fd[2], pid, result, done, ...) are elided here. */
1822 struct recovery_helper_state {
/*
 * tevent fd handler: read the recovery helper's exit status from the
 * pipe.  A short read means the helper died without reporting, which
 * is mapped to EPIPE.
 */
1829 static void ctdb_recovery_handler(struct tevent_context *ev,
1830 struct tevent_fd *fde,
1831 uint16_t flags, void *private_data)
1833 struct recovery_helper_state *state = talloc_get_type_abort(
1834 private_data, struct recovery_helper_state);
1837 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1838 if (ret != sizeof(state->result)) {
1839 state->result = EPIPE;
/*
 * Run the database recovery through the external ctdb_recovery_helper
 * binary.  Sets up a status pipe, forks the helper with logging, and
 * pumps the event loop until the helper reports.  Returns the helper's
 * result (error-path returns and some cleanup are elided in this view).
 */
1846 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
/* Resolve the helper path once; cached across calls. */
1848 static char prog[PATH_MAX+1] = "";
1850 struct recovery_helper_state *state;
1851 struct tevent_fd *fde;
1854 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1855 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1856 "ctdb_recovery_helper")) {
1857 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1860 state = talloc_zero(mem_ctx, struct recovery_helper_state);
1861 if (state == NULL) {
1862 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1868 ret = pipe(state->fd);
1871 ("Failed to create pipe for recovery helper\n"));
/* Keep the read end out of the helper's fd table. */
1875 set_close_on_exec(state->fd[0]);
1878 args = talloc_array(state, const char *, nargs);
1880 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* argv: write-end fd, daemon socket, new generation id. */
1884 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1885 args[1] = rec->ctdb->daemon.name;
1886 args[2] = talloc_asprintf(args, "%u", new_generation());
1889 if (args[0] == NULL || args[2] == NULL) {
1890 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1894 if (!ctdb_vfork_with_logging(state, rec->ctdb, "recovery", prog, nargs,
1895 args, NULL, NULL, &state->pid)) {
1897 ("Failed to create child for recovery helper\n"));
/* Parent no longer needs the write end. */
1901 close(state->fd[1]);
1904 state->done = false;
1906 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1907 TEVENT_FD_READ, ctdb_recovery_handler, state);
1911 tevent_fd_set_auto_close(fde);
/* Block here (still servicing events) until the helper reports. */
1913 while (!state->done) {
1914 tevent_loop_once(rec->ctdb->ev);
1917 close(state->fd[0]);
1920 if (state->result != 0) {
/* Helper failed: make sure it is not left running. */
1924 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/* Common error cleanup: close any open pipe ends, kill the child. */
1929 if (state->fd[0] != -1) {
1930 close(state->fd[0]);
1932 if (state->fd[1] != -1) {
1933 close(state->fd[1]);
1935 if (state->pid != -1) {
1936 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/*
 * Legacy in-process (serial) database recovery: freeze the cluster,
 * run the startrecovery event, recover every database inside a single
 * cluster-wide transaction, commit, rebuild and distribute the vnnmap,
 * reassert the recmaster and thaw.  Error-path returns are elided in
 * this view.
 */
1942 static int db_recovery_serial(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1943 uint32_t pnn, struct ctdb_node_map_old *nodemap,
1944 struct ctdb_vnn_map *vnnmap,
1945 struct ctdb_dbid_map_old *dbmap)
1947 struct ctdb_context *ctdb = rec->ctdb;
1948 uint32_t generation;
1953 /* set recovery mode to active on all nodes */
1954 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, true);
1956 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1960 /* execute the "startrecovery" event script on all nodes */
1961 ret = run_startrecovery_eventscript(rec, nodemap);
1963 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1967 /* pick a new generation number */
1968 generation = new_generation();
1970 /* change the vnnmap on this node to use the new generation
1971 number but not on any other nodes.
1972 this guarantees that if we abort the recovery prematurely
1973 for some reason (a node stops responding?)
1974 that we can just return immediately and we will reenter
1975 recovery shortly again.
1976 I.e. we deliberately leave the cluster with an inconsistent
1977 generation id to allow us to abort recovery at any stage and
1978 just restart it from scratch.
1980 vnnmap->generation = generation;
1981 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1983 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1987 /* Database generations are updated when the transaction is commited to
1988 * the databases. So make sure to use the final generation as the
/* A second, distinct generation id for the committed databases. */
1991 generation = new_generation();
1993 data.dptr = (void *)&generation;
1994 data.dsize = sizeof(uint32_t);
1996 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1997 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1999 CONTROL_TIMEOUT(), false, data,
2001 transaction_start_fail_callback,
2003 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
/* Best-effort rollback of any transactions that did start. */
2004 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
2006 CONTROL_TIMEOUT(), false, tdb_null,
2010 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
2015 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* Recover each database in turn inside the open transaction. */
2017 for (i=0;i<dbmap->num;i++) {
2018 ret = recover_database(rec, mem_ctx,
2019 dbmap->dbs[i].db_id,
2020 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
2021 pnn, nodemap, generation);
2023 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].db_id));
2028 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
2030 /* commit all the changes */
2031 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
2033 CONTROL_TIMEOUT(), false, data,
2036 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
2040 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
2042 /* build a new vnn map with all the currently active and
2044 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
2045 CTDB_NO_MEMORY(ctdb, vnnmap);
2046 vnnmap->generation = generation;
2048 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
2049 CTDB_NO_MEMORY(ctdb, vnnmap->map);
/* Include only active nodes with the LMASTER capability. */
2050 for (i=j=0;i<nodemap->num;i++) {
2051 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2054 if (!ctdb_node_has_capabilities(rec->caps,
2055 ctdb->nodes[i]->pnn,
2056 CTDB_CAP_LMASTER)) {
2057 /* this node can not be an lmaster */
2058 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
2063 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2064 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2065 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* Degenerate case: nobody can be lmaster — use ourselves. */
2068 if (vnnmap->size == 0) {
2069 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2071 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2072 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2073 vnnmap->map[0] = pnn;
2076 /* update to the new vnnmap on all nodes */
2077 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2079 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2083 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2085 /* update recmaster to point to us for all nodes */
2086 ret = set_recovery_master(ctdb, nodemap, pnn);
2088 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
2092 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
2094 /* disable recovery mode */
2095 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL, false);
2097 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2101 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
/*
 * Main recovery driver, run only on the recovery master: verify we are
 * still recmaster, ban misbehaving nodes, take the recovery lock,
 * reconcile the database set, sync flags/capabilities, run the actual
 * db recovery (parallel if all nodes support it, else serial), then do
 * a takeover run, fire the "recovered" event, broadcast RECONFIGURE,
 * clear ban counts and suppress re-recovery for a cooldown period.
 * Many error-path returns/gotos are elided in this view.
 */
2107 we are the recmaster, and recovery is needed - start a recovery run
2109 static int do_recovery(struct ctdb_recoverd *rec,
2110 TALLOC_CTX *mem_ctx, uint32_t pnn,
2111 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
2113 struct ctdb_context *ctdb = rec->ctdb;
2115 struct ctdb_dbid_map_old *dbmap;
2116 struct timeval start_time;
2117 uint32_t culprit = (uint32_t)-1;
2121 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
2123 /* Check if the current node is still the recmaster. It's possible that
2124 * re-election has changed the recmaster, but we have not yet updated
2127 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2128 pnn, &ctdb->recovery_master);
2130 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster\n"));
2134 if (pnn != ctdb->recovery_master) {
2136 ("Recovery master changed to %u, aborting recovery\n",
2137 ctdb->recovery_master));
2141 /* if recovery fails, force it again */
2142 rec->need_recovery = true;
2144 if (!ctdb_op_begin(rec->recovery)) {
2148 if (rec->election_timeout) {
2149 /* an election is in progress */
2150 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
2154 ban_misbehaving_nodes(rec, &self_ban);
2156 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
/* Take (or re-confirm) the cluster recovery lock before touching
 * any databases. */
2160 if (ctdb->recovery_lock_file != NULL) {
2161 if (ctdb_recovery_have_lock(ctdb)) {
2162 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
2164 start_time = timeval_current();
2165 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
2166 ctdb->recovery_lock_file));
2167 if (!ctdb_recovery_lock(ctdb)) {
2168 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
2169 /* If ctdb is trying first recovery, it's
2170 * possible that current node does not know
2171 * yet who the recmaster is.
2173 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
2174 " - retrying recovery\n"));
/* Past first recovery, failing to get the lock means someone
 * else holds it — ban ourselves to break the contention. */
2178 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
2179 "and ban ourself for %u seconds\n",
2180 ctdb->tunable.recovery_ban_period));
2181 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2184 ctdb_ctrl_report_recd_lock_latency(ctdb,
2186 timeval_elapsed(&start_time));
2188 ("Recovery lock taken successfully by recovery daemon\n"));
2192 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
2194 /* get a list of all databases */
2195 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
2197 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
2201 /* we do the db creation before we set the recovery mode, so the freeze happens
2202 on all databases we will be dealing with. */
2204 /* verify that we have all the databases any other node has */
2205 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
2207 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
2211 /* verify that all other nodes have all our databases */
2212 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
2214 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
2217 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
2219 /* update the database priority for all remote databases */
2220 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
2222 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
2224 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
2227 /* update all other nodes to use the same setting for reclock files
2228 as the local recovery master.
/* Best effort: return value deliberately ignored. */
2230 sync_recovery_lock_file_across_cluster(rec);
2232 /* update the capabilities for all nodes */
2233 ret = update_capabilities(rec, nodemap);
2235 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2240 update all nodes to have the same flags that we have
2242 for (i=0;i<nodemap->num;i++) {
2243 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2247 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
2249 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
/* Inactive nodes failing a flag update is only a warning. */
2250 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
2252 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2258 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2260 /* Check if all participating nodes have parallel recovery capability */
2261 par_recovery = true;
2262 for (i=0; i<nodemap->num; i++) {
2263 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2267 if (!(rec->caps[i].capabilities &
2268 CTDB_CAP_PARALLEL_RECOVERY)) {
2269 par_recovery = false;
/* Prefer the helper-based parallel path when every node supports it. */
2275 ret = db_recovery_parallel(rec, mem_ctx);
2277 ret = db_recovery_serial(rec, mem_ctx, pnn, nodemap, vnnmap,
2285 /* Fetch known/available public IPs from each active node */
2286 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2288 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2290 rec->need_takeover_run = true;
2294 do_takeover_run(rec, nodemap, false);
2296 /* execute the "recovered" event script on all nodes */
2297 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2299 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2303 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2305 /* send a message to all clients telling them that the cluster
2306 has been reconfigured */
2307 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2308 CTDB_SRVID_RECONFIGURE, tdb_null);
2310 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2314 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2316 rec->need_recovery = false;
2317 ctdb_op_end(rec->recovery);
2319 /* we managed to complete a full recovery, make sure to forgive
2320 any past sins by the nodes that could now participate in the
2323 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2324 for (i=0;i<nodemap->num;i++) {
2325 struct ctdb_banning_state *ban_state;
2327 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2331 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2332 if (ban_state == NULL) {
2336 ban_state->count = 0;
2339 /* We just finished a recovery successfully.
2340 We now wait for rerecovery_timeout before we allow
2341 another recovery to take place.
2343 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2344 ctdb_op_disable(rec->recovery, ctdb->ev,
2345 ctdb->tunable.rerecovery_timeout);
2349 ctdb_op_end(rec->recovery);
/*
 * Payload broadcast during recmaster elections.  Comparison order (see
 * ctdb_election_win): connected-node count, then priority_time, then
 * pnn as tie-breaker.  Some members (pnn) are elided in this view.
 */
2355 elections are won by first checking the number of connected nodes, then
2356 the priority time, then the pnn
2358 struct election_message {
2359 uint32_t num_connected;
2360 struct timeval priority_time;
2362 uint32_t node_flags;
/*
 * Fill in this node's election message: pnn, daemon start time,
 * current flags and the number of connected nodes.  A node without the
 * RECMASTER capability zeroes its connected count and resets its
 * priority time so it loses every comparison.
 */
2366 form this nodes election data
2368 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2371 struct ctdb_node_map_old *nodemap;
2372 struct ctdb_context *ctdb = rec->ctdb;
2376 em->pnn = rec->ctdb->pnn;
2377 em->priority_time = rec->priority_time;
2379 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2381 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
/* Cache our own flags for later win/lose checks. */
2385 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2386 em->node_flags = rec->node_flags;
2388 for (i=0;i<nodemap->num;i++) {
2389 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2390 em->num_connected++;
2394 /* we shouldnt try to win this election if we cant be a recmaster */
2395 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2396 em->num_connected = 0;
2397 em->priority_time = timeval_current();
2400 talloc_free(nodemap);
/*
 * Decide whether the remote election message em beats our own data.
 * Disqualifiers first (capability, banned, stopped — on either side),
 * then longest-running node wins via priority_time, then lowest pnn.
 * The return statements are elided in this view.
 */
2404 see if the given election data wins
2406 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2408 struct election_message myem;
2411 ctdb_election_data(rec, &myem);
2413 /* we cant win if we dont have the recmaster capability */
2414 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2418 /* we cant win if we are banned */
2419 if (rec->node_flags & NODE_FLAGS_BANNED) {
2423 /* we cant win if we are stopped */
2424 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2428 /* we will automatically win if the other node is banned */
2429 if (em->node_flags & NODE_FLAGS_BANNED) {
2433 /* we will automatically win if the other node is banned */
2434 if (em->node_flags & NODE_FLAGS_STOPPED) {
2438 /* then the longest running node */
2440 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* Final tie-breaker: lower pnn wins. */
2444 cmp = (int)myem.pnn - (int)em->pnn;
/*
 * Broadcast an election request: first optimistically set ourselves as
 * recmaster on the local node, then send our election data to every
 * node on CTDB_SRVID_RECOVERY.
 */
2453 send out an election request
2455 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2456 TDB_DATA election_data;
2457 struct election_message emsg;
2459 struct ctdb_context *ctdb = rec->ctdb;
2461 srvid = CTDB_SRVID_RECOVERY;
2463 ctdb_election_data(rec, &emsg);
2465 election_data.dsize = sizeof(struct election_message);
2466 election_data.dptr = (unsigned char *)&emsg;
2469 /* first we assume we will win the election and set
2470 recoverymaster to be ourself on the current node
2472 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2474 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2479 /* send an election message to all active nodes */
2480 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2481 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
/*
 * Clear the BANNED flag on every connected-but-banned node by pushing
 * a modflags control (clearing all flags per the 0 mask argument).
 */
2485 this function will unban all nodes in the cluster
2487 static void unban_all_nodes(struct ctdb_context *ctdb)
2490 struct ctdb_node_map_old *nodemap;
2491 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2493 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2495 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2499 for (i=0;i<nodemap->num;i++) {
2500 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2501 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2502 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2503 nodemap->nodes[i].pnn, 0,
2506 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2511 talloc_free(tmp_ctx);
/*
 * Timer callback fired while we believe we are winning an election:
 * (re)broadcast the election request, then drop the one-shot timer
 * reference so a new one can be scheduled.
 */
2516 we think we are winning the election - send a broadcast election request
2518 static void election_send_request(struct tevent_context *ev,
2519 struct tevent_timer *te,
2520 struct timeval t, void *p)
2522 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2525 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2527 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2530 talloc_free(rec->send_election_te);
2531 rec->send_election_te = NULL;
/*
 * SRVID handler: dump the recovery master's talloc memory usage and
 * send it back to the requesting node/srvid carried in the message.
 */
2537 handler for memory dumps
2539 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2540 struct ctdb_recoverd *rec = talloc_get_type(
2541 private_data, struct ctdb_recoverd);
2542 struct ctdb_context *ctdb = rec->ctdb;
2543 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2545 struct srvid_request *rd;
/* The payload must be exactly the reply-address struct. */
2547 if (data.dsize != sizeof(struct srvid_request)) {
2548 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2549 talloc_free(tmp_ctx);
2552 rd = (struct srvid_request *)data.dptr;
2554 dump = talloc_zero(tmp_ctx, TDB_DATA);
2556 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2557 talloc_free(tmp_ctx);
2560 ret = ctdb_dump_memory(ctdb, dump);
2562 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2563 talloc_free(tmp_ctx);
2567 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2569 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2571 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2572 talloc_free(tmp_ctx);
2576 talloc_free(tmp_ctx);
/*
 * SRVID handler: re-read the nodes file on request (message payload is
 * unused).
 */
2580 handler for reload_nodes
2582 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
2585 struct ctdb_recoverd *rec = talloc_get_type(
2586 private_data, struct ctdb_recoverd);
2588 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2590 ctdb_load_nodes_file(rec->ctdb);
/*
 * Deferred-rebalance timer: if forced rebalance targets are still
 * pending when the timer fires, run a takeover now.
 */
2594 static void ctdb_rebalance_timeout(struct tevent_context *ev,
2595 struct tevent_timer *te,
2596 struct timeval t, void *p)
2598 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2600 if (rec->force_rebalance_nodes == NULL) {
2602 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2607 ("Rebalance timeout occurred - do takeover run\n"));
2608 do_takeover_run(rec, rec->nodemap, false);
/*
 * SRVID handler: add a node (pnn in the 4-byte payload) to the forced
 * rebalance target list and, if the deferred_rebalance_on_node_add
 * tunable is set, arm a timer that forces a takeover run.  Only acted
 * on by the recovery master.
 */
2612 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
2615 struct ctdb_recoverd *rec = talloc_get_type(
2616 private_data, struct ctdb_recoverd);
2617 struct ctdb_context *ctdb = rec->ctdb;
2621 uint32_t deferred_rebalance;
2623 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2627 if (data.dsize != sizeof(uint32_t)) {
2628 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2632 pnn = *(uint32_t *)&data.dptr[0];
2634 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2636 /* Copy any existing list of nodes. There's probably some
2637 * sort of realloc variant that will do this but we need to
2638 * make sure that freeing the old array also cancels the timer
2639 * event for the timeout... not sure if realloc will do that.
2641 len = (rec->force_rebalance_nodes != NULL) ?
2642 talloc_array_length(rec->force_rebalance_nodes) :
2645 /* This allows duplicates to be added but they don't cause
2646 * harm. A call to add a duplicate PNN arguably means that
2647 * the timeout should be reset, so this is the simplest
2650 t = talloc_zero_array(rec, uint32_t, len+1);
2651 CTDB_NO_MEMORY_VOID(ctdb, t);
/* memcpy from NULL with len==0 is safe here: no bytes copied. */
2653 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
/* Freeing the old array also destroys any timer parented to it. */
2657 talloc_free(rec->force_rebalance_nodes);
2659 rec->force_rebalance_nodes = t;
2661 /* If configured, setup a deferred takeover run to make sure
2662 * that certain nodes get IPs rebalanced to them. This will
2663 * be cancelled if a successful takeover run happens before
2664 * the timeout. Assign tunable value to variable for
2667 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2668 if (deferred_rebalance != 0) {
/* Timer is parented to the node list so both expire together. */
2669 tevent_add_timer(ctdb->ev, rec->force_rebalance_nodes,
2670 timeval_current_ofs(deferred_rebalance, 0),
2671 ctdb_rebalance_timeout, rec);
/*
 * SRVID message handler: a node reports an IP assignment update.
 * Only the recovery master records it, via update_ip_assignment_tree().
 */
2677 static void recd_update_ip_handler(uint64_t srvid, TDB_DATA data,
2680 struct ctdb_recoverd *rec = talloc_get_type(
2681 private_data, struct ctdb_recoverd);
2682 struct ctdb_public_ip *ip;
2684 if (rec->recmaster != rec->ctdb->pnn) {
2685 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2689 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2690 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2694 ip = (struct ctdb_public_ip *)data.dptr;
2696 update_ip_assignment_tree(rec->ctdb, ip);
/*
 * Common helper for "disable takeover runs" / "disable recoveries"
 * SRVID messages: validate the srvid_request_data payload, disable the
 * given operation for the requested timeout, and reply to the sender
 * with our PNN on success (negative ctdb_op_disable() result otherwise).
 *
 * Fix: the wrong-size error message previously reported
 * sizeof(struct srvid_request) although the check compares against
 * sizeof(struct srvid_request_data), so the logged "expecting" value
 * could be wrong.
 */
2699 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2701 struct ctdb_op_state *op_state)
2703 struct srvid_request_data *r;
2708 /* Validate input data */
2709 if (data.dsize != sizeof(struct srvid_request_data)) {
2710 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2711 "expecting %lu\n", (long unsigned)data.dsize,
2712 (long unsigned)sizeof(struct srvid_request_data)));
2715 if (data.dptr == NULL) {
2716 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2720 r = (struct srvid_request_data *)data.dptr;
2723 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2728 /* Returning our PNN tells the caller that we succeeded */
2729 ret = ctdb_get_pnn(ctdb);
2731 result.dsize = sizeof(int32_t);
2732 result.dptr = (uint8_t *)&ret;
/* srvid_request_data begins with a srvid_request header, so this cast
 * is safe for the reply path. */
2733 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
/*
 * SRVID message handler: disable takeover runs for the requested
 * timeout and reply to the sender (see srvid_disable_and_reply()).
 */
2736 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
2739 struct ctdb_recoverd *rec = talloc_get_type(
2740 private_data, struct ctdb_recoverd);
2742 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
2745 /* Backward compatibility for this SRVID */
/*
 * Legacy handler: payload is a bare uint32_t timeout (no reply address),
 * so takeover runs are disabled without sending a reply.
 */
2746 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
2749 struct ctdb_recoverd *rec = talloc_get_type(
2750 private_data, struct ctdb_recoverd);
2753 if (data.dsize != sizeof(uint32_t)) {
2754 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2755 "expecting %lu\n", (long unsigned)data.dsize,
2756 (long unsigned)sizeof(uint32_t)));
2759 if (data.dptr == NULL) {
2760 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2764 timeout = *((uint32_t *)data.dptr);
2766 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
/*
 * SRVID message handler: disable recoveries for the requested timeout
 * and reply to the sender (see srvid_disable_and_reply()).
 */
2769 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
2772 struct ctdb_recoverd *rec = talloc_get_type(
2773 private_data, struct ctdb_recoverd);
2775 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
2779 handler for ip reallocate, just add it to the list of requests and
2780 handle this later in the monitor_cluster loop so we do not recurse
2781 with other requests to takeover_run()
2783 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
2786 struct srvid_request *request;
2787 struct ctdb_recoverd *rec = talloc_get_type(
2788 private_data, struct ctdb_recoverd);
2790 if (data.dsize != sizeof(struct srvid_request)) {
2791 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2795 request = (struct srvid_request *)data.dptr;
/* Deferred: queued requests are drained by process_ipreallocate_requests(). */
2797 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
/*
 * Drain the queued IP-reallocate requests: refresh the remote public IP
 * lists, run a takeover run, and reply to every queued requester with
 * our PNN on success (or a negative error).
 *
 * Fix: the reply call was corrupted by an HTML-entity mangling --
 * "&current" had become "¤t" (the "&curren;" entity).  Restored the
 * address-of expression so the srvid_requests_reply(ctdb, requests, ...)
 * double-pointer contract is met and the code compiles.
 */
2800 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2801 struct ctdb_recoverd *rec)
2806 struct srvid_requests *current;
2808 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2810 /* Only process requests that are currently pending. More
2811 * might come in while the takeover run is in progress and
2812 * they will need to be processed later since they might
2813 * be in response to changes that happen during this run.
2815 current = rec->reallocate_requests;
2816 rec->reallocate_requests = NULL;
2818 /* update the list of public ips that a node can handle for
2821 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2823 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2825 rec->need_takeover_run = true;
2828 if (do_takeover_run(rec, rec->nodemap, false)) {
2829 ret = ctdb_get_pnn(ctdb);
2835 result.dsize = sizeof(int32_t);
2836 result.dptr = (uint8_t *)&ret;
2838 srvid_requests_reply(ctdb, &current, result);
2843 handler for recovery master elections
2845 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2847 struct ctdb_recoverd *rec = talloc_get_type(
2848 private_data, struct ctdb_recoverd);
2849 struct ctdb_context *ctdb = rec->ctdb;
2851 struct election_message *em = (struct election_message *)data.dptr;
2853 /* Ignore election packets from ourself */
2854 if (ctdb->pnn == em->pnn) {
2858 /* we got an election packet - update the timeout for the election */
2859 talloc_free(rec->election_timeout);
2860 rec->election_timeout = tevent_add_timer(
/* Shorter timeout (0.5s) in one branch; tunable otherwise -- the
 * selecting condition is on an elided line. */
2863 timeval_current_ofs(0, 500000) :
2864 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2865 ctdb_election_timeout, rec);
2867 /* someone called an election. check their election data
2868 and if we disagree and we would rather be the elected node,
2869 send a new election message to all other nodes
2871 if (ctdb_election_win(rec, em)) {
2872 if (!rec->send_election_te) {
2873 rec->send_election_te = tevent_add_timer(
2875 timeval_current_ofs(0, 500000),
2876 election_send_request, rec);
2878 /*unban_all_nodes(ctdb);*/
/* We lost: cancel any pending election send and concede. */
2883 TALLOC_FREE(rec->send_election_te);
2885 /* Release the recovery lock file */
2886 if (ctdb_recovery_have_lock(ctdb)) {
2887 ctdb_recovery_unlock(ctdb);
2888 unban_all_nodes(ctdb);
2891 clear_ip_assignment_tree(ctdb);
2893 /* ok, let that guy become recmaster then */
2894 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2896 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2905 force the start of the election process
2907 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2908 struct ctdb_node_map_old *nodemap)
2911 struct ctdb_context *ctdb = rec->ctdb;
2913 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2915 /* set all nodes to recovery mode to stop all internode traffic */
2916 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, false);
2918 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (Re)arm the election timeout before broadcasting our request. */
2922 talloc_free(rec->election_timeout);
2923 rec->election_timeout = tevent_add_timer(
2926 timeval_current_ofs(0, 500000) :
2927 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2928 ctdb_election_timeout, rec);
2930 ret = send_election_request(rec, pnn);
2932 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2936 /* wait for a few seconds to collect all responses */
2937 ctdb_wait_election(rec);
2943 handler for when a node changes its flags
2945 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2947 struct ctdb_recoverd *rec = talloc_get_type(
2948 private_data, struct ctdb_recoverd);
2949 struct ctdb_context *ctdb = rec->ctdb;
2951 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2952 struct ctdb_node_map_old *nodemap=NULL;
2953 TALLOC_CTX *tmp_ctx;
2955 int disabled_flag_changed;
2957 if (data.dsize != sizeof(*c)) {
2958 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2962 tmp_ctx = talloc_new(ctdb);
2963 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2965 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2967 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2968 talloc_free(tmp_ctx);
/* Locate the node whose flags changed in the current nodemap. */
2973 for (i=0;i<nodemap->num;i++) {
2974 if (nodemap->nodes[i].pnn == c->pnn) break;
2977 if (i == nodemap->num) {
2978 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2979 talloc_free(tmp_ctx);
2983 if (c->old_flags != c->new_flags) {
2984 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* Did the permanently-disabled/unhealthy bit toggle? */
2987 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2989 nodemap->nodes[i].flags = c->new_flags;
2991 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2992 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2995 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2996 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3000 ctdb->recovery_master == ctdb->pnn &&
3001 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3002 /* Only do the takeover run if the perm disabled or unhealthy
3003 flags changed since these will cause an ip failover but not
3005 If the node became disconnected or banned this will also
3006 lead to an ip address failover but that is handled
3009 if (disabled_flag_changed) {
3010 rec->need_takeover_run = true;
3014 talloc_free(tmp_ctx);
3018 handler for when we need to push out flag changes to all other nodes
3020 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
3023 struct ctdb_recoverd *rec = talloc_get_type(
3024 private_data, struct ctdb_recoverd);
3025 struct ctdb_context *ctdb = rec->ctdb;
3027 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
3028 struct ctdb_node_map_old *nodemap=NULL;
3029 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
3033 /* find the recovery master */
3034 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
3036 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
3037 talloc_free(tmp_ctx);
3041 /* read the node flags from the recmaster */
3042 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
/* NOTE(review): this error message reports c->pnn but the failed call
 * targeted `recmaster` -- possibly misleading; confirm intent upstream. */
3044 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
3045 talloc_free(tmp_ctx);
3048 if (c->pnn >= nodemap->num) {
3049 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
3050 talloc_free(tmp_ctx);
3054 /* send the flags update to all connected nodes */
3055 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
3057 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
3058 nodes, 0, CONTROL_TIMEOUT(),
3062 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
3064 talloc_free(tmp_ctx);
3068 talloc_free(tmp_ctx);
/* Shared state for the async recmode verification fan-out: aggregated
 * result status (count field is on an elided line). */
3072 struct verify_recmode_normal_data {
3074 enum monitor_result status;
/*
 * Async completion callback for one node's getrecmode control.
 * Escalates rmdata->status to MONITOR_FAILED on control failure, or to
 * MONITOR_RECOVERY_NEEDED if the node is not in normal recovery mode.
 */
3077 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
3079 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
3082 /* one more node has responded with recmode data*/
3085 /* if we failed to get the recmode, then return an error and let
3086 the main loop try again.
3088 if (state->state != CTDB_CONTROL_DONE) {
3089 if (rmdata->status == MONITOR_OK) {
3090 rmdata->status = MONITOR_FAILED;
3095 /* if we got a response, then the recmode will be stored in the
3098 if (state->status != CTDB_RECOVERY_NORMAL) {
3099 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
3100 rmdata->status = MONITOR_RECOVERY_NEEDED;
3107 /* verify that all nodes are in normal recovery mode */
3108 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
3110 struct verify_recmode_normal_data *rmdata;
3111 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3112 struct ctdb_client_control_state *state;
3113 enum monitor_result status;
3116 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
3117 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3119 rmdata->status = MONITOR_OK;
3121 /* loop over all active nodes and send an async getrecmode call to
3123 for (j=0; j<nodemap->num; j++) {
/* Skip banned/stopped/disconnected nodes. */
3124 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3127 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
3129 nodemap->nodes[j].pnn);
3130 if (state == NULL) {
3131 /* we failed to send the control, treat this as
3132 an error and try again next iteration
3134 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
3135 talloc_free(mem_ctx);
3136 return MONITOR_FAILED;
3139 /* set up the callback functions */
3140 state->async.fn = verify_recmode_normal_callback;
3141 state->async.private_data = rmdata;
3143 /* one more control to wait for to complete */
3148 /* now wait for up to the maximum number of seconds allowed
3149 or until all nodes we expect a response from has replied
3151 while (rmdata->count > 0) {
3152 tevent_loop_once(ctdb->ev);
3155 status = rmdata->status;
3156 talloc_free(mem_ctx);
/* Shared state for the async recmaster verification fan-out: the local
 * recoverd, the PNN we expect every node to name as recmaster, and the
 * aggregated status (count/pnn fields are on elided lines). */
3161 struct verify_recmaster_data {
3162 struct ctdb_recoverd *rec;
3165 enum monitor_result status;
/*
 * Async completion callback for one node's getrecmaster control.
 * Escalates rmdata->status to MONITOR_FAILED on control failure, or to
 * MONITOR_ELECTION_NEEDED (and marks the node a culprit) when the node
 * disagrees about who the recmaster is.
 */
3168 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
3170 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
3173 /* one more node has responded with recmaster data*/
3176 /* if we failed to get the recmaster, then return an error and let
3177 the main loop try again.
3179 if (state->state != CTDB_CONTROL_DONE) {
3180 if (rmdata->status == MONITOR_OK) {
3181 rmdata->status = MONITOR_FAILED;
3186 /* if we got a response, then the recmaster will be stored in the
3189 if (state->status != rmdata->pnn) {
3190 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
3191 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3192 rmdata->status = MONITOR_ELECTION_NEEDED;
3199 /* verify that all nodes agree that we are the recmaster */
3200 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
3202 struct ctdb_context *ctdb = rec->ctdb;
3203 struct verify_recmaster_data *rmdata;
3204 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3205 struct ctdb_client_control_state *state;
3206 enum monitor_result status;
3209 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3210 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3214 rmdata->status = MONITOR_OK;
3216 /* loop over all active nodes and send an async getrecmaster call to
3218 for (j=0; j<nodemap->num; j++) {
/* Skip banned/stopped/disconnected nodes. */
3219 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3222 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3224 nodemap->nodes[j].pnn);
3225 if (state == NULL) {
3226 /* we failed to send the control, treat this as
3227 an error and try again next iteration
3229 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3230 talloc_free(mem_ctx);
3231 return MONITOR_FAILED;
3234 /* set up the callback functions */
3235 state->async.fn = verify_recmaster_callback;
3236 state->async.private_data = rmdata;
3238 /* one more control to wait for to complete */
3243 /* now wait for up to the maximum number of seconds allowed
3244 or until all nodes we expect a response from has replied
3246 while (rmdata->count > 0) {
3247 tevent_loop_once(ctdb->ev);
3250 status = rmdata->status;
3251 talloc_free(mem_ctx);
/*
 * Compare the local node's current interface list/link states against
 * the snapshot cached in rec->ifaces.  Returns true on any difference
 * (or when no snapshot exists yet, or when fetching fails -- treated
 * conservatively as "changed").  The fresh list replaces the cache.
 */
3255 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3256 struct ctdb_recoverd *rec)
3258 struct ctdb_control_get_ifaces *ifaces = NULL;
3259 TALLOC_CTX *mem_ctx;
3262 mem_ctx = talloc_new(NULL);
3264 /* Read the interfaces from the local node */
3265 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3266 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3267 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3268 /* We could return an error. However, this will be
3269 * rare so we'll decide that the interfaces have
3270 * actually changed, just in case.
3272 talloc_free(mem_ctx);
3277 /* We haven't been here before so things have changed */
3278 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3280 } else if (rec->ifaces->num != ifaces->num) {
3281 /* Number of interfaces has changed */
3282 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3283 rec->ifaces->num, ifaces->num));
3286 /* See if interface names or link states have changed */
3288 for (i = 0; i < rec->ifaces->num; i++) {
3289 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3290 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3292 ("Interface in slot %d changed: %s => %s\n",
3293 i, iface->name, ifaces->ifaces[i].name));
3297 if (iface->link_state != ifaces->ifaces[i].link_state) {
3299 ("Interface %s changed state: %d => %d\n",
3300 iface->name, iface->link_state,
3301 ifaces->ifaces[i].link_state));
/* Replace the cached snapshot with the freshly fetched list. */
3308 talloc_free(rec->ifaces);
3309 rec->ifaces = talloc_steal(rec, ifaces);
3311 talloc_free(mem_ctx);
3315 /* called to check that the local allocation of public ip addresses is ok.
3317 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map_old *nodemap)
3319 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3320 struct ctdb_uptime *uptime1 = NULL;
3321 struct ctdb_uptime *uptime2 = NULL;
3323 bool need_takeover_run = false;
/* First uptime snapshot: used below to detect a recovery racing with
 * this check, which would make the IP lists unreliable. */
3325 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3326 CTDB_CURRENT_NODE, &uptime1);
3328 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3329 talloc_free(mem_ctx);
3333 if (interfaces_have_changed(ctdb, rec)) {
3334 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3335 "local node %u - force takeover run\n",
3337 need_takeover_run = true;
3340 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3341 CTDB_CURRENT_NODE, &uptime2);
3343 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3344 talloc_free(mem_ctx);
3348 /* skip the check if the startrecovery time has changed */
3349 if (timeval_compare(&uptime1->last_recovery_started,
3350 &uptime2->last_recovery_started) != 0) {
3351 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3352 talloc_free(mem_ctx);
3356 /* skip the check if the endrecovery time has changed */
3357 if (timeval_compare(&uptime1->last_recovery_finished,
3358 &uptime2->last_recovery_finished) != 0) {
3359 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3360 talloc_free(mem_ctx);
3364 /* skip the check if we have started but not finished recovery */
3365 if (timeval_compare(&uptime1->last_recovery_finished,
3366 &uptime1->last_recovery_started) != 1) {
3367 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3368 talloc_free(mem_ctx);
3373 /* verify that we have the ip addresses we should have
3374 and we dont have ones we shouldnt have.
3375 if we find an inconsistency we set recmode to
3376 active on the local node and wait for the recmaster
3377 to do a full blown recovery.
3378 also if the pnn is -1 and we are healthy and can host the ip
3379 we also request a ip reallocation.
3381 if (ctdb->tunable.disable_ip_failover == 0) {
3382 struct ctdb_public_ip_list_old *ips = NULL;
3384 /* read the *available* IPs from the local node */
3385 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3387 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3388 talloc_free(mem_ctx);
/* Unassigned IP that this healthy node could host -> rebalance. */
3392 for (j=0; j<ips->num; j++) {
3393 if (ips->ips[j].pnn == -1 &&
3394 nodemap->nodes[pnn].flags == 0) {
3395 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3396 ctdb_addr_to_str(&ips->ips[j].addr)));
3397 need_takeover_run = true;
3403 /* read the *known* IPs from the local node */
3404 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3406 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3407 talloc_free(mem_ctx);
3411 for (j=0; j<ips->num; j++) {
3412 if (ips->ips[j].pnn == pnn) {
/* IP assigned to us but missing from our interfaces. */
3413 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3414 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3415 ctdb_addr_to_str(&ips->ips[j].addr)));
3416 need_takeover_run = true;
/* IP assigned elsewhere but still held locally -> release it. */
3419 if (ctdb->do_checkpublicip &&
3420 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3422 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3423 ctdb_addr_to_str(&ips->ips[j].addr)));
3425 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3426 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3433 if (need_takeover_run) {
3434 struct srvid_request rd;
3437 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3441 data.dptr = (uint8_t *)&rd;
3442 data.dsize = sizeof(rd);
/* Ask the recmaster for a takeover run; fire-and-forget message. */
3444 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3446 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3449 talloc_free(mem_ctx);
/*
 * Async callback for CTDB_CONTROL_GET_NODEMAP: stash the returned
 * nodemap in the per-node slot of the remote_nodemaps array, taking
 * talloc ownership of the reply buffer.
 */
3454 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3456 struct ctdb_node_map_old **remote_nodemaps = callback_data;
3458 if (node_pnn >= ctdb->num_nodes) {
3459 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3463 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Fetch the nodemap from every active node in parallel; results land in
 * remote_nodemaps[] via async_getnodemap_callback().  Returns non-zero
 * if any node failed to respond.
 */
3467 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3468 struct ctdb_node_map_old *nodemap,
3469 struct ctdb_node_map_old **remote_nodemaps)
3473 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3474 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3476 CONTROL_TIMEOUT(), false, tdb_null,
3477 async_getnodemap_callback,
3479 remote_nodemaps) != 0) {
3480 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/*
 * Sync ctdb->recovery_lock_file with the reclock path configured in the
 * main daemon.  Handles four cases: reclock disabled, newly enabled,
 * unchanged, and changed path -- releasing any held recovery lock
 * whenever the configuration transitions.
 */
3488 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3490 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3491 const char *reclockfile;
3493 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3494 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3495 talloc_free(tmp_ctx);
/* Daemon says no reclock: drop our copy and release any held lock. */
3499 if (reclockfile == NULL) {
3500 if (ctdb->recovery_lock_file != NULL) {
3501 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3502 talloc_free(ctdb->recovery_lock_file);
3503 ctdb->recovery_lock_file = NULL;
3504 ctdb_recovery_unlock(ctdb);
3506 talloc_free(tmp_ctx);
/* Reclock newly enabled: remember the path. */
3510 if (ctdb->recovery_lock_file == NULL) {
3512 ("Recovery lock file enabled (%s)\n", reclockfile));
3513 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3514 ctdb_recovery_unlock(ctdb);
3515 talloc_free(tmp_ctx);
/* Unchanged path: nothing to do. */
3520 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3521 talloc_free(tmp_ctx);
/* Path changed: swap in the new one and release the old lock. */
3526 ("Recovery lock file changed (now %s)\n", reclockfile));
3527 talloc_free(ctdb->recovery_lock_file);
3528 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3529 ctdb_recovery_unlock(ctdb);
3531 talloc_free(tmp_ctx);
3535 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3536 TALLOC_CTX *mem_ctx)
3539 struct ctdb_node_map_old *nodemap=NULL;
3540 struct ctdb_node_map_old *recmaster_nodemap=NULL;
3541 struct ctdb_node_map_old **remote_nodemaps=NULL;
3542 struct ctdb_vnn_map *vnnmap=NULL;
3543 struct ctdb_vnn_map *remote_vnnmap=NULL;
3544 uint32_t num_lmasters;
3545 int32_t debug_level;
3550 /* verify that the main daemon is still running */
3551 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3552 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3556 /* ping the local daemon to tell it we are alive */
3557 ctdb_ctrl_recd_ping(ctdb);
3559 if (rec->election_timeout) {
3560 /* an election is in progress */
3564 /* read the debug level from the parent and update locally */
3565 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3567 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3570 DEBUGLEVEL = debug_level;
3572 /* get relevant tunables */
3573 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3575 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3580 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3581 CTDB_CURRENT_NODE, &ctdb->runstate);
3583 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3587 /* get the current recovery lock file from the server */
3588 if (update_recovery_lock_file(ctdb) != 0) {
3589 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3593 /* Make sure that if recovery lock verification becomes disabled when
3596 if (ctdb->recovery_lock_file == NULL) {
3597 ctdb_recovery_unlock(ctdb);
3600 pnn = ctdb_get_pnn(ctdb);
3602 /* get the vnnmap */
3603 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3605 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3610 /* get number of nodes */
3612 talloc_free(rec->nodemap);
3613 rec->nodemap = NULL;
3616 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3618 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3621 nodemap = rec->nodemap;
3623 /* remember our own node flags */
3624 rec->node_flags = nodemap->nodes[pnn].flags;
3626 ban_misbehaving_nodes(rec, &self_ban);
3628 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3632 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3633 also frozen and that the recmode is set to active.
3635 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3636 /* If this node has become inactive then we want to
3637 * reduce the chances of it taking over the recovery
3638 * master role when it becomes active again. This
3639 * helps to stabilise the recovery master role so that
3640 * it stays on the most stable node.
3642 rec->priority_time = timeval_current();
3644 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3646 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3648 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3649 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3651 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3653 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3657 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3659 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3664 /* If this node is stopped or banned then it is not the recovery
3665 * master, so don't do anything. This prevents stopped or banned
3666 * node from starting election and sending unnecessary controls.
3671 /* check which node is the recovery master */
3672 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3674 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3678 /* If we are not the recmaster then do some housekeeping */
3679 if (rec->recmaster != pnn) {
3680 /* Ignore any IP reallocate requests - only recmaster
3683 TALLOC_FREE(rec->reallocate_requests);
3684 /* Clear any nodes that should be force rebalanced in
3685 * the next takeover run. If the recovery master role
3686 * has moved then we don't want to process these some
3687 * time in the future.
3689 TALLOC_FREE(rec->force_rebalance_nodes);
3692 /* This is a special case. When recovery daemon is started, recmaster
3693 * is set to -1. If a node is not started in stopped state, then
3694 * start election to decide recovery master
3696 if (rec->recmaster == (uint32_t)-1) {
3697 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3698 force_election(rec, pnn, nodemap);
3702 /* update the capabilities for all nodes */
3703 ret = update_capabilities(rec, nodemap);
3705 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3710 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3711 * but we have, then force an election and try to become the new
3714 if (!ctdb_node_has_capabilities(rec->caps,
3716 CTDB_CAP_RECMASTER) &&
3717 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3718 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3719 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3720 " but we (node %u) have - force an election\n",
3721 rec->recmaster, pnn));
3722 force_election(rec, pnn, nodemap);
3726 /* verify that the recmaster node is still active */
3727 for (j=0; j<nodemap->num; j++) {
3728 if (nodemap->nodes[j].pnn==rec->recmaster) {
3733 if (j == nodemap->num) {
3734 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3735 force_election(rec, pnn, nodemap);
3739 /* if recovery master is disconnected we must elect a new recmaster */
3740 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3741 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3742 force_election(rec, pnn, nodemap);
3746 /* get nodemap from the recovery master to check if it is inactive */
3747 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3748 mem_ctx, &recmaster_nodemap);
3750 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3751 nodemap->nodes[j].pnn));
3756 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3757 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3758 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3760 * update our nodemap to carry the recmaster's notion of
3761 * its own flags, so that we don't keep freezing the
3762 * inactive recmaster node...
3764 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3765 force_election(rec, pnn, nodemap);
3769 /* verify that we have all ip addresses we should have and we dont
3770 * have addresses we shouldnt have.
3772 if (ctdb->tunable.disable_ip_failover == 0 &&
3773 !ctdb_op_is_disabled(rec->takeover_run)) {
3774 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3775 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3780 /* if we are not the recmaster then we do not need to check
3781 if recovery is needed
3783 if (pnn != rec->recmaster) {
3788 /* ensure our local copies of flags are right */
3789 ret = update_local_flags(rec, nodemap);
3790 if (ret == MONITOR_ELECTION_NEEDED) {
3791 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3792 force_election(rec, pnn, nodemap);
3795 if (ret != MONITOR_OK) {
3796 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3800 if (ctdb->num_nodes != nodemap->num) {
3801 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3802 ctdb_load_nodes_file(ctdb);
3806 /* verify that all active nodes agree that we are the recmaster */
3807 switch (verify_recmaster(rec, nodemap, pnn)) {
3808 case MONITOR_RECOVERY_NEEDED:
3809 /* can not happen */
3811 case MONITOR_ELECTION_NEEDED:
3812 force_election(rec, pnn, nodemap);
3816 case MONITOR_FAILED:
3821 if (rec->need_recovery) {
3822 /* a previous recovery didn't finish */
3823 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3827 /* verify that all active nodes are in normal mode
3828 and not in recovery mode
3830 switch (verify_recmode(ctdb, nodemap)) {
3831 case MONITOR_RECOVERY_NEEDED:
3832 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3834 case MONITOR_FAILED:
3836 case MONITOR_ELECTION_NEEDED:
3837 /* can not happen */
3843 if (ctdb->recovery_lock_file != NULL) {
3844 /* We must already hold the recovery lock */
3845 if (!ctdb_recovery_have_lock(ctdb)) {
3846 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3847 ctdb_set_culprit(rec, ctdb->pnn);
3848 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3854 /* if there are takeovers requested, perform it and notify the waiters */
3855 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3856 rec->reallocate_requests) {
3857 process_ipreallocate_requests(ctdb, rec);
3860 /* If recoveries are disabled then there is no use doing any
3861 * nodemap or flags checks. Recoveries might be disabled due
3862 * to "reloadnodes", so doing these checks might cause an
3863 * unnecessary recovery. */
3864 if (ctdb_op_is_disabled(rec->recovery)) {
3868 /* get the nodemap for all active remote nodes
3870 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
3871 if (remote_nodemaps == NULL) {
3872 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3875 for(i=0; i<nodemap->num; i++) {
3876 remote_nodemaps[i] = NULL;
3878 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3879 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3883 /* verify that all other nodes have the same nodemap as we have
3885 for (j=0; j<nodemap->num; j++) {
3886 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3890 if (remote_nodemaps[j] == NULL) {
3891 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3892 ctdb_set_culprit(rec, j);
3897 /* if the nodes disagree on how many nodes there are
3898 then this is a good reason to try recovery
3900 if (remote_nodemaps[j]->num != nodemap->num) {
3901 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3902 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3903 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3904 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3908 /* if the nodes disagree on which nodes exist and are
3909 active, then that is also a good reason to do recovery
3911 for (i=0;i<nodemap->num;i++) {
3912 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3913 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3914 nodemap->nodes[j].pnn, i,
3915 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3916 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3917 do_recovery(rec, mem_ctx, pnn, nodemap,
3925 * Update node flags obtained from each active node. This ensure we have
3926 * up-to-date information for all the nodes.
3928 for (j=0; j<nodemap->num; j++) {
3929 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3932 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3935 for (j=0; j<nodemap->num; j++) {
3936 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3940 /* verify the flags are consistent
3942 for (i=0; i<nodemap->num; i++) {
3943 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3947 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3948 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3949 nodemap->nodes[j].pnn,
3950 nodemap->nodes[i].pnn,
3951 remote_nodemaps[j]->nodes[i].flags,
3952 nodemap->nodes[i].flags));
3954 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3955 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3956 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3957 do_recovery(rec, mem_ctx, pnn, nodemap,
3961 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3962 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3963 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3964 do_recovery(rec, mem_ctx, pnn, nodemap,
3973 /* count how many active nodes there are */
3975 for (i=0; i<nodemap->num; i++) {
3976 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3977 if (ctdb_node_has_capabilities(rec->caps,
3978 ctdb->nodes[i]->pnn,
3979 CTDB_CAP_LMASTER)) {
3986 /* There must be the same number of lmasters in the vnn map as
3987 * there are active nodes with the lmaster capability... or
3990 if (vnnmap->size != num_lmasters) {
3991 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3992 vnnmap->size, num_lmasters));
3993 ctdb_set_culprit(rec, ctdb->pnn);
3994 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3998 /* verify that all active nodes in the nodemap also exist in
4001 for (j=0; j<nodemap->num; j++) {
4002 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
4005 if (nodemap->nodes[j].pnn == pnn) {
4009 for (i=0; i<vnnmap->size; i++) {
4010 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
4014 if (i == vnnmap->size) {
4015 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
4016 nodemap->nodes[j].pnn));
4017 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4018 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4024 /* verify that all other nodes have the same vnnmap
4025 and are from the same generation
4027 for (j=0; j<nodemap->num; j++) {
4028 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
4031 if (nodemap->nodes[j].pnn == pnn) {
4035 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
4036 mem_ctx, &remote_vnnmap);
4038 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
4039 nodemap->nodes[j].pnn));
4043 /* verify the vnnmap generation is the same */
4044 if (vnnmap->generation != remote_vnnmap->generation) {
4045 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
4046 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
4047 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4048 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4052 /* verify the vnnmap size is the same */
4053 if (vnnmap->size != remote_vnnmap->size) {
4054 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
4055 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
4056 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4057 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4061 /* verify the vnnmap is the same */
4062 for (i=0;i<vnnmap->size;i++) {
4063 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
4064 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
4065 nodemap->nodes[j].pnn));
4066 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4067 do_recovery(rec, mem_ctx, pnn, nodemap,
4074 /* we might need to change who has what IP assigned */
4075 if (rec->need_takeover_run) {
4076 uint32_t culprit = (uint32_t)-1;
4078 rec->need_takeover_run = false;
4080 /* update the list of public ips that a node can handle for
4083 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
4085 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
4087 rec->need_takeover_run = true;
4091 /* execute the "startrecovery" event script on all nodes */
4092 ret = run_startrecovery_eventscript(rec, nodemap);
4094 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
4095 ctdb_set_culprit(rec, ctdb->pnn);
4096 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4100 /* If takeover run fails, then the offending nodes are
4101 * assigned ban culprit counts. And we re-try takeover.
4102 * If takeover run fails repeatedly, the node would get
4105 * If rec->need_takeover_run is not set to true at this
4106 * failure, monitoring is disabled cluster-wide (via
4107 * startrecovery eventscript) and will not get enabled.
4109 if (!do_takeover_run(rec, nodemap, true)) {
4113 /* execute the "recovered" event script on all nodes */
4114 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
4116 // we cant check whether the event completed successfully
4117 // since this script WILL fail if the node is in recovery mode
4118 // and if that race happens, the code here would just cause a second
4119 // cascading recovery.
4121 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
4122 ctdb_set_culprit(rec, ctdb->pnn);
4123 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4130 	the main monitoring loop
/*
 * Top-level loop of the recovery daemon (runs in the forked recoverd
 * child process after it has switched into client mode).
 *
 * Allocates the long-lived ctdb_recoverd state (rec, parented to ctdb),
 * registers a client message handler for every SRVID the recovery daemon
 * services, and then repeatedly executes main_loop() inside a throw-away
 * talloc context, pacing iterations so the cluster checks run at most
 * once per ctdb->tunable.recover_interval seconds.  Under normal
 * operation this function never returns.
 */
4132 static void monitor_cluster(struct ctdb_context *ctdb)
4134 	struct ctdb_recoverd *rec;
4136 	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
	/* rec hangs off ctdb, so it lives for the daemon's lifetime */
4138 	rec = talloc_zero(ctdb, struct ctdb_recoverd);
4139 	CTDB_NO_MEMORY_FATAL(ctdb, rec);
	/* ctdb_op state used elsewhere (ctdb_op_is_disabled) to track
	 * whether takeover runs / recoveries are administratively disabled */
4143 	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
4144 	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
4146 	rec->recovery = ctdb_op_init(rec, "recoveries");
4147 	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
4149 	rec->priority_time = timeval_current();
4151 	/* register a message port for sending memory dumps */
4152 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
4154 	/* register a message port for recovery elections */
4155 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
4157 	/* when nodes are disabled/enabled */
4158 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
4160 	/* when we are asked to push out a flag change */
4161 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
4163 	/* register a message port for vacuum fetch */
4164 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
4166 	/* register a message port for reloadnodes */
4167 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
4169 	/* register a message port for performing a takeover run */
4170 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
4172 	/* register a message port for disabling the ip check for a short while */
4173 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
4175 	/* register a message port for updating the recovery daemons node assignment for an ip */
4176 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
4178 	/* register a message port for forcing a rebalance of a node next
4180 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
4182 	/* Register a message port for disabling takeover runs */
4183 	ctdb_client_set_message_handler(ctdb,
4184 					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
4185 					disable_takeover_runs_handler, rec);
4187 	/* Register a message port for disabling recoveries */
4188 	ctdb_client_set_message_handler(ctdb,
4189 					CTDB_SRVID_DISABLE_RECOVERIES,
4190 					disable_recoveries_handler, rec);
4192 	/* register a message port for detaching database */
4193 	ctdb_client_set_message_handler(ctdb,
4194 					CTDB_SRVID_DETACH_DATABASE,
4195 					detach_database_handler, rec);
	/* Per-iteration scratch context: everything main_loop() allocates
	 * on mem_ctx is released in one shot when the iteration ends. */
4198 		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4199 		struct timeval start;
4203 			DEBUG(DEBUG_CRIT,(__location__
4204 			      " Failed to create temp context\n"));
4208 		start = timeval_current();
4209 		main_loop(ctdb, rec, mem_ctx);
4210 		talloc_free(mem_ctx);
4212 		/* we only check for recovery once every second */
	/* Sleep off the remainder of the interval if main_loop() finished
	 * early, so iterations are rate-limited to recover_interval. */
4213 		elapsed = timeval_elapsed(&start);
4214 		if (elapsed < ctdb->tunable.recover_interval) {
4215 			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4222 	event handler for when the main ctdbd dies
/*
 * tevent fd handler for the pipe shared with the main ctdbd process.
 * The recovery daemon watches the read end of this pipe; when the parent
 * daemon dies the pipe becomes readable (EOF) and this handler fires,
 * logging the event before the recovery daemon terminates (the exit call
 * itself follows immediately after the log line).
 */
4224 static void ctdb_recoverd_parent(struct tevent_context *ev,
4225 				 struct tevent_fd *fde,
4226 				 uint16_t flags, void *private_data)
4228 	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4233 	called regularly to verify that the recovery daemon is still running
/*
 * Periodic liveness check, run in the main ctdbd process every 30
 * seconds.  Probes the recovery daemon with signal 0 (existence check,
 * no signal actually delivered); if the process is gone, schedules an
 * immediate restart via ctdb_restart_recd().  Always re-arms itself.
 */
4235 static void ctdb_check_recd(struct tevent_context *ev,
4236 			    struct tevent_timer *te,
4237 			    struct timeval yt, void *p)
4239 	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
	/* kill(pid, 0) semantics: returns non-zero if the pid is gone */
4241 	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4242 		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
		/* timeval_zero(): fire the restart on the next event loop pass */
4244 		tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
4245 				 ctdb_restart_recd, ctdb);
	/* Re-arm for the next check.  Parented to recd_ctx so the timer is
	 * cancelled automatically when ctdb_stop_recoverd() frees recd_ctx. */
4250 	tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
4251 			 timeval_current_ofs(30, 0),
4252 			 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reaps exited children with
 * waitpid(-1, ..., WNOHANG) so they do not linger as zombies.  ECHILD
 * from waitpid() is expected (no children left) and not reported.
 */
4255 static void recd_sig_child_handler(struct tevent_context *ev,
4256 				    struct tevent_signal *se, int signum,
4257 				    int count, void *dont_care,
4260 //	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
	/* non-blocking reap: WNOHANG returns immediately if nothing exited */
4265 		pid = waitpid(-1, &status, WNOHANG);
		/* only a real error is worth logging; ECHILD is normal */
4267 			if (errno != ECHILD) {
4268 				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4273 			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4279 	startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon as a child of the main ctdbd.
 *
 * Parent path (recoverd_pid != 0): recreates ctdb->recd_ctx and arms the
 * periodic ctdb_check_recd() liveness timer on it, then returns to the
 * main daemon.
 *
 * Child path: reseeds the PRNG, renames the process, switches from
 * server to client mode, watches fd[0] of the pipe so it can notice the
 * parent's death (ctdb_recoverd_parent), installs a SIGCHLD handler and
 * finally enters monitor_cluster(), which is not expected to return.
 */
4281 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4284 	struct tevent_signal *se;
4285 	struct tevent_fd *fde;
	/* pipe used by the child to detect the death of the parent daemon */
4287 	if (pipe(fd) != 0) {
4291 	ctdb->recoverd_pid = ctdb_fork(ctdb);
4292 	if (ctdb->recoverd_pid == -1) {
	/* ---- parent branch: set up supervision of the child ---- */
4296 	if (ctdb->recoverd_pid != 0) {
		/* fresh context for the liveness timer; freeing the old one
		 * cancels any timer left over from a previous recoverd */
4297 		talloc_free(ctdb->recd_ctx);
4298 		ctdb->recd_ctx = talloc_new(ctdb);
4299 		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4302 		tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
4303 				 timeval_current_ofs(30, 0),
4304 				 ctdb_check_recd, ctdb);
	/* ---- child branch: become the recovery daemon ---- */
	/* reseed so the child does not share the parent's random stream */
4310 	srandom(getpid() ^ time(NULL));
	/* NOTE(review): process name reads "ctdb_recovered" while the daemon
	 * is elsewhere called "recoverd" - confirm the spelling is intended */
4312 	ctdb_set_process_name("ctdb_recovered");
4313 	if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4314 		DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4318 	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
	/* fires when the parent dies (pipe EOF); auto-close ensures the fd
	 * is closed when the fde is freed */
4320 	fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
4321 			    ctdb_recoverd_parent, &fd[0]);
4322 	tevent_fd_set_auto_close(fde);
4324 	/* set up a handler to pick up sigchld */
4325 	se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
4326 			       recd_sig_child_handler, ctdb);
4328 		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
	/* main monitoring loop - not expected to return */
4332 	monitor_cluster(ctdb);
4334 	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4339 	shutdown the recovery daemon
/*
 * Stop the recovery daemon from the main ctdbd process.  A pid of 0
 * means no recovery daemon was started, so there is nothing to do.
 * Otherwise send SIGTERM and tear down the parent-side supervision
 * state.
 */
4341 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4343 	if (ctdb->recoverd_pid == 0) {
4347 	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4348 	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
	/* freeing recd_ctx also cancels the ctdb_check_recd liveness timer */
4350 	TALLOC_FREE(ctdb->recd_ctx);
4351 	TALLOC_FREE(ctdb->recd_ping_count);
/*
 * Timer callback scheduled by ctdb_check_recd() when the recovery
 * daemon has been found dead: clean up whatever is left of the old
 * instance, then fork a fresh one.
 */
4354 static void ctdb_restart_recd(struct tevent_context *ev,
4355 			      struct tevent_timer *te,
4356 			      struct timeval t, void *private_data)
4358 	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4360 	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
	/* stop first so parent-side state (timers, contexts) is reset
	 * before a new daemon is forked */
4361 	ctdb_stop_recoverd(ctdb);
4362 	ctdb_start_recoverd(ctdb);