4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
35 struct ctdb_recoverd *rec;
40 private state of recovery daemon
42 struct ctdb_recoverd {
43 struct ctdb_context *ctdb;
44 uint32_t last_culprit;
45 uint32_t culprit_counter;
46 struct timeval first_recover_time;
47 struct ban_state **banned_nodes;
48 struct timeval priority_time;
49 bool need_takeover_run;
52 struct timed_event *send_election_te;
53 struct timed_event *election_timeout;
54 struct vacuum_info *vacuum_info;
/* timeouts for controls sent by the recovery daemon, derived from tunables */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
66 static void async_callback(struct ctdb_client_control_state *state)
68 struct async_data *data = talloc_get_type(state->async.private_data, struct async_data);
72 /* one more node has responded with recmode data */
75 /* if we failed to push the db, then return an error and let
76 the main loop try again.
78 if (state->state != CTDB_CONTROL_DONE) {
79 DEBUG(0,("Async operation failed with state %d\n", state->state));
84 state->async.fn = NULL;
86 ret = ctdb_control_recv(state->ctdb, state, data, NULL, &res, NULL);
87 if ((ret != 0) || (res != 0)) {
88 DEBUG(0,("Async operation failed with ret=%d res=%d\n", ret, (int)res));
94 static void async_add(struct async_data *data, struct ctdb_client_control_state *state)
96 /* set up the callback functions */
97 state->async.fn = async_callback;
98 state->async.private_data = data;
100 /* one more control to wait for to complete */
105 /* wait for up to the maximum number of seconds allowed
106 or until all nodes we expect a response from has replied
108 static int async_wait(struct ctdb_context *ctdb, struct async_data *data)
110 while (data->count > 0) {
111 event_loop_once(ctdb->ev);
113 if (data->fail_count != 0) {
114 DEBUG(0,("Async wait failed - fail_count=%u\n", data->fail_count));
124 static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn)
126 struct ctdb_context *ctdb = rec->ctdb;
128 DEBUG(0,("Unbanning node %u\n", pnn));
130 if (!ctdb_validate_pnn(ctdb, pnn)) {
131 DEBUG(0,("Bad pnn %u in ctdb_unban_node\n", pnn));
135 /* If we are unbanning a different node then just pass the ban info on */
136 if (pnn != ctdb->pnn) {
140 DEBUG(0,("Unanning remote node %u. Passing the ban request on to the remote node.\n", pnn));
142 data.dptr = (uint8_t *)&pnn;
143 data.dsize = sizeof(uint32_t);
145 ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_UNBAN_NODE, data);
147 DEBUG(0,("Failed to unban node %u\n", pnn));
154 /* make sure we remember we are no longer banned in case
155 there is an election */
156 rec->node_flags &= ~NODE_FLAGS_BANNED;
158 DEBUG(0,("Clearing ban flag on node %u\n", pnn));
159 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, 0, NODE_FLAGS_BANNED);
161 if (rec->banned_nodes[pnn] == NULL) {
162 DEBUG(0,("No ban recorded for this node. ctdb_unban_node() request ignored\n"));
166 talloc_free(rec->banned_nodes[pnn]);
167 rec->banned_nodes[pnn] = NULL;
172 called when a ban has timed out
174 static void ctdb_ban_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
176 struct ban_state *state = talloc_get_type(p, struct ban_state);
177 struct ctdb_recoverd *rec = state->rec;
178 uint32_t pnn = state->banned_node;
180 DEBUG(0,("Ban timeout. Node %u is now unbanned\n", pnn));
181 ctdb_unban_node(rec, pnn);
185 ban a node for a period of time
187 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
189 struct ctdb_context *ctdb = rec->ctdb;
191 DEBUG(0,("Banning node %u for %u seconds\n", pnn, ban_time));
193 if (!ctdb_validate_pnn(ctdb, pnn)) {
194 DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", pnn));
198 if (0 == ctdb->tunable.enable_bans) {
199 DEBUG(0,("Bans are disabled - ignoring ban of node %u\n", pnn));
203 /* If we are banning a different node then just pass the ban info on */
204 if (pnn != ctdb->pnn) {
205 struct ctdb_ban_info b;
209 DEBUG(0,("Banning remote node %u for %u seconds. Passing the ban request on to the remote node.\n", pnn, ban_time));
212 b.ban_time = ban_time;
214 data.dptr = (uint8_t *)&b;
215 data.dsize = sizeof(b);
217 ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_BAN_NODE, data);
219 DEBUG(0,("Failed to ban node %u\n", pnn));
226 DEBUG(0,("self ban - lowering our election priority\n"));
227 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0);
229 /* banning ourselves - lower our election priority */
230 rec->priority_time = timeval_current();
232 /* make sure we remember we are banned in case there is an
234 rec->node_flags |= NODE_FLAGS_BANNED;
236 if (rec->banned_nodes[pnn] != NULL) {
237 DEBUG(0,("Re-banning an already banned node. Remove previous ban and set a new ban.\n"));
238 talloc_free(rec->banned_nodes[pnn]);
239 rec->banned_nodes[pnn] = NULL;
242 rec->banned_nodes[pnn] = talloc(rec->banned_nodes, struct ban_state);
243 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]);
245 rec->banned_nodes[pnn]->rec = rec;
246 rec->banned_nodes[pnn]->banned_node = pnn;
249 event_add_timed(ctdb->ev, rec->banned_nodes[pnn],
250 timeval_current_ofs(ban_time, 0),
251 ctdb_ban_timeout, rec->banned_nodes[pnn]);
/* outcome of a cluster-monitoring pass, consumed by the main recovery loop */
enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
259 perform a simple control on all active nodes. The control cannot return data
261 static int async_control_on_active_nodes(struct ctdb_context *ctdb, enum ctdb_controls opcode,
262 struct ctdb_node_map *nodemap, TDB_DATA data, bool include_self)
264 struct async_data *async_data;
265 struct ctdb_client_control_state *state;
267 struct timeval timeout = CONTROL_TIMEOUT();
269 async_data = talloc_zero(ctdb, struct async_data);
270 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
272 /* loop over all active nodes and send an async control to each of them */
273 for (j=0; j<nodemap->num; j++) {
274 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
277 if (nodemap->nodes[j].pnn == ctdb->pnn && !include_self) {
280 state = ctdb_control_send(ctdb, nodemap->nodes[j].pnn, 0, opcode,
281 0, data, async_data, NULL, &timeout, NULL);
283 DEBUG(0,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
284 talloc_free(async_data);
288 async_add(async_data, state);
291 if (async_wait(ctdb, async_data) != 0) {
292 DEBUG(0,(__location__ " Failed async control %u\n", (unsigned)opcode));
293 talloc_free(async_data);
297 talloc_free(async_data);
304 change recovery mode on all nodes
306 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
310 /* freeze all nodes */
311 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
312 if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_FREEZE,
313 nodemap, tdb_null, true) != 0) {
314 DEBUG(0, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
320 data.dsize = sizeof(uint32_t);
321 data.dptr = (unsigned char *)&rec_mode;
323 if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_SET_RECMODE,
324 nodemap, data, true) != 0) {
325 DEBUG(0, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
329 if (rec_mode == CTDB_RECOVERY_NORMAL) {
330 if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_THAW,
331 nodemap, tdb_null, true) != 0) {
332 DEBUG(0, (__location__ " Unable to thaw nodes. Recovery failed.\n"));
341 change recovery master on all node
343 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
347 data.dsize = sizeof(uint32_t);
348 data.dptr = (unsigned char *)&pnn;
350 if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_SET_RECMASTER,
351 nodemap, data, true) != 0) {
352 DEBUG(0, (__location__ " Unable to set recmaster. Recovery failed.\n"));
361 ensure all other nodes have attached to any databases that we have
363 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
364 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
367 struct ctdb_dbid_map *remote_dbmap;
369 /* verify that all other nodes have all our databases */
370 for (j=0; j<nodemap->num; j++) {
371 /* we dont need to ourself ourselves */
372 if (nodemap->nodes[j].pnn == pnn) {
375 /* dont check nodes that are unavailable */
376 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
380 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
381 mem_ctx, &remote_dbmap);
383 DEBUG(0, (__location__ " Unable to get dbids from node %u\n", pnn));
387 /* step through all local databases */
388 for (db=0; db<dbmap->num;db++) {
392 for (i=0;i<remote_dbmap->num;i++) {
393 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
397 /* the remote node already have this database */
398 if (i!=remote_dbmap->num) {
401 /* ok so we need to create this database */
402 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
405 DEBUG(0, (__location__ " Unable to get dbname from node %u\n", pnn));
408 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
409 mem_ctx, name, dbmap->dbs[db].persistent);
411 DEBUG(0, (__location__ " Unable to create remote db:%s\n", name));
422 ensure we are attached to any databases that anyone else is attached to
424 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
425 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
428 struct ctdb_dbid_map *remote_dbmap;
430 /* verify that we have all database any other node has */
431 for (j=0; j<nodemap->num; j++) {
432 /* we dont need to ourself ourselves */
433 if (nodemap->nodes[j].pnn == pnn) {
436 /* dont check nodes that are unavailable */
437 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
441 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
442 mem_ctx, &remote_dbmap);
444 DEBUG(0, (__location__ " Unable to get dbids from node %u\n", pnn));
448 /* step through all databases on the remote node */
449 for (db=0; db<remote_dbmap->num;db++) {
452 for (i=0;i<(*dbmap)->num;i++) {
453 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
457 /* we already have this db locally */
458 if (i!=(*dbmap)->num) {
461 /* ok so we need to create this database and
464 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
465 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
467 DEBUG(0, (__location__ " Unable to get dbname from node %u\n",
468 nodemap->nodes[j].pnn));
471 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
472 remote_dbmap->dbs[db].persistent);
474 DEBUG(0, (__location__ " Unable to create local db:%s\n", name));
477 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
479 DEBUG(0, (__location__ " Unable to reread dbmap on node %u\n", pnn));
490 pull the remote database contents from one node into the recdb
492 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
493 struct tdb_wrap *recdb, uint32_t dbid)
497 struct ctdb_control_pulldb_reply *reply;
498 struct ctdb_rec_data *rec;
500 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
502 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
503 CONTROL_TIMEOUT(), &outdata);
505 DEBUG(0,(__location__ " Unable to copy db from node %u\n", srcnode));
506 talloc_free(tmp_ctx);
510 reply = (struct ctdb_control_pulldb_reply *)outdata.dptr;
512 if (outdata.dsize < offsetof(struct ctdb_control_pulldb_reply, data)) {
513 DEBUG(0,(__location__ " invalid data in pulldb reply\n"));
514 talloc_free(tmp_ctx);
518 rec = (struct ctdb_rec_data *)&reply->data[0];
522 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
524 struct ctdb_ltdb_header *hdr;
527 key.dptr = &rec->data[0];
528 key.dsize = rec->keylen;
529 data.dptr = &rec->data[key.dsize];
530 data.dsize = rec->datalen;
532 hdr = (struct ctdb_ltdb_header *)data.dptr;
534 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
535 DEBUG(0,(__location__ " bad ltdb record\n"));
536 talloc_free(tmp_ctx);
540 /* fetch the existing record, if any */
541 existing = tdb_fetch(recdb->tdb, key);
543 if (existing.dptr != NULL) {
544 struct ctdb_ltdb_header header;
545 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
546 DEBUG(0,(__location__ " Bad record size %u from node %u\n",
547 (unsigned)existing.dsize, srcnode));
549 talloc_free(tmp_ctx);
552 header = *(struct ctdb_ltdb_header *)existing.dptr;
554 if (!(header.rsn < hdr->rsn ||
555 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
560 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
561 DEBUG(0,(__location__ " Failed to store record\n"));
562 talloc_free(tmp_ctx);
567 talloc_free(tmp_ctx);
573 pull all the remote database contents into the recdb
575 static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
576 struct tdb_wrap *recdb, uint32_t dbid)
580 /* pull all records from all other nodes across onto this node
581 (this merges based on rsn)
583 for (j=0; j<nodemap->num; j++) {
584 /* dont merge from nodes that are unavailable */
585 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
588 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
589 DEBUG(0,(__location__ " Failed to pull remote database from node %u\n",
590 nodemap->nodes[j].pnn));
600 update flags on all active nodes
602 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
605 for (i=0;i<nodemap->num;i++) {
606 struct ctdb_node_flag_change c;
609 c.pnn = nodemap->nodes[i].pnn;
610 c.old_flags = nodemap->nodes[i].flags;
611 c.new_flags = nodemap->nodes[i].flags;
613 data.dptr = (uint8_t *)&c;
614 data.dsize = sizeof(c);
616 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
617 CTDB_SRVID_NODE_FLAGS_CHANGED, data);
625 ensure all nodes have the same vnnmap we do
627 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
628 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
632 /* push the new vnn map out to all the nodes */
633 for (j=0; j<nodemap->num; j++) {
634 /* dont push to nodes that are unavailable */
635 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
639 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
641 DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", pnn));
651 handler for when the admin bans a node
653 static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
654 TDB_DATA data, void *private_data)
656 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
657 struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
658 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
660 if (data.dsize != sizeof(*b)) {
661 DEBUG(0,("Bad data in ban_handler\n"));
662 talloc_free(mem_ctx);
666 if (b->pnn != ctdb->pnn) {
667 DEBUG(0,("Got a ban request for pnn:%u but our pnn is %u. Ignoring ban request\n", b->pnn, ctdb->pnn));
671 DEBUG(0,("Node %u has been banned for %u seconds\n",
672 b->pnn, b->ban_time));
674 ctdb_ban_node(rec, b->pnn, b->ban_time);
675 talloc_free(mem_ctx);
679 handler for when the admin unbans a node
681 static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
682 TDB_DATA data, void *private_data)
684 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
685 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
688 if (data.dsize != sizeof(uint32_t)) {
689 DEBUG(0,("Bad data in unban_handler\n"));
690 talloc_free(mem_ctx);
693 pnn = *(uint32_t *)data.dptr;
695 if (pnn != ctdb->pnn) {
696 DEBUG(0,("Got an unban request for pnn:%u but our pnn is %u. Ignoring unban request\n", pnn, ctdb->pnn));
700 DEBUG(0,("Node %u has been unbanned.\n", pnn));
701 ctdb_unban_node(rec, pnn);
702 talloc_free(mem_ctx);
/*
  state of one in-progress vacuum-fetch run, kept on rec->vacuum_info.
  NOTE(review): reconstructed — the struct header line and the srcnode
  member were dropped by the extract; srcnode is inferred from its uses
  in vacuum_fetch_handler(). Verify against upstream ctdb_recoverd.c.
 */
struct vacuum_info {
	struct vacuum_info *next, *prev;	/* DLIST linkage on rec->vacuum_info */
	struct ctdb_recoverd *rec;		/* owning recovery daemon state */
	uint32_t srcnode;			/* node whose records we are fetching */
	struct ctdb_db_context *ctdb_db;	/* database the records belong to */
	struct ctdb_control_pulldb_reply *recs;	/* copy of the record blob still to process */
	struct ctdb_rec_data *r;		/* cursor into recs->data */
};

static void vacuum_fetch_next(struct vacuum_info *v);
718 called when a vacuum fetch has completed - just free it and do the next one
720 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
722 struct vacuum_info *v = talloc_get_type(state->async.private, struct vacuum_info);
724 vacuum_fetch_next(v);
729 process the next element from the vacuum list
731 static void vacuum_fetch_next(struct vacuum_info *v)
733 struct ctdb_call call;
734 struct ctdb_rec_data *r;
736 while (v->recs->count) {
737 struct ctdb_client_call_state *state;
739 struct ctdb_ltdb_header *hdr;
742 call.call_id = CTDB_NULL_FUNC;
743 call.flags = CTDB_IMMEDIATE_MIGRATION;
746 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
749 call.key.dptr = &r->data[0];
750 call.key.dsize = r->keylen;
752 /* ensure we don't block this daemon - just skip a record if we can't get
754 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
758 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
759 if (data.dptr == NULL || data.dsize < sizeof(struct ctdb_ltdb_header)) {
760 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
764 hdr = (struct ctdb_ltdb_header *)data.dptr;
765 if (hdr->dmaster == v->rec->ctdb->pnn) {
766 /* its already local */
767 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
771 state = ctdb_call_send(v->ctdb_db, &call);
772 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
774 DEBUG(0,(__location__ " Failed to setup vacuum fetch call\n"));
778 state->async.fn = vacuum_fetch_callback;
779 state->async.private = v;
788 destroy a vacuum info structure
790 static int vacuum_info_destructor(struct vacuum_info *v)
792 DLIST_REMOVE(v->rec->vacuum_info, v);
798 handler for vacuum fetch
800 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
801 TDB_DATA data, void *private_data)
803 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
804 struct ctdb_control_pulldb_reply *recs;
806 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
808 struct ctdb_dbid_map *dbmap=NULL;
809 bool persistent = false;
810 struct ctdb_db_context *ctdb_db;
811 struct ctdb_rec_data *r;
813 struct vacuum_info *v;
815 recs = (struct ctdb_control_pulldb_reply *)data.dptr;
816 r = (struct ctdb_rec_data *)&recs->data[0];
818 if (recs->count == 0) {
824 for (v=rec->vacuum_info;v;v=v->next) {
825 if (srcnode == v->srcnode) {
826 /* we're already working on records from this node */
831 /* work out if the database is persistent */
832 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
834 DEBUG(0, (__location__ " Unable to get dbids from local node\n"));
835 talloc_free(tmp_ctx);
839 for (i=0;i<dbmap->num;i++) {
840 if (dbmap->dbs[i].dbid == recs->db_id) {
841 persistent = dbmap->dbs[i].persistent;
845 if (i == dbmap->num) {
846 DEBUG(0, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
847 talloc_free(tmp_ctx);
851 /* find the name of this database */
852 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
853 DEBUG(0,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
854 talloc_free(tmp_ctx);
859 ctdb_db = ctdb_attach(ctdb, name, persistent);
860 if (ctdb_db == NULL) {
861 DEBUG(0,(__location__ " Failed to attach to database '%s'\n", name));
862 talloc_free(tmp_ctx);
866 v = talloc_zero(rec, struct vacuum_info);
868 DEBUG(0,(__location__ " Out of memory\n"));
873 v->srcnode = srcnode;
874 v->ctdb_db = ctdb_db;
875 v->recs = talloc_memdup(v, recs, data.dsize);
876 if (v->recs == NULL) {
877 DEBUG(0,(__location__ " Out of memory\n"));
881 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
883 DLIST_ADD(rec->vacuum_info, v);
885 talloc_set_destructor(v, vacuum_info_destructor);
887 vacuum_fetch_next(v);
892 called when ctdb_wait_timeout should finish
894 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
895 struct timeval yt, void *p)
897 uint32_t *timed_out = (uint32_t *)p;
902 wait for a given number of seconds
904 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
906 uint32_t timed_out = 0;
907 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
909 event_loop_once(ctdb->ev);
914 called when an election times out (ends)
916 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
917 struct timeval t, void *p)
919 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
920 rec->election_timeout = NULL;
925 wait for an election to finish. It finished election_timeout seconds after
926 the last election packet is received
928 static void ctdb_wait_election(struct ctdb_recoverd *rec)
930 struct ctdb_context *ctdb = rec->ctdb;
931 while (rec->election_timeout) {
932 event_loop_once(ctdb->ev);
937 remember the trouble maker
939 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
941 struct ctdb_context *ctdb = rec->ctdb;
943 if (rec->last_culprit != culprit ||
944 timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
945 DEBUG(0,("New recovery culprit %u\n", culprit));
946 /* either a new node is the culprit, or we've decided to forgive them */
947 rec->last_culprit = culprit;
948 rec->first_recover_time = timeval_current();
949 rec->culprit_counter = 0;
951 rec->culprit_counter++;
955 Update our local flags from all remote connected nodes.
956 This is only run when we are or we belive we are the recovery master
958 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
961 struct ctdb_context *ctdb = rec->ctdb;
962 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
964 /* get the nodemap for all active remote nodes and verify
965 they are the same as for this node
967 for (j=0; j<nodemap->num; j++) {
968 struct ctdb_node_map *remote_nodemap=NULL;
971 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
974 if (nodemap->nodes[j].pnn == ctdb->pnn) {
978 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
979 mem_ctx, &remote_nodemap);
981 DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n",
982 nodemap->nodes[j].pnn));
983 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
984 talloc_free(mem_ctx);
985 return MONITOR_FAILED;
987 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
988 struct ctdb_node_flag_change c;
991 /* We should tell our daemon about this so it
992 updates its flags or else we will log the same
993 message again in the next iteration of recovery.
994 Since we are the recovery master we can just as
995 well update the flags on all nodes.
997 c.pnn = nodemap->nodes[j].pnn;
998 c.old_flags = nodemap->nodes[j].flags;
999 c.new_flags = remote_nodemap->nodes[j].flags;
1001 data.dptr = (uint8_t *)&c;
1002 data.dsize = sizeof(c);
1004 ctdb_send_message(ctdb, ctdb->pnn,
1005 CTDB_SRVID_NODE_FLAGS_CHANGED,
1008 /* Update our local copy of the flags in the recovery
1011 DEBUG(0,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1012 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1013 nodemap->nodes[j].flags));
1014 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1016 /* If the BANNED flag has changed for the node
1017 this is a good reason to do a new election.
1019 if ((c.old_flags ^ c.new_flags) & NODE_FLAGS_BANNED) {
1020 DEBUG(0,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n",
1021 nodemap->nodes[j].pnn, c.new_flags,
1023 talloc_free(mem_ctx);
1024 return MONITOR_ELECTION_NEEDED;
1028 talloc_free(remote_nodemap);
1030 talloc_free(mem_ctx);
1035 /* Create a new random generation ip.
1036 The generation id can not be the INVALID_GENERATION id
1038 static uint32_t new_generation(void)
1040 uint32_t generation;
1043 generation = random();
1045 if (generation != INVALID_GENERATION) {
1055 create a temporary working database
1057 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1060 struct tdb_wrap *recdb;
1062 /* open up the temporary recovery database */
1063 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
1068 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1069 TDB_NOLOCK, O_RDWR|O_CREAT|O_EXCL, 0600);
1070 if (recdb == NULL) {
1071 DEBUG(0,(__location__ " Failed to create temp recovery database '%s'\n", name));
/*
  context passed through tdb_traverse_read() while marshalling all relevant
  records from the recdb into one pulldb-reply blob.
  NOTE(review): reconstructed — the struct header and the len/failed
  members were dropped by the extract; they are inferred from their uses
  in traverse_recdb()/push_recdb_database(). Verify against upstream.
 */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_control_pulldb_reply *recdata;	/* growing marshalled blob */
	uint32_t len;					/* bytes used in recdata so far */
	bool failed;					/* set when marshalling fails mid-traverse */
};
1090 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1092 struct recdb_data *params = (struct recdb_data *)p;
1093 struct ctdb_rec_data *rec;
1094 struct ctdb_ltdb_header *hdr;
1096 /* skip empty records */
1097 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1101 /* update the dmaster field to point to us */
1102 hdr = (struct ctdb_ltdb_header *)data.dptr;
1103 hdr->dmaster = params->ctdb->pnn;
1105 /* add the record to the blob ready to send to the nodes */
1106 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1108 params->failed = true;
1111 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1112 if (params->recdata == NULL) {
1113 DEBUG(0,(__location__ " Failed to expand recdata to %u (%u records)\n",
1114 rec->length + params->len, params->recdata->count));
1115 params->failed = true;
1118 params->recdata->count++;
1119 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1120 params->len += rec->length;
1127 push the recdb database out to all nodes
1129 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1130 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1132 struct recdb_data params;
1133 struct ctdb_control_pulldb_reply *recdata;
1136 recdata = talloc_zero(recdb, struct ctdb_control_pulldb_reply);
1137 CTDB_NO_MEMORY(ctdb, recdata);
1139 recdata->db_id = dbid;
1142 params.recdata = recdata;
1143 params.len = offsetof(struct ctdb_control_pulldb_reply, data);
1144 params.failed = false;
1146 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1147 DEBUG(0,(__location__ " Failed to traverse recdb database\n"));
1148 talloc_free(params.recdata);
1152 if (params.failed) {
1153 DEBUG(0,(__location__ " Failed to traverse recdb database\n"));
1154 talloc_free(params.recdata);
1158 recdata = params.recdata;
1160 outdata.dptr = (void *)recdata;
1161 outdata.dsize = params.len;
1163 if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_PUSH_DB, nodemap, outdata, true) != 0) {
1164 DEBUG(0,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1165 talloc_free(recdata);
1169 DEBUG(0, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1170 dbid, recdata->count));
1172 talloc_free(recdata);
1179 go through a full recovery on one database
1181 static int recover_database(struct ctdb_recoverd *rec,
1182 TALLOC_CTX *mem_ctx,
1185 struct ctdb_node_map *nodemap,
1186 uint32_t transaction_id)
1188 struct tdb_wrap *recdb;
1190 struct ctdb_context *ctdb = rec->ctdb;
1192 struct ctdb_control_wipe_database w;
1194 recdb = create_recdb(ctdb, mem_ctx);
1195 if (recdb == NULL) {
1199 /* pull all remote databases onto the recdb */
1200 ret = pull_remote_database(ctdb, nodemap, recdb, dbid);
1202 DEBUG(0, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1206 DEBUG(0, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1208 /* wipe all the remote databases. This is safe as we are in a transaction */
1210 w.transaction_id = transaction_id;
1212 data.dptr = (void *)&w;
1213 data.dsize = sizeof(w);
1215 if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1216 nodemap, data, true) != 0) {
1217 DEBUG(0, (__location__ " Unable to wipe database. Recovery failed.\n"));
1221 /* push out the correct database. This sets the dmaster and skips
1222 the empty records */
1223 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1229 /* all done with this database */
1237 we are the recmaster, and recovery is needed - start a recovery run
1239 static int do_recovery(struct ctdb_recoverd *rec,
1240 TALLOC_CTX *mem_ctx, uint32_t pnn, uint32_t num_active,
1241 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
1244 struct ctdb_context *ctdb = rec->ctdb;
1246 uint32_t generation;
1247 struct ctdb_dbid_map *dbmap;
1250 DEBUG(0, (__location__ " Starting do_recovery\n"));
1252 /* if recovery fails, force it again */
1253 rec->need_recovery = true;
1255 ctdb_set_culprit(rec, culprit);
1257 if (rec->culprit_counter > 2*nodemap->num) {
1258 DEBUG(0,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
1259 culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
1260 ctdb->tunable.recovery_ban_period));
1261 ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);
1264 if (!ctdb_recovery_lock(ctdb, true)) {
1265 ctdb_set_culprit(rec, pnn);
1266 DEBUG(0,("Unable to get recovery lock - aborting recovery\n"));
1270 DEBUG(0, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
1272 /* get a list of all databases */
1273 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1275 DEBUG(0, (__location__ " Unable to get dbids from node :%u\n", pnn));
1279 /* we do the db creation before we set the recovery mode, so the freeze happens
1280 on all databases we will be dealing with. */
1282 /* verify that we have all the databases any other node has */
1283 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1285 DEBUG(0, (__location__ " Unable to create missing local databases\n"));
1289 /* verify that all other nodes have all our databases */
1290 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1292 DEBUG(0, (__location__ " Unable to create missing remote databases\n"));
1296 DEBUG(0, (__location__ " Recovery - created remote databases\n"));
1298 /* set recovery mode to active on all nodes */
1299 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1301 DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
1305 /* pick a new generation number */
1306 generation = new_generation();
1308 /* change the vnnmap on this node to use the new generation
1309 number but not on any other nodes.
1310 this guarantees that if we abort the recovery prematurely
1311 for some reason (a node stops responding?)
1312 that we can just return immediately and we will reenter
1313 recovery shortly again.
1314 I.e. we deliberately leave the cluster with an inconsistent
1315 generation id to allow us to abort recovery at any stage and
1316 just restart it from scratch.
1318 vnnmap->generation = generation;
1319 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1321 DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1325 data.dptr = (void *)&generation;
1326 data.dsize = sizeof(uint32_t);
1328 if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_TRANSACTION_START,
1329 nodemap, data, true) != 0) {
1330 DEBUG(0, (__location__ " Unable to start transactions. Recovery failed.\n"));
1334 DEBUG(0,(__location__ " started transactions on all nodes\n"));
1336 for (i=0;i<dbmap->num;i++) {
1337 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1338 DEBUG(0, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1343 DEBUG(0, (__location__ " Recovery - starting database commits\n"));
1345 /* commit all the changes */
1346 if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1347 nodemap, data, true) != 0) {
1348 DEBUG(0, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1352 DEBUG(0, (__location__ " Recovery - committed databases\n"));
1355 /* build a new vnn map with all the currently active and
1357 generation = new_generation();
1358 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1359 CTDB_NO_MEMORY(ctdb, vnnmap);
1360 vnnmap->generation = generation;
1361 vnnmap->size = num_active;
1362 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1363 for (i=j=0;i<nodemap->num;i++) {
1364 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
1365 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1369 /* update to the new vnnmap on all nodes */
1370 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1372 DEBUG(0, (__location__ " Unable to update vnnmap on all nodes\n"));
1376 DEBUG(0, (__location__ " Recovery - updated vnnmap\n"));
1378 /* update recmaster to point to us for all nodes */
1379 ret = set_recovery_master(ctdb, nodemap, pnn);
1381 DEBUG(0, (__location__ " Unable to set recovery master\n"));
1385 DEBUG(0, (__location__ " Recovery - updated recmaster\n"));
1388 update all nodes to have the same flags that we have
1390 ret = update_flags_on_all_nodes(ctdb, nodemap);
1392 DEBUG(0, (__location__ " Unable to update flags on all nodes\n"));
1396 DEBUG(0, (__location__ " Recovery - updated flags\n"));
1399 if enabled, tell nodes to takeover their public IPs
1402 rec->need_takeover_run = false;
1403 ret = ctdb_takeover_run(ctdb, nodemap);
1405 DEBUG(0, (__location__ " Unable to setup public takeover addresses\n"));
1408 DEBUG(1, (__location__ " Recovery - done takeover\n"));
1411 /* disable recovery mode */
1412 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
1414 DEBUG(0, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1418 /* send a message to all clients telling them that the cluster
1419 has been reconfigured */
1420 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1422 DEBUG(0, (__location__ " Recovery complete\n"));
1424 rec->need_recovery = false;
1426 /* We just finished a recovery successfully.
1427 We now wait for rerecovery_timeout before we allow
1428 another recovery to take place.
1430 DEBUG(0, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1431 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1432 DEBUG(0, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
1439 elections are won by first checking the number of connected nodes, then
1440 the priority time, then the pnn
/*
 * Wire format of an election broadcast. A node wins the election on
 * (in order): not being banned, highest num_connected, oldest
 * priority_time, then lowest pnn (see ctdb_election_win()).
 * NOTE(review): chunk appears truncated — the `pnn` member used by
 * ctdb_election_data()/ctdb_election_win() is not visible here.
 */
1442 struct election_message {
1443 	uint32_t num_connected;   /* how many nodes this sender can reach */
1444 	struct timeval priority_time;   /* when the sender's recoverd started */
1446 	uint32_t node_flags;   /* sender's own NODE_FLAGS_* (banned etc) */
1450 form this nodes election data
/*
 * Fill in *em with this node's election credentials: our pnn, the
 * time our recovery daemon started (priority_time), our node flags,
 * and a count of the nodes we are currently connected to.
 */
1452 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1455 	struct ctdb_node_map *nodemap;
1456 	struct ctdb_context *ctdb = rec->ctdb;
1460 	em->pnn = rec->ctdb->pnn;
1461 	em->priority_time = rec->priority_time;
1462 	em->node_flags = rec->node_flags;
	/* fetch the current nodemap so we can count connected nodes */
1464 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1466 	DEBUG(0,(__location__ " unable to get election data\n"));
	/* every node not flagged DISCONNECTED counts as connected */
1470 	for (i=0;i<nodemap->num;i++) {
1471 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1472 			em->num_connected++;
1475 	talloc_free(nodemap);
1479 see if the given election data wins
/*
 * Decide whether WE beat the election message *em sent by another
 * node. Tie-breaking order: banned status, number of connected
 * nodes, longest-running recoverd (priority_time), lowest pnn.
 * Returns true if we should (re)claim the recmaster role.
 */
1481 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1483 	struct election_message myem;
	/* build our own credentials to compare against the sender's */
1486 	ctdb_election_data(rec, &myem);
1488 	/* we cant win if we are banned */
1489 	if (rec->node_flags & NODE_FLAGS_BANNED) {
1493 	/* we will automatically win if the other node is banned */
1494 	if (em->node_flags & NODE_FLAGS_BANNED) {
1498 	/* try to use the most connected node */
1500 	cmp = (int)myem.num_connected - (int)em->num_connected;
1503 	/* then the longest running node */
1505 	cmp = timeval_compare(&em->priority_time, &myem.priority_time);
	/* final tie-break: the numerically lower pnn wins */
1509 	cmp = (int)myem.pnn - (int)em->pnn;
1516 send out an election request
/*
 * Broadcast an election request carrying our credentials.
 * Before broadcasting we optimistically set ourselves as recmaster
 * on the local node (pnn); if another node has better credentials
 * it will answer and win via election_handler().
 */
1518 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1521 	TDB_DATA election_data;
1522 	struct election_message emsg;
1524 	struct ctdb_context *ctdb = rec->ctdb;
1526 	srvid = CTDB_SRVID_RECOVERY;
	/* package our election credentials into a TDB_DATA blob */
1528 	ctdb_election_data(rec, &emsg);
1530 	election_data.dsize = sizeof(struct election_message);
1531 	election_data.dptr  = (unsigned char *)&emsg;
1534 	/* first we assume we will win the election and set
1535 	   recoverymaster to be ourself on the current node
	*/
1537 	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1539 	DEBUG(0, (__location__ " failed to send recmaster election request\n"));
1544 	/* send an election message to all active nodes */
1545 	ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1551 this function will unban all nodes in the cluster
/*
 * Clear the BANNED flag on every connected node in the cluster.
 * Called when recmaster state changes hands so stale bans from the
 * previous recmaster do not linger. Failures to fetch the nodemap
 * are logged and the function simply returns.
 */
1553 static void unban_all_nodes(struct ctdb_context *ctdb)
1556 	struct ctdb_node_map *nodemap;
1557 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1559 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1561 	DEBUG(0,(__location__ " failed to get nodemap to unban all nodes\n"));
	/* only reachable (not DISCONNECTED) nodes that are banned */
1565 	for (i=0;i<nodemap->num;i++) {
1566 		if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1567 		&& (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
			/* modflags with enable=0/disable=BANNED clears the ban */
1568 			ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1572 	talloc_free(tmp_ctx);
1577 we think we are winning the election - send a broadcast election request
/*
 * Timed-event callback: we believe we are winning the election, so
 * broadcast our election request. One-shot — the timer handle in
 * rec->send_election_te is freed and cleared after firing.
 */
1579 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1581 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1584 	ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1586 	DEBUG(0,("Failed to send election request!\n"));
	/* this was a one-shot timer; drop our reference to it */
1589 	talloc_free(rec->send_election_te);
1590 	rec->send_election_te = NULL;
1594 handler for recovery master elections
/*
 * Message handler for CTDB_SRVID_RECOVERY election broadcasts.
 * Compares the sender's credentials against ours:
 *  - if we win, schedule (once) a delayed re-broadcast of our own
 *    election request and return;
 *  - if we lose, cancel any pending broadcast, release the recovery
 *    lock if we hold it, record the sender as recmaster and clear
 *    all ban state on this recovery daemon.
 */
1596 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1597 			     TDB_DATA data, void *private_data)
1599 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1601 	struct election_message *em = (struct election_message *)data.dptr;
1602 	TALLOC_CTX *mem_ctx;
1604 	/* we got an election packet - update the timeout for the election */
1605 	talloc_free(rec->election_timeout);
1606 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1607 			timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1608 			ctdb_election_timeout, rec);
1610 	mem_ctx = talloc_new(ctdb);
1612 	/* someone called an election. check their election data
1613 	   and if we disagree and we would rather be the elected node,
1614 	   send a new election message to all other nodes
	*/
1616 	if (ctdb_election_win(rec, em)) {
		/* delay our re-broadcast by 0.5s; don't schedule twice */
1617 		if (!rec->send_election_te) {
1618 			rec->send_election_te = event_add_timed(ctdb->ev, rec,
1619 					timeval_current_ofs(0, 500000),
1620 					election_send_request, rec);
1622 		talloc_free(mem_ctx);
1623 		/*unban_all_nodes(ctdb);*/
	/* we lost: cancel any queued election broadcast of our own */
1628 	talloc_free(rec->send_election_te);
1629 	rec->send_election_te = NULL;
1631 	/* release the recmaster lock */
1632 	if (em->pnn != ctdb->pnn &&
1633 	    ctdb->recovery_lock_fd != -1) {
1634 		close(ctdb->recovery_lock_fd);
1635 		ctdb->recovery_lock_fd = -1;
1636 		unban_all_nodes(ctdb);
1639 	/* ok, let that guy become recmaster then */
1640 	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1642 	DEBUG(0, (__location__ " failed to send recmaster election request"));
1643 	talloc_free(mem_ctx);
1647 	/* release any bans */
1648 	rec->last_culprit = (uint32_t)-1;
	/* freeing the array also frees all ban_state children via talloc */
1649 	talloc_free(rec->banned_nodes);
1650 	rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1651 	CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1653 	talloc_free(mem_ctx);
1659 force the start of the election process
/*
 * Forcibly start a recmaster election: freeze the cluster (recovery
 * mode ACTIVE stops internode traffic), arm the election timeout,
 * broadcast our election request and then block while responses are
 * collected (ctdb_wait_election).
 */
1661 static void force_election(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t pnn,
1662 			   struct ctdb_node_map *nodemap)
1665 	struct ctdb_context *ctdb = rec->ctdb;
1667 	/* set all nodes to recovery mode to stop all internode traffic */
1668 	ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1670 	DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
	/* (re)start the election timeout window */
1674 	talloc_free(rec->election_timeout);
1675 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1676 			timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1677 			ctdb_election_timeout, rec);
1679 	ret = send_election_request(rec, pnn);
1681 	DEBUG(0, (__location__ " failed to initiate recmaster election"));
1685 	/* wait for a few seconds to collect all responses */
1686 	ctdb_wait_election(rec);
1692 handler for when a node changes its flags
/*
 * Message handler for CTDB_SRVID_NODE_FLAGS_CHANGED. Validates the
 * payload, updates our cached nodemap entry for the affected node,
 * and — when we are the recmaster and in normal mode — flags that a
 * public-IP takeover run is needed if the DISABLED bit changed.
 */
1694 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1695 			    TDB_DATA data, void *private_data)
1698 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1699 	struct ctdb_node_map *nodemap=NULL;
1700 	TALLOC_CTX *tmp_ctx;
1701 	uint32_t changed_flags;
1703 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	/* reject malformed messages */
1705 	if (data.dsize != sizeof(*c)) {
1706 		DEBUG(0,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1710 	tmp_ctx = talloc_new(ctdb);
1711 	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1713 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1715 	DEBUG(0,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1716 	talloc_free(tmp_ctx);
	/* locate the node the change refers to */
1721 	for (i=0;i<nodemap->num;i++) {
1722 		if (nodemap->nodes[i].pnn == c->pnn) break;
1725 	if (i == nodemap->num) {
1726 		DEBUG(0,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1727 		talloc_free(tmp_ctx);
	/* which bits actually flipped */
1731 	changed_flags = c->old_flags ^ c->new_flags;
1733 	/* Dont let messages from remote nodes change the DISCONNECTED flag.
1734 	   This flag is handled locally based on whether the local node
1735 	   can communicate with the node or not.
	*/
1737 	c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
1738 	if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) {
1739 		c->new_flags |= NODE_FLAGS_DISCONNECTED;
1742 	if (nodemap->nodes[i].flags != c->new_flags) {
1743 		DEBUG(0,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1746 	nodemap->nodes[i].flags = c->new_flags;
	/* refresh our cached view of recmaster and recovery mode */
1748 	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1749 		 CTDB_CURRENT_NODE, &ctdb->recovery_master);
1752 	ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1753 		 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
	/* only the recmaster, in normal mode, triggers a takeover run */
1757 	    ctdb->recovery_master == ctdb->pnn &&
1758 	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL &&
1760 		/* Only do the takeover run if the perm disabled or unhealthy
1761 		   flags changed since these will cause an ip failover but not
1763 		   If the node became disconnected or banned this will also
1764 		   lead to an ip address failover but that is handled
		*/
1767 		if (changed_flags & NODE_FLAGS_DISABLED) {
1768 			rec->need_takeover_run = true;
1772 	talloc_free(tmp_ctx);
/*
 * Shared state for the async verify_recmode() fan-out: the
 * aggregated result across all per-node getrecmode replies.
 * NOTE(review): chunk appears truncated — the outstanding-reply
 * counter used by verify_recmode()'s wait loop is not visible here.
 */
1777 struct verify_recmode_normal_data {
1779 	enum monitor_result status;
/*
 * Completion callback for one async getrecmode control.
 * Downgrades the aggregate status to MONITOR_FAILED on transport
 * failure, or to MONITOR_RECOVERY_NEEDED if any node reports it is
 * not in CTDB_RECOVERY_NORMAL.
 */
1782 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1784 	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
1787 	/* one more node has responded with recmode data*/
1790 	/* if we failed to get the recmode, then return an error and let
1791 	   the main loop try again.
	*/
1793 	if (state->state != CTDB_CONTROL_DONE) {
		/* don't overwrite a more specific status already recorded */
1794 		if (rmdata->status == MONITOR_OK) {
1795 			rmdata->status = MONITOR_FAILED;
1800 	/* if we got a response, then the recmode will be stored in the
	   status field of the reply */
1803 	if (state->status != CTDB_RECOVERY_NORMAL) {
1804 		DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
1805 		rmdata->status = MONITOR_RECOVERY_NEEDED;
1812 /* verify that all nodes are in normal recovery mode */
1813 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1815 struct verify_recmode_normal_data *rmdata;
1816 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1817 struct ctdb_client_control_state *state;
1818 enum monitor_result status;
1821 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1822 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1824 rmdata->status = MONITOR_OK;
1826 /* loop over all active nodes and send an async getrecmode call to
1828 for (j=0; j<nodemap->num; j++) {
1829 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1832 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1834 nodemap->nodes[j].pnn);
1835 if (state == NULL) {
1836 /* we failed to send the control, treat this as
1837 an error and try again next iteration
1839 DEBUG(0,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1840 talloc_free(mem_ctx);
1841 return MONITOR_FAILED;
1844 /* set up the callback functions */
1845 state->async.fn = verify_recmode_normal_callback;
1846 state->async.private_data = rmdata;
1848 /* one more control to wait for to complete */
1853 /* now wait for up to the maximum number of seconds allowed
1854 or until all nodes we expect a response from has replied
1856 while (rmdata->count > 0) {
1857 event_loop_once(ctdb->ev);
1860 status = rmdata->status;
1861 talloc_free(mem_ctx);
/*
 * Shared state for the async verify_recmaster() fan-out.
 * NOTE(review): chunk appears truncated — the outstanding-reply
 * counter and the expected-recmaster `pnn` member referenced by
 * verify_recmaster()/its callback are not visible here.
 */
1866 struct verify_recmaster_data {
1869 	enum monitor_result status;
/*
 * Completion callback for one async getrecmaster control.
 * Downgrades the aggregate status to MONITOR_FAILED on transport
 * failure, or to MONITOR_ELECTION_NEEDED if the replying node does
 * not agree that we (rmdata->pnn) are the recmaster.
 */
1872 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1874 	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
1877 	/* one more node has responded with recmaster data*/
1880 	/* if we failed to get the recmaster, then return an error and let
1881 	   the main loop try again.
	*/
1883 	if (state->state != CTDB_CONTROL_DONE) {
		/* don't overwrite a more specific status already recorded */
1884 		if (rmdata->status == MONITOR_OK) {
1885 			rmdata->status = MONITOR_FAILED;
1890 	/* if we got a response, then the recmaster will be stored in the
	   status field of the reply */
1893 	if (state->status != rmdata->pnn) {
1894 		DEBUG(0,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
1895 		rmdata->status = MONITOR_ELECTION_NEEDED;
1902 /* verify that all nodes agree that we are the recmaster */
1903 static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
1905 struct verify_recmaster_data *rmdata;
1906 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1907 struct ctdb_client_control_state *state;
1908 enum monitor_result status;
1911 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1912 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1915 rmdata->status = MONITOR_OK;
1917 /* loop over all active nodes and send an async getrecmaster call to
1919 for (j=0; j<nodemap->num; j++) {
1920 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1923 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
1925 nodemap->nodes[j].pnn);
1926 if (state == NULL) {
1927 /* we failed to send the control, treat this as
1928 an error and try again next iteration
1930 DEBUG(0,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
1931 talloc_free(mem_ctx);
1932 return MONITOR_FAILED;
1935 /* set up the callback functions */
1936 state->async.fn = verify_recmaster_callback;
1937 state->async.private_data = rmdata;
1939 /* one more control to wait for to complete */
1944 /* now wait for up to the maximum number of seconds allowed
1945 or until all nodes we expect a response from has replied
1947 while (rmdata->count > 0) {
1948 event_loop_once(ctdb->ev);
1951 status = rmdata->status;
1952 talloc_free(mem_ctx);
1958 	the main monitoring loop
/*
 * Main loop of the recovery daemon. Registers the message handlers
 * (elections, flag changes, ban/unban, vacuum fetch), then once per
 * recover_interval: checks the parent daemon is alive, enforces
 * bans, refreshes tunables/pnn/vnnmap/nodemap/recmaster, verifies
 * public-IP consistency, and — when we are recmaster — verifies
 * cluster-wide agreement on recmaster, recmode, nodemaps and
 * vnnmaps, starting do_recovery() on any inconsistency.
 * Never returns under normal operation.
 */
1960 static void monitor_cluster(struct ctdb_context *ctdb)
1962 	uint32_t pnn, num_active, recmaster;
1963 	TALLOC_CTX *mem_ctx=NULL;
1964 	struct ctdb_node_map *nodemap=NULL;
1965 	struct ctdb_node_map *remote_nodemap=NULL;
1966 	struct ctdb_vnn_map *vnnmap=NULL;
1967 	struct ctdb_vnn_map *remote_vnnmap=NULL;
1969 	struct ctdb_recoverd *rec;
1970 	struct ctdb_all_public_ips *ips;
1973 	DEBUG(0,("monitor_cluster starting\n"));
1975 	rec = talloc_zero(ctdb, struct ctdb_recoverd);
1976 	CTDB_NO_MEMORY_FATAL(ctdb, rec);
	/* one ban_state slot per configured node */
1979 	rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1980 	CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
	/* our election seniority: earlier start time wins elections */
1982 	rec->priority_time = timeval_current();
1984 	/* register a message port for recovery elections */
1985 	ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
1987 	/* and one for when nodes are disabled/enabled */
1988 	ctdb_set_message_handler(ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, monitor_handler, rec);
1990 	/* and one for when nodes are banned */
1991 	ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
1993 	/* and one for when nodes are unbanned */
1994 	ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);
1996 	/* register a message port for vacuum fetch */
1997 	ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
	/* top of the monitoring loop: fresh talloc context per iteration */
2001 	talloc_free(mem_ctx);
2004 	mem_ctx = talloc_new(ctdb);
2006 	DEBUG(0,("Failed to create temporary context\n"));
2010 	/* we only check for recovery once every second */
2011 	ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2013 	/* verify that the main daemon is still running */
2014 	if (kill(ctdb->ctdbd_pid, 0) != 0) {
2015 		DEBUG(0,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2019 	if (rec->election_timeout) {
2020 		/* an election is in progress */
2025 	/* We must check if we need to ban a node here but we want to do this
2026 	   as early as possible so we dont wait until we have pulled the node
2027 	   map from the local node. thats why we have the hardcoded value 20
	*/
2029 	if (rec->culprit_counter > 20) {
2030 		DEBUG(0,("Node %u has caused %u failures in %.0f seconds - banning it for %u seconds\n",
2031 			rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
2032 			ctdb->tunable.recovery_ban_period));
2033 		ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
2036 	/* get relevant tunables */
2037 	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2039 	DEBUG(0,("Failed to get tunables - retrying\n"));
2043 	pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2044 	if (pnn == (uint32_t)-1) {
2045 		DEBUG(0,("Failed to get local pnn - retrying\n"));
2049 	/* get the vnnmap */
2050 	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2052 	DEBUG(0, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2057 	/* get number of nodes */
2058 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &nodemap);
2060 	DEBUG(0, (__location__ " Unable to get nodemap from node %u\n", pnn));
2064 	/* check which node is the recovery master */
2065 	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &recmaster);
2067 	DEBUG(0, (__location__ " Unable to get recmaster from node %u\n", pnn));
	/* -1 means no recmaster has ever been set: elect one */
2071 	if (recmaster == (uint32_t)-1) {
2072 		DEBUG(0,(__location__ " Initial recovery master set - forcing election\n"));
2073 		force_election(rec, mem_ctx, pnn, nodemap);
2077 	/* check that we (recovery daemon) and the local ctdb daemon
2078 	   agrees on whether we are banned or not
	*/
	/* daemon says banned, we have no ban record */
2080 	if (nodemap->nodes[pnn].flags & NODE_FLAGS_BANNED) {
2081 		if (rec->banned_nodes[pnn] == NULL) {
2082 			if (recmaster == pnn) {
2083 				DEBUG(0,("Local ctdb daemon on recmaster thinks this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
2085 				ctdb_unban_node(rec, pnn);
2087 				DEBUG(0,("Local ctdb daemon on non-recmaster thinks this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
2088 				ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2089 				ctdb_set_culprit(rec, pnn);
	/* daemon says not banned, but we hold a ban record */
2094 		if (rec->banned_nodes[pnn] != NULL) {
2095 			if (recmaster == pnn) {
2096 				DEBUG(0,("Local ctdb daemon on recmaster does not think this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
2098 				ctdb_unban_node(rec, pnn);
2100 				DEBUG(0,("Local ctdb daemon on non-recmaster does not think this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
2102 				ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2103 				ctdb_set_culprit(rec, pnn);
2109 	/* remember our own node flags */
2110 	rec->node_flags = nodemap->nodes[pnn].flags;
2112 	/* count how many active nodes there are */
2114 	for (i=0; i<nodemap->num; i++) {
2115 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2121 	/* verify that the recmaster node is still active */
2122 	for (j=0; j<nodemap->num; j++) {
2123 		if (nodemap->nodes[j].pnn==recmaster) {
2128 	if (j == nodemap->num) {
2129 		DEBUG(0, ("Recmaster node %u not in list. Force reelection\n", recmaster));
2130 		force_election(rec, mem_ctx, pnn, nodemap);
2134 	/* if recovery master is disconnected we must elect a new recmaster */
2135 	if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2136 		DEBUG(0, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2137 		force_election(rec, mem_ctx, pnn, nodemap);
2141 	/* grap the nodemap from the recovery master to check if it is banned */
2142 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2143 				   mem_ctx, &remote_nodemap);
2145 	DEBUG(0, (__location__ " Unable to get nodemap from recovery master %u\n",
2146 		  nodemap->nodes[j].pnn));
2151 	if (remote_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2152 		DEBUG(0, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2153 		force_election(rec, mem_ctx, pnn, nodemap);
2157 	/* verify that the public ip address allocation is consistent */
2158 	if (ctdb->vnn != NULL) {
2159 		ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2161 		DEBUG(0, ("Unable to get public ips from node %u\n", i));
2164 		for (j=0; j<ips->num; j++) {
2165 			/* verify that we have the ip addresses we should have
2166 			   and we dont have ones we shouldnt have.
2167 			   if we find an inconsistency we set recmode to
2168 			   active on the local node and wait for the recmaster
2169 			   to do a full blown recovery
			*/
2171 			if (ips->ips[j].pnn == pnn) {
				/* an IP assigned to us that we are not serving */
2172 				if (!ctdb_sys_have_ip(ips->ips[j].sin)) {
2173 					DEBUG(0,("Public address '%s' is missing and we should serve this ip\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
2174 					ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2176 					DEBUG(0,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2179 					ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2181 					DEBUG(0,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
				/* an IP not assigned to us that we still serve */
2186 				if (ctdb_sys_have_ip(ips->ips[j].sin)) {
2187 					DEBUG(0,("We are still serving a public address '%s' that we should not be serving.\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
2188 					ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2190 					DEBUG(0,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2193 					ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2195 					DEBUG(0,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2203 	/* if we are not the recmaster then we do not need to check
2204 	   if recovery is needed
	*/
2206 	if (pnn != recmaster) {
	/* --- from here on we are the recmaster --- */
2211 	/* ensure our local copies of flags are right */
2212 	ret = update_local_flags(rec, nodemap);
2213 	if (ret == MONITOR_ELECTION_NEEDED) {
2214 		DEBUG(0,("update_local_flags() called for a re-election.\n"));
2215 		force_election(rec, mem_ctx, pnn, nodemap);
2218 	if (ret != MONITOR_OK) {
2219 		DEBUG(0,("Unable to update local flags\n"));
2223 	/* update the list of public ips that a node can handle for
	   all connected nodes */
2226 	for (j=0; j<nodemap->num; j++) {
2227 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2230 		/* release any existing data */
2231 		if (ctdb->nodes[j]->public_ips) {
2232 			talloc_free(ctdb->nodes[j]->public_ips);
2233 			ctdb->nodes[j]->public_ips = NULL;
2235 		/* grab a new shiny list of public ips from the node */
2236 		if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2237 			ctdb->nodes[j]->pnn,
2239 			&ctdb->nodes[j]->public_ips)) {
2240 			DEBUG(0,("Failed to read public ips from node : %u\n",
2241 				ctdb->nodes[j]->pnn));
2247 	/* verify that all active nodes agree that we are the recmaster */
2248 	switch (verify_recmaster(ctdb, nodemap, pnn)) {
2249 	case MONITOR_RECOVERY_NEEDED:
2250 		/* can not happen */
2252 	case MONITOR_ELECTION_NEEDED:
2253 		force_election(rec, mem_ctx, pnn, nodemap);
2257 	case MONITOR_FAILED:
2262 	if (rec->need_recovery) {
2263 		/* a previous recovery didn't finish */
2264 		do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
2268 	/* verify that all active nodes are in normal mode
2269 	   and not in recovery mode
	*/
2271 	switch (verify_recmode(ctdb, nodemap)) {
2272 	case MONITOR_RECOVERY_NEEDED:
2273 		do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
2275 	case MONITOR_FAILED:
2277 	case MONITOR_ELECTION_NEEDED:
2278 		/* can not happen */
2284 	/* we should have the reclock - check its not stale */
2285 	if (ctdb->recovery_lock_fd == -1) {
2286 		DEBUG(0,("recovery master doesn't have the recovery lock\n"));
2287 		do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
	/* a failing read means the lock file went stale (e.g. cluster fs issue) */
2291 	if (read(ctdb->recovery_lock_fd, &c, 1) == -1) {
2292 		DEBUG(0,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2293 		close(ctdb->recovery_lock_fd);
2294 		ctdb->recovery_lock_fd = -1;
2295 		do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
2299 	/* get the nodemap for all active remote nodes and verify
2300 	   they are the same as for this node
	*/
2302 	for (j=0; j<nodemap->num; j++) {
2303 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2306 		if (nodemap->nodes[j].pnn == pnn) {
2310 		ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2311 					   mem_ctx, &remote_nodemap);
2313 		DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n",
2314 			  nodemap->nodes[j].pnn));
2318 		/* if the nodes disagree on how many nodes there are
2319 		   then this is a good reason to try recovery
		*/
2321 		if (remote_nodemap->num != nodemap->num) {
2322 			DEBUG(0, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2323 				  nodemap->nodes[j].pnn, remote_nodemap->num, nodemap->num));
2324 			do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
2328 		/* if the nodes disagree on which nodes exist and are
2329 		   active, then that is also a good reason to do recovery
		*/
2331 		for (i=0;i<nodemap->num;i++) {
2332 			if (remote_nodemap->nodes[i].pnn != nodemap->nodes[i].pnn) {
2333 				DEBUG(0, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2334 					  nodemap->nodes[j].pnn, i,
2335 					  remote_nodemap->nodes[i].pnn, nodemap->nodes[i].pnn));
2336 				do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
2337 					    vnnmap, nodemap->nodes[j].pnn);
			/* active/inactive disagreement also triggers recovery */
2340 			if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) !=
2341 			    (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2342 				DEBUG(0, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n",
2343 					  nodemap->nodes[j].pnn, i,
2344 					  remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags));
2345 				do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
2346 					    vnnmap, nodemap->nodes[j].pnn);
2354 	/* there better be the same number of lmasters in the vnn map
2355 	   as there are active nodes or we will have to do a recovery
	*/
2357 	if (vnnmap->size != num_active) {
2358 		DEBUG(0, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
2359 			  vnnmap->size, num_active));
2360 		do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
2364 	/* verify that all active nodes in the nodemap also exist in
	   the vnnmap */
2367 	for (j=0; j<nodemap->num; j++) {
2368 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2371 		if (nodemap->nodes[j].pnn == pnn) {
2375 		for (i=0; i<vnnmap->size; i++) {
2376 			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2380 		if (i == vnnmap->size) {
2381 			DEBUG(0, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2382 				  nodemap->nodes[j].pnn));
2383 			do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
2389 	/* verify that all other nodes have the same vnnmap
2390 	   and are from the same generation
	*/
2392 	for (j=0; j<nodemap->num; j++) {
2393 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2396 		if (nodemap->nodes[j].pnn == pnn) {
2400 		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2401 					  mem_ctx, &remote_vnnmap);
2403 		DEBUG(0, (__location__ " Unable to get vnnmap from remote node %u\n",
2404 			  nodemap->nodes[j].pnn));
2408 		/* verify the vnnmap generation is the same */
2409 		if (vnnmap->generation != remote_vnnmap->generation) {
2410 			DEBUG(0, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2411 				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2412 			do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
2416 		/* verify the vnnmap size is the same */
2417 		if (vnnmap->size != remote_vnnmap->size) {
2418 			DEBUG(0, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2419 				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2420 			do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
2424 		/* verify the vnnmap is the same */
2425 		for (i=0;i<vnnmap->size;i++) {
2426 			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2427 				DEBUG(0, (__location__ " Remote node %u has different vnnmap.\n",
2428 					  nodemap->nodes[j].pnn));
2429 				do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
2430 					    vnnmap, nodemap->nodes[j].pnn);
2436 	/* we might need to change who has what IP assigned */
2437 	if (rec->need_takeover_run) {
2438 		rec->need_takeover_run = false;
2439 		ret = ctdb_takeover_run(ctdb, nodemap);
2441 		DEBUG(0, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
2442 		do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
2452 	event handler for when the main ctdbd dies
/*
 * fd event callback on the parent-liveness pipe: when the main ctdbd
 * exits, its end of the pipe closes, this fires, and the recovery
 * daemon logs and terminates itself.
 */
2454 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
2455 				 uint16_t flags, void *private_data)
2457 	DEBUG(0,("recovery daemon parent died - exiting\n"));
2462 	startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon. The parent returns immediately; the
 * child drops the parent's transport and event context, watches the
 * pipe for parent death, reconnects to ctdbd over the client socket
 * and enters monitor_cluster() (which should never return).
 */
2464 int ctdb_start_recoverd(struct ctdb_context *ctdb)
	/* pipe used by the child to detect parent exit (read end polls) */
2469 	if (pipe(fd) != 0) {
2473 	ctdb->ctdbd_pid = getpid();
2475 	ctdb->recoverd_pid = fork();
2476 	if (ctdb->recoverd_pid == -1) {
	/* parent: nothing more to do here */
2480 	if (ctdb->recoverd_pid != 0) {
	/* ---- child (recovery daemon) from here on ---- */
2487 	/* shutdown the transport */
2488 	ctdb->methods->shutdown(ctdb);
2490 	/* get a new event context */
2491 	talloc_free(ctdb->ev);
2492 	ctdb->ev = event_context_init(ctdb);
	/* exit when the parent's write end of the pipe closes */
2494 	event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2495 		     ctdb_recoverd_parent, &fd[0]);
	/* don't share the parent's daemon socket */
2497 	close(ctdb->daemon.sd);
2498 	ctdb->daemon.sd = -1;
	/* seed PRNG differently per process */
2500 	srandom(getpid() ^ time(NULL));
2502 	/* initialise ctdb */
2503 	ret = ctdb_socket_connect(ctdb);
2505 	DEBUG(0, (__location__ " Failed to init ctdb\n"));
2509 	monitor_cluster(ctdb);
	/* monitor_cluster() is an infinite loop; reaching here is a bug */
2511 	DEBUG(0,("ERROR: ctdb_recoverd finished!?\n"));
2516 	shutdown the recovery daemon
/*
 * Terminate the recovery daemon child with SIGTERM.
 * No-op if it was never started (recoverd_pid == 0).
 */
2518 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
2520 	if (ctdb->recoverd_pid == 0) {
2524 	DEBUG(0,("Shutting down recovery daemon\n"));
2525 	kill(ctdb->recoverd_pid, SIGTERM);