4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
26 #include "../include/ctdb.h"
27 #include "../include/ctdb_private.h"
31 struct ctdb_recoverd *rec;
36 private state of recovery daemon
38 struct ctdb_recoverd {
39 struct ctdb_context *ctdb;
40 uint32_t last_culprit;
41 uint32_t culprit_counter;
42 struct timeval first_recover_time;
43 struct ban_state **banned_nodes;
44 struct timeval priority_time;
/* timeout for a single control sent to a remote node; both macros expect a
   local variable 'ctdb' to be in scope at the point of use */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
/* interval between monitoring rounds of the recovery daemon */
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
53 static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t vnn)
55 struct ctdb_context *ctdb = rec->ctdb;
57 if (!ctdb_validate_pnn(ctdb, vnn)) {
58 DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", vnn));
62 if (rec->banned_nodes[vnn] == NULL) {
66 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), vnn, 0, NODE_FLAGS_BANNED);
68 talloc_free(rec->banned_nodes[vnn]);
69 rec->banned_nodes[vnn] = NULL;
74 called when a ban has timed out
76 static void ctdb_ban_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
78 struct ban_state *state = talloc_get_type(p, struct ban_state);
79 struct ctdb_recoverd *rec = state->rec;
80 uint32_t vnn = state->banned_node;
82 DEBUG(0,("Node %u is now unbanned\n", vnn));
83 ctdb_unban_node(rec, vnn);
87 ban a node for a period of time
89 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t vnn, uint32_t ban_time)
91 struct ctdb_context *ctdb = rec->ctdb;
93 if (!ctdb_validate_pnn(ctdb, vnn)) {
94 DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", vnn));
98 if (vnn == ctdb->pnn) {
99 DEBUG(0,("self ban - lowering our election priority\n"));
100 /* banning ourselves - lower our election priority */
101 rec->priority_time = timeval_current();
104 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), vnn, NODE_FLAGS_BANNED, 0);
106 rec->banned_nodes[vnn] = talloc(rec, struct ban_state);
107 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[vnn]);
109 rec->banned_nodes[vnn]->rec = rec;
110 rec->banned_nodes[vnn]->banned_node = vnn;
113 event_add_timed(ctdb->ev, rec->banned_nodes[vnn],
114 timeval_current_ofs(ban_time, 0),
115 ctdb_ban_timeout, rec->banned_nodes[vnn]);
119 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
122 struct freeze_node_data {
124 enum monitor_result status;
128 static void freeze_node_callback(struct ctdb_client_control_state *state)
130 struct freeze_node_data *fndata = talloc_get_type(state->async.private, struct freeze_node_data);
133 /* one more node has responded to our freeze node*/
136 /* if we failed to freeze the node, we must trigger another recovery */
137 if ( (state->state != CTDB_CONTROL_DONE) || (state->status != 0) ) {
138 DEBUG(0, (__location__ " Failed to freeze node:%u. recovery failed\n", state->c->hdr.destnode));
139 fndata->status = MONITOR_RECOVERY_NEEDED;
147 /* freeze all nodes */
148 static enum monitor_result freeze_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
150 struct freeze_node_data *fndata;
151 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
152 struct ctdb_client_control_state *state;
153 enum monitor_result status;
156 fndata = talloc(mem_ctx, struct freeze_node_data);
157 CTDB_NO_MEMORY_FATAL(ctdb, fndata);
159 fndata->status = MONITOR_OK;
161 /* loop over all active nodes and send an async freeze call to
163 for (j=0; j<nodemap->num; j++) {
164 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
167 state = ctdb_ctrl_freeze_send(ctdb, mem_ctx,
169 nodemap->nodes[j].pnn);
171 /* we failed to send the control, treat this as
172 an error and try again next iteration
174 DEBUG(0,("Failed to call ctdb_ctrl_freeze_send during recovery\n"));
175 talloc_free(mem_ctx);
176 return MONITOR_RECOVERY_NEEDED;
179 /* set up the callback functions */
180 state->async.fn = freeze_node_callback;
181 state->async.private = fndata;
183 /* one more control to wait for to complete */
188 /* now wait for up to the maximum number of seconds allowed
189 or until all nodes we expect a response from has replied
191 while (fndata->count > 0) {
192 event_loop_once(ctdb->ev);
195 status = fndata->status;
196 talloc_free(mem_ctx);
202 change recovery mode on all nodes
204 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
208 /* freeze all nodes */
209 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
210 ret = freeze_all_nodes(ctdb, nodemap);
211 if (ret != MONITOR_OK) {
212 DEBUG(0, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
218 /* set recovery mode to active on all nodes */
219 for (j=0; j<nodemap->num; j++) {
220 /* dont change it for nodes that are unavailable */
221 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
225 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, rec_mode);
227 DEBUG(0, (__location__ " Unable to set recmode on node %u\n", nodemap->nodes[j].pnn));
231 if (rec_mode == CTDB_RECOVERY_NORMAL) {
232 ret = ctdb_ctrl_thaw(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn);
234 DEBUG(0, (__location__ " Unable to thaw node %u\n", nodemap->nodes[j].pnn));
244 change recovery master on all node
246 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t vnn)
250 /* set recovery master to vnn on all nodes */
251 for (j=0; j<nodemap->num; j++) {
252 /* dont change it for nodes that are unavailable */
253 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
257 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, vnn);
259 DEBUG(0, (__location__ " Unable to set recmaster on node %u\n", nodemap->nodes[j].pnn));
269 ensure all other nodes have attached to any databases that we have
271 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
272 uint32_t vnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
275 struct ctdb_dbid_map *remote_dbmap;
277 /* verify that all other nodes have all our databases */
278 for (j=0; j<nodemap->num; j++) {
279 /* we dont need to ourself ourselves */
280 if (nodemap->nodes[j].pnn == vnn) {
283 /* dont check nodes that are unavailable */
284 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
288 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
289 mem_ctx, &remote_dbmap);
291 DEBUG(0, (__location__ " Unable to get dbids from node %u\n", vnn));
295 /* step through all local databases */
296 for (db=0; db<dbmap->num;db++) {
300 for (i=0;i<remote_dbmap->num;i++) {
301 if (dbmap->dbids[db] == remote_dbmap->dbids[i]) {
305 /* the remote node already have this database */
306 if (i!=remote_dbmap->num) {
309 /* ok so we need to create this database */
310 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), vnn, dbmap->dbids[db], mem_ctx, &name);
312 DEBUG(0, (__location__ " Unable to get dbname from node %u\n", vnn));
315 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, name);
317 DEBUG(0, (__location__ " Unable to create remote db:%s\n", name));
328 ensure we are attached to any databases that anyone else is attached to
330 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
331 uint32_t vnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
334 struct ctdb_dbid_map *remote_dbmap;
336 /* verify that we have all database any other node has */
337 for (j=0; j<nodemap->num; j++) {
338 /* we dont need to ourself ourselves */
339 if (nodemap->nodes[j].pnn == vnn) {
342 /* dont check nodes that are unavailable */
343 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
347 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
348 mem_ctx, &remote_dbmap);
350 DEBUG(0, (__location__ " Unable to get dbids from node %u\n", vnn));
354 /* step through all databases on the remote node */
355 for (db=0; db<remote_dbmap->num;db++) {
358 for (i=0;i<(*dbmap)->num;i++) {
359 if (remote_dbmap->dbids[db] == (*dbmap)->dbids[i]) {
363 /* we already have this db locally */
364 if (i!=(*dbmap)->num) {
367 /* ok so we need to create this database and
370 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
371 remote_dbmap->dbids[db], mem_ctx, &name);
373 DEBUG(0, (__location__ " Unable to get dbname from node %u\n",
374 nodemap->nodes[j].pnn));
377 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), vnn, mem_ctx, name);
379 DEBUG(0, (__location__ " Unable to create local db:%s\n", name));
382 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), vnn, mem_ctx, dbmap);
384 DEBUG(0, (__location__ " Unable to reread dbmap on node %u\n", vnn));
395 pull all the remote database contents into ours
397 static int pull_all_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
398 uint32_t vnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
402 /* pull all records from all other nodes across onto this node
403 (this merges based on rsn)
405 for (i=0;i<dbmap->num;i++) {
406 for (j=0; j<nodemap->num; j++) {
407 /* we dont need to merge with ourselves */
408 if (nodemap->nodes[j].pnn == vnn) {
411 /* dont merge from nodes that are unavailable */
412 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
415 ret = ctdb_ctrl_copydb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
416 vnn, dbmap->dbids[i], CTDB_LMASTER_ANY, mem_ctx);
418 DEBUG(0, (__location__ " Unable to copy db from node %u to node %u\n",
419 nodemap->nodes[j].pnn, vnn));
430 change the dmaster on all databases to point to us
432 static int update_dmaster_on_all_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
433 uint32_t vnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
437 /* update dmaster to point to this node for all databases/nodes */
438 for (i=0;i<dbmap->num;i++) {
439 for (j=0; j<nodemap->num; j++) {
440 /* dont repoint nodes that are unavailable */
441 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
444 ret = ctdb_ctrl_setdmaster(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, ctdb, dbmap->dbids[i], vnn);
446 DEBUG(0, (__location__ " Unable to set dmaster for node %u db:0x%08x\n", nodemap->nodes[j].pnn, dbmap->dbids[i]));
457 update flags on all active nodes
459 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
462 for (i=0;i<nodemap->num;i++) {
463 struct ctdb_node_flag_change c;
466 c.vnn = nodemap->nodes[i].pnn;
467 c.old_flags = nodemap->nodes[i].flags;
468 c.new_flags = nodemap->nodes[i].flags;
470 data.dptr = (uint8_t *)&c;
471 data.dsize = sizeof(c);
473 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
474 CTDB_SRVID_NODE_FLAGS_CHANGED, data);
483 static int vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb_node_map *nodemap)
488 /* find max rsn on our local node for this db */
489 ret = ctdb_ctrl_get_max_rsn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, db_id, &max_rsn);
494 /* set rsn on non-empty records to max_rsn+1 */
495 for (i=0;i<nodemap->num;i++) {
496 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
499 ret = ctdb_ctrl_set_rsn_nonempty(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn,
502 DEBUG(0,(__location__ " Failed to set rsn on node %u to %llu\n",
503 nodemap->nodes[i].pnn, (unsigned long long)max_rsn+1));
508 /* delete records with rsn < max_rsn+1 on all nodes */
509 for (i=0;i<nodemap->num;i++) {
510 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
513 ret = ctdb_ctrl_delete_low_rsn(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn,
516 DEBUG(0,(__location__ " Failed to delete records on node %u with rsn below %llu\n",
517 nodemap->nodes[i].pnn, (unsigned long long)max_rsn+1));
528 vacuum all attached databases
530 static int vacuum_all_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
531 struct ctdb_dbid_map *dbmap)
535 /* update dmaster to point to this node for all databases/nodes */
536 for (i=0;i<dbmap->num;i++) {
537 if (vacuum_db(ctdb, dbmap->dbids[i], nodemap) != 0) {
546 push out all our database contents to all other nodes
548 static int push_all_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
549 uint32_t vnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
553 /* push all records out to the nodes again */
554 for (i=0;i<dbmap->num;i++) {
555 for (j=0; j<nodemap->num; j++) {
556 /* we dont need to push to ourselves */
557 if (nodemap->nodes[j].pnn == vnn) {
560 /* dont push to nodes that are unavailable */
561 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
564 ret = ctdb_ctrl_copydb(ctdb, CONTROL_TIMEOUT(), vnn, nodemap->nodes[j].pnn,
565 dbmap->dbids[i], CTDB_LMASTER_ANY, mem_ctx);
567 DEBUG(0, (__location__ " Unable to copy db from node %u to node %u\n",
568 vnn, nodemap->nodes[j].pnn));
579 ensure all nodes have the same vnnmap we do
581 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
582 uint32_t vnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
586 /* push the new vnn map out to all the nodes */
587 for (j=0; j<nodemap->num; j++) {
588 /* dont push to nodes that are unavailable */
589 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
593 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
595 DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", vnn));
605 handler for when the admin bans a node
607 static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
608 TDB_DATA data, void *private_data)
610 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
611 struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
612 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
616 if (data.dsize != sizeof(*b)) {
617 DEBUG(0,("Bad data in ban_handler\n"));
618 talloc_free(mem_ctx);
622 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
624 DEBUG(0,(__location__ " Failed to find the recmaster\n"));
625 talloc_free(mem_ctx);
629 if (recmaster != ctdb->pnn) {
630 DEBUG(0,("We are not the recmaster - ignoring ban request\n"));
631 talloc_free(mem_ctx);
635 DEBUG(0,("Node %u has been banned for %u seconds by the administrator\n",
636 b->vnn, b->ban_time));
637 ctdb_ban_node(rec, b->vnn, b->ban_time);
638 talloc_free(mem_ctx);
642 handler for when the admin unbans a node
644 static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
645 TDB_DATA data, void *private_data)
647 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
648 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
653 if (data.dsize != sizeof(uint32_t)) {
654 DEBUG(0,("Bad data in unban_handler\n"));
655 talloc_free(mem_ctx);
658 vnn = *(uint32_t *)data.dptr;
660 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
662 DEBUG(0,(__location__ " Failed to find the recmaster\n"));
663 talloc_free(mem_ctx);
667 if (recmaster != ctdb->pnn) {
668 DEBUG(0,("We are not the recmaster - ignoring unban request\n"));
669 talloc_free(mem_ctx);
673 DEBUG(0,("Node %u has been unbanned by the administrator\n", vnn));
674 ctdb_unban_node(rec, vnn);
675 talloc_free(mem_ctx);
681 called when ctdb_wait_timeout should finish
683 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
684 struct timeval yt, void *p)
686 uint32_t *timed_out = (uint32_t *)p;
691 wait for a given number of seconds
693 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
695 uint32_t timed_out = 0;
696 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
698 event_loop_once(ctdb->ev);
702 /* Create a new random generation ip.
703 The generation id can not be the INVALID_GENERATION id
705 static uint32_t new_generation(void)
710 generation = random();
712 if (generation != INVALID_GENERATION) {
721 we are the recmaster, and recovery is needed - start a recovery run
723 static int do_recovery(struct ctdb_recoverd *rec,
724 TALLOC_CTX *mem_ctx, uint32_t vnn, uint32_t num_active,
725 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
728 struct ctdb_context *ctdb = rec->ctdb;
731 struct ctdb_dbid_map *dbmap;
733 if (rec->last_culprit != culprit ||
734 timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
735 /* either a new node is the culprit, or we've decide to forgive them */
736 rec->last_culprit = culprit;
737 rec->first_recover_time = timeval_current();
738 rec->culprit_counter = 0;
740 rec->culprit_counter++;
742 if (rec->culprit_counter > 2*nodemap->num) {
743 DEBUG(0,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
744 culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
745 ctdb->tunable.recovery_ban_period));
746 ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);
749 if (!ctdb_recovery_lock(ctdb, true)) {
750 DEBUG(0,("Unable to get recovery lock - aborting recovery\n"));
754 /* set recovery mode to active on all nodes */
755 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
757 DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
761 DEBUG(0, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
763 /* pick a new generation number */
764 generation = new_generation();
766 /* change the vnnmap on this node to use the new generation
767 number but not on any other nodes.
768 this guarantees that if we abort the recovery prematurely
769 for some reason (a node stops responding?)
770 that we can just return immediately and we will reenter
771 recovery shortly again.
772 I.e. we deliberately leave the cluster with an inconsistent
773 generation id to allow us to abort recovery at any stage and
774 just restart it from scratch.
776 vnnmap->generation = generation;
777 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), vnn, mem_ctx, vnnmap);
779 DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", vnn));
783 /* get a list of all databases */
784 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), vnn, mem_ctx, &dbmap);
786 DEBUG(0, (__location__ " Unable to get dbids from node :%u\n", vnn));
792 /* verify that all other nodes have all our databases */
793 ret = create_missing_remote_databases(ctdb, nodemap, vnn, dbmap, mem_ctx);
795 DEBUG(0, (__location__ " Unable to create missing remote databases\n"));
799 /* verify that we have all the databases any other node has */
800 ret = create_missing_local_databases(ctdb, nodemap, vnn, &dbmap, mem_ctx);
802 DEBUG(0, (__location__ " Unable to create missing local databases\n"));
808 /* verify that all other nodes have all our databases */
809 ret = create_missing_remote_databases(ctdb, nodemap, vnn, dbmap, mem_ctx);
811 DEBUG(0, (__location__ " Unable to create missing remote databases\n"));
816 DEBUG(1, (__location__ " Recovery - created remote databases\n"));
818 /* pull all remote databases onto the local node */
819 ret = pull_all_remote_databases(ctdb, nodemap, vnn, dbmap, mem_ctx);
821 DEBUG(0, (__location__ " Unable to pull remote databases\n"));
825 DEBUG(1, (__location__ " Recovery - pulled remote databases\n"));
827 /* push all local databases to the remote nodes */
828 ret = push_all_local_databases(ctdb, nodemap, vnn, dbmap, mem_ctx);
830 DEBUG(0, (__location__ " Unable to push local databases\n"));
834 DEBUG(1, (__location__ " Recovery - pushed remote databases\n"));
836 /* build a new vnn map with all the currently active and
838 generation = new_generation();
839 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
840 CTDB_NO_MEMORY(ctdb, vnnmap);
841 vnnmap->generation = generation;
842 vnnmap->size = num_active;
843 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
844 for (i=j=0;i<nodemap->num;i++) {
845 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
846 vnnmap->map[j++] = nodemap->nodes[i].pnn;
852 /* update to the new vnnmap on all nodes */
853 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, vnn, vnnmap, mem_ctx);
855 DEBUG(0, (__location__ " Unable to update vnnmap on all nodes\n"));
859 DEBUG(1, (__location__ " Recovery - updated vnnmap\n"));
861 /* update recmaster to point to us for all nodes */
862 ret = set_recovery_master(ctdb, nodemap, vnn);
864 DEBUG(0, (__location__ " Unable to set recovery master\n"));
868 DEBUG(1, (__location__ " Recovery - updated recmaster\n"));
870 /* repoint all local and remote database records to the local
871 node as being dmaster
873 ret = update_dmaster_on_all_databases(ctdb, nodemap, vnn, dbmap, mem_ctx);
875 DEBUG(0, (__location__ " Unable to update dmaster on all databases\n"));
879 DEBUG(1, (__location__ " Recovery - updated dmaster on all databases\n"));
882 update all nodes to have the same flags that we have
884 ret = update_flags_on_all_nodes(ctdb, nodemap);
886 DEBUG(0, (__location__ " Unable to update flags on all nodes\n"));
890 DEBUG(1, (__location__ " Recovery - updated flags\n"));
893 run a vacuum operation on empty records
895 ret = vacuum_all_databases(ctdb, nodemap, dbmap);
897 DEBUG(0, (__location__ " Unable to vacuum all databases\n"));
901 DEBUG(1, (__location__ " Recovery - vacuumed all databases\n"));
904 if enabled, tell nodes to takeover their public IPs
906 if (ctdb->vnn_list) {
907 ret = ctdb_takeover_run(ctdb, nodemap);
909 DEBUG(0, (__location__ " Unable to setup public takeover addresses\n"));
912 DEBUG(1, (__location__ " Recovery - done takeover\n"));
916 /* disable recovery mode */
917 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
919 DEBUG(0, (__location__ " Unable to set recovery mode to normal on cluster\n"));
923 /* send a message to all clients telling them that the cluster
924 has been reconfigured */
925 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
927 DEBUG(0, (__location__ " Recovery complete\n"));
929 /* We just finished a recovery successfully.
930 We now wait for rerecovery_timeout before we allow
931 another recovery to take place.
933 DEBUG(0, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
934 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
935 DEBUG(0, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
942 elections are won by first checking the number of connected nodes, then
943 the priority time, then the vnn
945 struct election_message {
946 uint32_t num_connected;
947 struct timeval priority_time;
952 form this nodes election data
954 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
957 struct ctdb_node_map *nodemap;
958 struct ctdb_context *ctdb = rec->ctdb;
962 em->vnn = rec->ctdb->pnn;
963 em->priority_time = rec->priority_time;
965 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
970 for (i=0;i<nodemap->num;i++) {
971 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
975 talloc_free(nodemap);
979 see if the given election data wins
981 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
983 struct election_message myem;
986 ctdb_election_data(rec, &myem);
988 /* try to use the most connected node */
989 cmp = (int)myem.num_connected - (int)em->num_connected;
991 /* then the longest running node */
993 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
997 cmp = (int)myem.vnn - (int)em->vnn;
1004 send out an election request
1006 static int send_election_request(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t vnn)
1009 TDB_DATA election_data;
1010 struct election_message emsg;
1012 struct ctdb_context *ctdb = rec->ctdb;
1014 srvid = CTDB_SRVID_RECOVERY;
1016 ctdb_election_data(rec, &emsg);
1018 election_data.dsize = sizeof(struct election_message);
1019 election_data.dptr = (unsigned char *)&emsg;
1022 /* first we assume we will win the election and set
1023 recoverymaster to be ourself on the current node
1025 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), vnn, vnn);
1027 DEBUG(0, (__location__ " failed to send recmaster election request\n"));
1032 /* send an election message to all active nodes */
1033 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1039 this function will unban all nodes in the cluster
1041 static void unban_all_nodes(struct ctdb_context *ctdb)
1044 struct ctdb_node_map *nodemap;
1045 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1047 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1049 DEBUG(0,(__location__ " failed to get nodemap to unban all nodes\n"));
1053 for (i=0;i<nodemap->num;i++) {
1054 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1055 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1056 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1060 talloc_free(tmp_ctx);
1064 handler for recovery master elections
1066 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1067 TDB_DATA data, void *private_data)
1069 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1071 struct election_message *em = (struct election_message *)data.dptr;
1072 TALLOC_CTX *mem_ctx;
1074 mem_ctx = talloc_new(ctdb);
1076 /* someone called an election. check their election data
1077 and if we disagree and we would rather be the elected node,
1078 send a new election message to all other nodes
1080 if (ctdb_election_win(rec, em)) {
1081 ret = send_election_request(rec, mem_ctx, ctdb_get_pnn(ctdb));
1083 DEBUG(0, (__location__ " failed to initiate recmaster election"));
1085 talloc_free(mem_ctx);
1086 /*unban_all_nodes(ctdb);*/
1090 /* release the recmaster lock */
1091 if (em->vnn != ctdb->pnn &&
1092 ctdb->recovery_lock_fd != -1) {
1093 close(ctdb->recovery_lock_fd);
1094 ctdb->recovery_lock_fd = -1;
1095 unban_all_nodes(ctdb);
1098 /* ok, let that guy become recmaster then */
1099 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->vnn);
1101 DEBUG(0, (__location__ " failed to send recmaster election request"));
1102 talloc_free(mem_ctx);
1106 /* release any bans */
1107 rec->last_culprit = (uint32_t)-1;
1108 talloc_free(rec->banned_nodes);
1109 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1110 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1112 talloc_free(mem_ctx);
1118 force the start of the election process
1120 static void force_election(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t vnn,
1121 struct ctdb_node_map *nodemap)
1124 struct ctdb_context *ctdb = rec->ctdb;
1126 /* set all nodes to recovery mode to stop all internode traffic */
1127 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1129 DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
1133 ret = send_election_request(rec, mem_ctx, vnn);
1135 DEBUG(0, (__location__ " failed to initiate recmaster election"));
1139 /* wait for a few seconds to collect all responses */
1140 ctdb_wait_timeout(ctdb, ctdb->tunable.election_timeout);
1146 handler for when a node changes its flags
1148 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1149 TDB_DATA data, void *private_data)
1152 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1153 struct ctdb_node_map *nodemap=NULL;
1154 TALLOC_CTX *tmp_ctx;
1155 uint32_t changed_flags;
1158 if (data.dsize != sizeof(*c)) {
1159 DEBUG(0,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1163 tmp_ctx = talloc_new(ctdb);
1164 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1166 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1168 for (i=0;i<nodemap->num;i++) {
1169 if (nodemap->nodes[i].pnn == c->vnn) break;
1172 if (i == nodemap->num) {
1173 DEBUG(0,(__location__ "Flag change for non-existant node %u\n", c->vnn));
1174 talloc_free(tmp_ctx);
1178 changed_flags = c->old_flags ^ c->new_flags;
1180 /* Dont let messages from remote nodes change the DISCONNECTED flag.
1181 This flag is handled locally based on whether the local node
1182 can communicate with the node or not.
1184 c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
1185 if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) {
1186 c->new_flags |= NODE_FLAGS_DISCONNECTED;
1189 if (nodemap->nodes[i].flags != c->new_flags) {
1190 DEBUG(0,("Node %u has changed flags - now 0x%x was 0x%x\n", c->vnn, c->new_flags, c->old_flags));
1193 nodemap->nodes[i].flags = c->new_flags;
1195 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1196 CTDB_CURRENT_NODE, &ctdb->recovery_master);
1199 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1200 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
1204 ctdb->recovery_master == ctdb->pnn &&
1205 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL &&
1207 /* Only do the takeover run if the perm disabled or unhealthy
1208 flags changed since these will cause an ip failover but not
1210 If the node became disconnected or banned this will also
1211 lead to an ip address failover but that is handled
1214 if (changed_flags & NODE_FLAGS_DISABLED) {
1215 ret = ctdb_takeover_run(ctdb, nodemap);
1217 DEBUG(0, (__location__ " Unable to setup public takeover addresses\n"));
1219 /* send a message to all clients telling them that the
1220 cluster has been reconfigured */
1221 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1225 talloc_free(tmp_ctx);
1230 struct verify_recmode_normal_data {
1232 enum monitor_result status;
1235 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1237 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private, struct verify_recmode_normal_data);
1240 /* one more node has responded with recmode data*/
1243 /* if we failed to get the recmode, then return an error and let
1244 the main loop try again.
1246 if (state->state != CTDB_CONTROL_DONE) {
1247 if (rmdata->status == MONITOR_OK) {
1248 rmdata->status = MONITOR_FAILED;
1253 /* if we got a response, then the recmode will be stored in the
1256 if (state->status != CTDB_RECOVERY_NORMAL) {
1257 DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
1258 rmdata->status = MONITOR_RECOVERY_NEEDED;
/*
  Poll every active node asynchronously for its recovery mode and block
  (via event_loop_once) until all replies have arrived, then return the
  aggregated monitor result computed by verify_recmode_normal_callback().
  Returns MONITOR_OK, MONITOR_FAILED, or MONITOR_RECOVERY_NEEDED.
  NOTE(review): this chunk is elided -- the "continue" for inactive
  nodes, the rmdata->count increment/initialisation, the final return
  and several closing braces are not visible here.
*/
1265 /* verify that all nodes are in normal recovery mode */
1266 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1268 struct verify_recmode_normal_data *rmdata;
1269 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1270 struct ctdb_client_control_state *state;
1271 enum monitor_result status;
1274 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1275 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1277 rmdata->status = MONITOR_OK;
1279 /* loop over all active nodes and send an async getrecmode call to
1281 for (j=0; j<nodemap->num; j++) {
/* skip nodes that are banned/disconnected -- they cannot answer */
1282 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1285 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1287 nodemap->nodes[j].pnn);
1288 if (state == NULL) {
1289 /* we failed to send the control, treat this as
1290 an error and try again next iteration
1292 DEBUG(0,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
/* mem_ctx also owns rmdata and all in-flight control states */
1293 talloc_free(mem_ctx);
1294 return MONITOR_FAILED;
1297 /* set up the callback functions */
1298 state->async.fn = verify_recmode_normal_callback;
1299 state->async.private = rmdata;
1301 /* one more control to wait for to complete */
1306 /* now wait for up to the maximum number of seconds allowed
1307 or until all nodes we expect a response from has replied
1309 while (rmdata->count > 0) {
1310 event_loop_once(ctdb->ev);
/* copy the result out before freeing the context that owns rmdata */
1313 status = rmdata->status;
1314 talloc_free(mem_ctx);
/* shared state between verify_recmaster() and its per-node async
   callbacks.  status holds the worst monitor result reported so far.
   NOTE(review): lines are elided in this chunk -- the reply counter
   ("count") and the expected recmaster pnn ("vnn", compared against
   state->status in the callback below) are not visible here, nor is
   the closing brace. */
1319 struct verify_recmaster_data {
1322 enum monitor_result status;  /* MONITOR_OK until some node disagrees or fails */
/*
  async completion callback for one node's GET_RECMASTER control, sent
  by verify_recmaster() below.  Flags MONITOR_ELECTION_NEEDED when the
  queried node names a different recmaster than we expect (rmdata->vnn).
  NOTE(review): this chunk is elided -- the count decrement, return
  statements and closing braces of this function are not visible here.
*/
1325 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1327 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private, struct verify_recmaster_data);
1330 /* one more node has responded with recmaster data */
1333 /* if we failed to get the recmaster, then return an error and let
1334 the main loop try again.
1336 if (state->state != CTDB_CONTROL_DONE) {
/* only downgrade MONITOR_OK; keep any stronger status already set */
1337 if (rmdata->status == MONITOR_OK) {
1338 rmdata->status = MONITOR_FAILED;
1343 /* if we got a response, then the recmaster will be stored in the
1346 if (state->status != rmdata->vnn) {
1347 DEBUG(0,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
1348 rmdata->status = MONITOR_ELECTION_NEEDED;
/*
  Ask every active node asynchronously which node it believes is the
  recovery master and block (via event_loop_once) until all replies are
  in.  Returns MONITOR_OK, MONITOR_FAILED, or MONITOR_ELECTION_NEEDED
  (when some node disagrees that "vnn" is the recmaster).
  NOTE(review): this chunk is elided -- the "continue" for inactive
  nodes, the rmdata->count/vnn initialisation, the final return and
  several closing braces are not visible here.
*/
1355 /* verify that all nodes agree that we are the recmaster */
1356 static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t vnn)
1358 struct verify_recmaster_data *rmdata;
1359 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1360 struct ctdb_client_control_state *state;
1361 enum monitor_result status;
1364 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1365 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1368 rmdata->status = MONITOR_OK;
1370 /* loop over all active nodes and send an async getrecmaster call to
1372 for (j=0; j<nodemap->num; j++) {
/* skip nodes that are banned/disconnected -- they cannot answer */
1373 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1376 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
1378 nodemap->nodes[j].pnn);
1379 if (state == NULL) {
1380 /* we failed to send the control, treat this as
1381 an error and try again next iteration
1383 DEBUG(0,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
/* mem_ctx also owns rmdata and all in-flight control states */
1384 talloc_free(mem_ctx);
1385 return MONITOR_FAILED;
1388 /* set up the callback functions */
1389 state->async.fn = verify_recmaster_callback;
1390 state->async.private = rmdata;
1392 /* one more control to wait for to complete */
1397 /* now wait for up to the maximum number of seconds allowed
1398 or until all nodes we expect a response from has replied
1400 while (rmdata->count > 0) {
1401 event_loop_once(ctdb->ev);
/* copy the result out before freeing the context that owns rmdata */
1404 status = rmdata->status;
1405 talloc_free(mem_ctx);
/*
  The recovery daemon's main monitoring loop: once per recover_interval
  it refreshes tunables, the local nodemap and vnnmap, tracks the
  recovery master, and compares cluster-wide state.  Any inconsistency
  triggers either force_election() or do_recovery().
  NOTE(review): this chunk of the file is heavily elided -- the loop's
  retry label ("goto again"-style control flow), error-check branches
  after many calls, continue/break statements and most closing braces
  are not visible here; comments below describe only the visible code.
*/
1411 the main monitoring loop
1413 static void monitor_cluster(struct ctdb_context *ctdb)
1415 uint32_t vnn, num_active, recmaster;
1416 TALLOC_CTX *mem_ctx=NULL;
1417 struct ctdb_node_map *nodemap=NULL;
1418 struct ctdb_node_map *remote_nodemap=NULL;
1419 struct ctdb_vnn_map *vnnmap=NULL;
1420 struct ctdb_vnn_map *remote_vnnmap=NULL;
1422 bool need_takeover_run;
1423 struct ctdb_recoverd *rec;
/* per-daemon recovery state; lives for the lifetime of the daemon,
   parented on the ctdb context */
1425 rec = talloc_zero(ctdb, struct ctdb_recoverd);
1426 CTDB_NO_MEMORY_FATAL(ctdb, rec);
/* one ban_state slot per node, indexed by pnn */
1429 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1430 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1432 rec->priority_time = timeval_current();
1434 /* register a message port for recovery elections */
1435 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
1437 /* and one for when nodes are disabled/enabled */
1438 ctdb_set_message_handler(ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, monitor_handler, rec);
1440 /* and one for when nodes are banned */
1441 ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
1443 /* and one for when nodes are unbanned */
1444 ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);
1447 need_takeover_run = false;
/* start of a monitoring iteration: throw away the previous
   iteration's temporary allocations and start a fresh context */
1450 talloc_free(mem_ctx);
1453 mem_ctx = talloc_new(ctdb);
1455 DEBUG(0,("Failed to create temporary context\n"));
1459 /* we only check for recovery once every second */
1460 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
1462 /* get relevant tunables */
1463 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
1465 DEBUG(0,("Failed to get tunables - retrying\n"));
/* find out which node we are */
1469 vnn = ctdb_ctrl_getvnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
1470 if (vnn == (uint32_t)-1) {
1471 DEBUG(0,("Failed to get local vnn - retrying\n"));
1475 /* get the vnnmap */
1476 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), vnn, mem_ctx, &vnnmap);
1478 DEBUG(0, (__location__ " Unable to get vnnmap from node %u\n", vnn));
1483 /* get number of nodes */
1484 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), vnn, mem_ctx, &nodemap);
1486 DEBUG(0, (__location__ " Unable to get nodemap from node %u\n", vnn));
1491 /* count how many active nodes there are */
1493 for (i=0; i<nodemap->num; i++) {
/* our local ban list is authoritative for the BANNED flag */
1494 if (rec->banned_nodes[nodemap->nodes[i].pnn] != NULL) {
1495 nodemap->nodes[i].flags |= NODE_FLAGS_BANNED;
1497 nodemap->nodes[i].flags &= ~NODE_FLAGS_BANNED;
1499 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
1505 /* check which node is the recovery master */
1506 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), vnn, &recmaster);
1508 DEBUG(0, (__location__ " Unable to get recmaster from node %u\n", vnn));
/* (uint32_t)-1 means no recmaster has ever been elected yet */
1512 if (recmaster == (uint32_t)-1) {
1513 DEBUG(0,(__location__ " Initial recovery master set - forcing election\n"));
1514 force_election(rec, mem_ctx, vnn, nodemap);
1518 /* verify that the recmaster node is still active */
1519 for (j=0; j<nodemap->num; j++) {
1520 if (nodemap->nodes[j].pnn==recmaster) {
1525 if (j == nodemap->num) {
1526 DEBUG(0, ("Recmaster node %u not in list. Force reelection\n", recmaster));
1527 force_election(rec, mem_ctx, vnn, nodemap);
1531 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1532 DEBUG(0, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
1533 force_election(rec, mem_ctx, vnn, nodemap);
1538 /* if we are not the recmaster then we do not need to check
1539 if recovery is needed
1541 if (vnn!=recmaster) {
/* everything from here on runs only on the recovery master */
1546 /* verify that all active nodes agree that we are the recmaster */
1547 switch (verify_recmaster(ctdb, nodemap, vnn)) {
1548 case MONITOR_RECOVERY_NEEDED:
1549 /* can not happen */
1551 case MONITOR_ELECTION_NEEDED:
1552 force_election(rec, mem_ctx, vnn, nodemap);
1556 case MONITOR_FAILED:
1561 /* verify that all active nodes are in normal mode
1562 and not in recovery mode
1564 switch (verify_recmode(ctdb, nodemap)) {
1565 case MONITOR_RECOVERY_NEEDED:
1566 do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1568 case MONITOR_FAILED:
1570 case MONITOR_ELECTION_NEEDED:
1571 /* can not happen */
1578 /* get the nodemap for all active remote nodes and verify
1579 they are the same as for this node
1581 for (j=0; j<nodemap->num; j++) {
1582 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1585 if (nodemap->nodes[j].pnn == vnn) {
1589 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1590 mem_ctx, &remote_nodemap);
1592 DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n",
1593 nodemap->nodes[j].pnn));
1597 /* if the nodes disagree on how many nodes there are
1598 then this is a good reason to try recovery
1600 if (remote_nodemap->num != nodemap->num) {
1601 DEBUG(0, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
1602 nodemap->nodes[j].pnn, remote_nodemap->num, nodemap->num));
1603 do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1607 /* if the nodes disagree on which nodes exist and are
1608 active, then that is also a good reason to do recovery
1610 for (i=0;i<nodemap->num;i++) {
1611 if (remote_nodemap->nodes[i].pnn != nodemap->nodes[i].pnn) {
1612 DEBUG(0, (__location__ " Remote node:%u has different nodemap vnn for %d (%u vs %u).\n",
1613 nodemap->nodes[j].pnn, i,
1614 remote_nodemap->nodes[i].pnn, nodemap->nodes[i].pnn));
1615 do_recovery(rec, mem_ctx, vnn, num_active, nodemap,
1616 vnnmap, nodemap->nodes[j].pnn);
/* also compare only the INACTIVE bit -- other flags may
   legitimately differ and are reconciled below */
1619 if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) !=
1620 (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
1621 DEBUG(0, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n",
1622 nodemap->nodes[j].pnn, i,
1623 remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags));
1624 do_recovery(rec, mem_ctx, vnn, num_active, nodemap,
1625 vnnmap, nodemap->nodes[j].pnn);
1630 /* update our nodemap flags according to the other
1631 server - this gets the NODE_FLAGS_DISABLED
1632 flag. Note that the remote node is authoritative
1633 for its flags (except CONNECTED, which we know
1634 matches in this code) */
1635 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1636 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1637 need_takeover_run = true;
1642 /* there better be the same number of lmasters in the vnn map
1643 as there are active nodes or we will have to do a recovery
1645 if (vnnmap->size != num_active) {
1646 DEBUG(0, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
1647 vnnmap->size, num_active));
1648 do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, ctdb->pnn);
1652 /* verify that all active nodes in the nodemap also exist in
1655 for (j=0; j<nodemap->num; j++) {
1656 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1659 if (nodemap->nodes[j].pnn == vnn) {
/* linear scan of the vnnmap for this node's pnn */
1663 for (i=0; i<vnnmap->size; i++) {
1664 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
1668 if (i == vnnmap->size) {
1669 DEBUG(0, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
1670 nodemap->nodes[j].pnn));
1671 do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1677 /* verify that all other nodes have the same vnnmap
1678 and are from the same generation
1680 for (j=0; j<nodemap->num; j++) {
1681 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1684 if (nodemap->nodes[j].pnn == vnn) {
1688 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1689 mem_ctx, &remote_vnnmap);
1691 DEBUG(0, (__location__ " Unable to get vnnmap from remote node %u\n",
1692 nodemap->nodes[j].pnn));
1696 /* verify the vnnmap generation is the same */
1697 if (vnnmap->generation != remote_vnnmap->generation) {
1698 DEBUG(0, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
1699 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
1700 do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1704 /* verify the vnnmap size is the same */
1705 if (vnnmap->size != remote_vnnmap->size) {
1706 DEBUG(0, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
1707 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
1708 do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1712 /* verify the vnnmap is the same */
1713 for (i=0;i<vnnmap->size;i++) {
1714 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
1715 DEBUG(0, (__location__ " Remote node %u has different vnnmap.\n",
1716 nodemap->nodes[j].pnn));
1717 do_recovery(rec, mem_ctx, vnn, num_active, nodemap,
1718 vnnmap, nodemap->nodes[j].pnn);
1724 /* we might need to change who has what IP assigned */
1725 if (need_takeover_run && ctdb->vnn_list) {
1726 ret = ctdb_takeover_run(ctdb, nodemap);
1728 DEBUG(0, (__location__ " Unable to setup public takeover addresses\n"));
/*
  fd event handler armed on the pipe to the main ctdbd (see
  ctdb_start_recoverd below): readability means the parent closed its
  end, i.e. the main daemon died, so the recovery daemon must not
  outlive it.
  NOTE(review): the exit call and closing brace of this function are
  elided from this chunk of the file.
*/
1737 event handler for when the main ctdbd dies
1739 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
1740 uint16_t flags, void *private_data)
1742 DEBUG(0,("recovery daemon parent died - exiting\n"));
1749 startup the recovery daemon as a child of the main ctdb daemon
1751 int ctdb_start_recoverd(struct ctdb_context *ctdb)
1757 if (pipe(fd) != 0) {
1773 /* shutdown the transport */
1774 ctdb->methods->shutdown(ctdb);
1776 /* get a new event context */
1777 talloc_free(ctdb->ev);
1778 ctdb->ev = event_context_init(ctdb);
1780 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
1781 ctdb_recoverd_parent, &fd[0]);
1783 close(ctdb->daemon.sd);
1784 ctdb->daemon.sd = -1;
1786 srandom(getpid() ^ time(NULL));
1788 /* initialise ctdb */
1789 ret = ctdb_socket_connect(ctdb);
1791 DEBUG(0, (__location__ " Failed to init ctdb\n"));
1795 monitor_cluster(ctdb);
1797 DEBUG(0,("ERROR: ctdb_recoverd finished!?\n"));