4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
26 #include "../include/ctdb.h"
27 #include "../include/ctdb_private.h"
/* NOTE(review): file-scope pointer to the recovery daemon state.  Where it is
   assigned is not visible in this extract — confirm ownership/initialisation. */
31 struct ctdb_recoverd *rec;
/*
 * Private state of the recovery daemon.
 * NOTE(review): the struct body appears truncated in this extract (the closing
 * brace and the need_recovery member referenced later are not visible).
 */
36 private state of recovery daemon
38 struct ctdb_recoverd {
39 struct ctdb_context *ctdb;	/* owning ctdb context */
40 uint32_t last_culprit;	/* pnn blamed for the most recent recovery (see do_recovery) */
41 uint32_t culprit_counter;	/* recoveries blamed on last_culprit within the grace period */
42 struct timeval first_recover_time;	/* when last_culprit first triggered a recovery */
43 struct ban_state **banned_nodes;	/* indexed by pnn; NULL entry == node not banned */
44 struct timeval priority_time;	/* election tie-breaker (copied into election_message) */
45 bool need_takeover_run;	/* set by monitor_handler, cleared in do_recovery */
/* Timeouts for controls / monitoring, derived from tunables.
   NOTE(review): both macros implicitly reference a variable named `ctdb` in
   the caller's scope — they only expand correctly inside functions that
   declare one. */
49 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
50 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
55 static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn)
57 struct ctdb_context *ctdb = rec->ctdb;
59 if (!ctdb_validate_pnn(ctdb, pnn)) {
60 DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", pnn));
64 if (rec->banned_nodes[pnn] == NULL) {
68 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, 0, NODE_FLAGS_BANNED);
70 talloc_free(rec->banned_nodes[pnn]);
71 rec->banned_nodes[pnn] = NULL;
/* Timed-event callback fired when a node's ban period expires: logs and
   lifts the ban on state->banned_node. */
76 called when a ban has timed out
78 static void ctdb_ban_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
80 struct ban_state *state = talloc_get_type(p, struct ban_state);
81 struct ctdb_recoverd *rec = state->rec;
82 uint32_t pnn = state->banned_node;
84 DEBUG(0,("Node %u is now unbanned\n", pnn));
85 ctdb_unban_node(rec, pnn);
/*
 * Ban node `pnn` for `ban_time` seconds: set its BANNED flag cluster-wide,
 * record ban state, and arm a timer that will unban it.
 * Banning ourselves additionally resets priority_time so we are less likely
 * to win the next recmaster election.
 */
89 ban a node for a period of time
91 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
93 struct ctdb_context *ctdb = rec->ctdb;
95 if (!ctdb_validate_pnn(ctdb, pnn)) {
96 DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", pnn));
100 if (pnn == ctdb->pnn) {
101 DEBUG(0,("self ban - lowering our election priority\n"));
102 /* banning ourselves - lower our election priority */
103 rec->priority_time = timeval_current();
106 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0);
108 rec->banned_nodes[pnn] = talloc(rec, struct ban_state);
109 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]);
111 rec->banned_nodes[pnn]->rec = rec;
112 rec->banned_nodes[pnn]->banned_node = pnn;
/* The timer is parented on the ban state, presumably so that freeing the ban
   state (in ctdb_unban_node) also cancels the timer — confirm talloc semantics. */
115 event_add_timed(ctdb->ev, rec->banned_nodes[pnn],
116 timeval_current_ofs(ban_time, 0),
117 ctdb_ban_timeout, rec->banned_nodes[pnn]);
/* Outcome of a monitoring/verification pass over the cluster. */
121 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/* Shared state for the async freeze-all-nodes operation.
   NOTE(review): a `count` member (outstanding replies) is used by
   freeze_all_nodes but is not visible in this extract. */
124 struct freeze_node_data {
126 enum monitor_result status;
/* Async completion callback for one freeze control: on failure, flag that a
   new recovery is needed.  Decrementing of the outstanding-reply counter is
   not visible in this extract. */
130 static void freeze_node_callback(struct ctdb_client_control_state *state)
132 struct freeze_node_data *fndata = talloc_get_type(state->async.private, struct freeze_node_data);
135 /* one more node has responded to our freeze node*/
138 /* if we failed to freeze the node, we must trigger another recovery */
139 if ( (state->state != CTDB_CONTROL_DONE) || (state->status != 0) ) {
140 DEBUG(0, (__location__ " Failed to freeze node:%u. recovery failed\n", state->c->hdr.destnode));
141 fndata->status = MONITOR_RECOVERY_NEEDED;
/*
 * Send an async freeze control to every active node, then pump the event loop
 * until all replies have arrived.  Returns MONITOR_OK on success, or
 * MONITOR_RECOVERY_NEEDED if any send/freeze failed.
 */
149 /* freeze all nodes */
150 static enum monitor_result freeze_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
152 struct freeze_node_data *fndata;
153 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
154 struct ctdb_client_control_state *state;
155 enum monitor_result status;
158 fndata = talloc(mem_ctx, struct freeze_node_data);
159 CTDB_NO_MEMORY_FATAL(ctdb, fndata);
161 fndata->status = MONITOR_OK;
163 /* loop over all active nodes and send an async freeze call to
165 for (j=0; j<nodemap->num; j++) {
166 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
169 state = ctdb_ctrl_freeze_send(ctdb, mem_ctx,
171 nodemap->nodes[j].pnn);
173 /* we failed to send the control, treat this as
174 an error and try again next iteration
176 DEBUG(0,("Failed to call ctdb_ctrl_freeze_send during recovery\n"));
177 talloc_free(mem_ctx);
178 return MONITOR_RECOVERY_NEEDED;
181 /* set up the callback functions */
182 state->async.fn = freeze_node_callback;
183 state->async.private = fndata;
185 /* one more control to wait for to complete */
190 /* now wait for up to the maximum number of seconds allowed
191 or until all nodes we expect a response from has replied
193 while (fndata->count > 0) {
194 event_loop_once(ctdb->ev);
197 status = fndata->status;
198 talloc_free(mem_ctx);
/*
 * Set the recovery mode on all active nodes.  Going to ACTIVE first freezes
 * every node; going back to NORMAL also thaws each node after setting the
 * mode.  Inactive nodes are skipped.
 */
204 change recovery mode on all nodes
206 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
210 /* freeze all nodes */
211 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
212 ret = freeze_all_nodes(ctdb, nodemap);
213 if (ret != MONITOR_OK) {
214 DEBUG(0, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
220 /* set recovery mode to active on all nodes */
221 for (j=0; j<nodemap->num; j++) {
222 /* dont change it for nodes that are unavailable */
223 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
227 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, rec_mode);
229 DEBUG(0, (__location__ " Unable to set recmode on node %u\n", nodemap->nodes[j].pnn));
233 if (rec_mode == CTDB_RECOVERY_NORMAL) {
234 ret = ctdb_ctrl_thaw(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn);
236 DEBUG(0, (__location__ " Unable to thaw node %u\n", nodemap->nodes[j].pnn));
/* Tell every active node that `pnn` is the recovery master. */
246 change recovery master on all node
248 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
252 /* set recovery master to pnn on all nodes */
253 for (j=0; j<nodemap->num; j++) {
254 /* dont change it for nodes that are unavailable */
255 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
259 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, pnn);
261 DEBUG(0, (__location__ " Unable to set recmaster on node %u\n", nodemap->nodes[j].pnn));
/*
 * For every active remote node, fetch its dbmap and create (by name) any
 * database that exists locally but not on that node.
 */
271 ensure all other nodes have attached to any databases that we have
273 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
274 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
277 struct ctdb_dbid_map *remote_dbmap;
279 /* verify that all other nodes have all our databases */
280 for (j=0; j<nodemap->num; j++) {
281 /* no need to check our own node */
282 if (nodemap->nodes[j].pnn == pnn) {
285 /* dont check nodes that are unavailable */
286 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
290 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
291 mem_ctx, &remote_dbmap);
293 DEBUG(0, (__location__ " Unable to get dbids from node %u\n", pnn));
297 /* step through all local databases */
298 for (db=0; db<dbmap->num;db++) {
302 for (i=0;i<remote_dbmap->num;i++) {
303 if (dbmap->dbids[db] == remote_dbmap->dbids[i]) {
307 /* the remote node already have this database */
308 if (i!=remote_dbmap->num) {
311 /* ok so we need to create this database */
312 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbids[db], mem_ctx, &name);
314 DEBUG(0, (__location__ " Unable to get dbname from node %u\n", pnn));
317 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, name);
319 DEBUG(0, (__location__ " Unable to create remote db:%s\n", name));
/*
 * Mirror of create_missing_remote_databases: attach locally to any database
 * some other active node has that we do not, then re-read our own dbmap
 * (hence the struct ctdb_dbid_map ** out-parameter).
 */
330 ensure we are attached to any databases that anyone else is attached to
332 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
333 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
336 struct ctdb_dbid_map *remote_dbmap;
338 /* verify that we have all database any other node has */
339 for (j=0; j<nodemap->num; j++) {
340 /* no need to check our own node */
341 if (nodemap->nodes[j].pnn == pnn) {
344 /* dont check nodes that are unavailable */
345 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
349 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
350 mem_ctx, &remote_dbmap);
352 DEBUG(0, (__location__ " Unable to get dbids from node %u\n", pnn));
356 /* step through all databases on the remote node */
357 for (db=0; db<remote_dbmap->num;db++) {
360 for (i=0;i<(*dbmap)->num;i++) {
361 if (remote_dbmap->dbids[db] == (*dbmap)->dbids[i]) {
365 /* we already have this db locally */
366 if (i!=(*dbmap)->num) {
369 /* ok so we need to create this database and
372 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
373 remote_dbmap->dbids[db], mem_ctx, &name);
375 DEBUG(0, (__location__ " Unable to get dbname from node %u\n",
376 nodemap->nodes[j].pnn));
379 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name);
381 DEBUG(0, (__location__ " Unable to create local db:%s\n", name));
/* re-read our local dbmap so the caller sees the newly attached databases */
384 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
386 DEBUG(0, (__location__ " Unable to reread dbmap on node %u\n", pnn));
/*
 * Copy every database from every other active node onto this node; the copy
 * merges records based on rsn (record sequence number).
 */
397 pull all the remote database contents into ours
399 static int pull_all_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
400 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
404 /* pull all records from all other nodes across onto this node
405 (this merges based on rsn)
407 for (i=0;i<dbmap->num;i++) {
408 for (j=0; j<nodemap->num; j++) {
409 /* we dont need to merge with ourselves */
410 if (nodemap->nodes[j].pnn == pnn) {
413 /* dont merge from nodes that are unavailable */
414 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
417 ret = ctdb_ctrl_copydb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
418 pnn, dbmap->dbids[i], CTDB_LMASTER_ANY, mem_ctx);
420 DEBUG(0, (__location__ " Unable to copy db from node %u to node %u\n",
421 nodemap->nodes[j].pnn, pnn));
/* Repoint the dmaster for every database on every active node to `pnn`. */
432 change the dmaster on all databases to point to us
434 static int update_dmaster_on_all_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
435 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
439 /* update dmaster to point to this node for all databases/nodes */
440 for (i=0;i<dbmap->num;i++) {
441 for (j=0; j<nodemap->num; j++) {
442 /* dont repoint nodes that are unavailable */
443 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
446 ret = ctdb_ctrl_setdmaster(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, ctdb, dbmap->dbids[i], pnn);
448 DEBUG(0, (__location__ " Unable to set dmaster for node %u db:0x%08x\n", nodemap->nodes[j].pnn, dbmap->dbids[i]));
/*
 * Broadcast a NODE_FLAGS_CHANGED message for every node so all connected
 * nodes converge on the flag values in our nodemap (old == new flags, so the
 * message is a pure resync, not a change notification).
 */
459 update flags on all active nodes
461 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
464 for (i=0;i<nodemap->num;i++) {
465 struct ctdb_node_flag_change c;
468 c.pnn = nodemap->nodes[i].pnn;
469 c.old_flags = nodemap->nodes[i].flags;
470 c.new_flags = nodemap->nodes[i].flags;
472 data.dptr = (uint8_t *)&c;
473 data.dsize = sizeof(c);
475 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
476 CTDB_SRVID_NODE_FLAGS_CHANGED, data);
/*
 * Vacuum one database across the cluster: find the local max rsn, bump all
 * non-empty records to max_rsn+1 on every active node, then delete records
 * whose rsn is still below max_rsn+1 (i.e. the empty ones).
 */
485 static int vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb_node_map *nodemap)
490 /* find max rsn on our local node for this db */
491 ret = ctdb_ctrl_get_max_rsn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, db_id, &max_rsn);
496 /* set rsn on non-empty records to max_rsn+1 */
497 for (i=0;i<nodemap->num;i++) {
498 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
501 ret = ctdb_ctrl_set_rsn_nonempty(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn,
504 DEBUG(0,(__location__ " Failed to set rsn on node %u to %llu\n",
505 nodemap->nodes[i].pnn, (unsigned long long)max_rsn+1));
510 /* delete records with rsn < max_rsn+1 on all nodes */
511 for (i=0;i<nodemap->num;i++) {
512 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
515 ret = ctdb_ctrl_delete_low_rsn(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn,
518 DEBUG(0,(__location__ " Failed to delete records on node %u with rsn below %llu\n",
519 nodemap->nodes[i].pnn, (unsigned long long)max_rsn+1));
/* Run vacuum_db over every attached database; fails if any one fails. */
530 vacuum all attached databases
532 static int vacuum_all_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
533 struct ctdb_dbid_map *dbmap)
537 /* run a vacuum pass over every attached database */
538 for (i=0;i<dbmap->num;i++) {
539 if (vacuum_db(ctdb, dbmap->dbids[i], nodemap) != 0) {
/* Copy every local database out to every other active node (reverse direction
   of pull_all_remote_databases). */
548 push out all our database contents to all other nodes
550 static int push_all_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
551 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
555 /* push all records out to the nodes again */
556 for (i=0;i<dbmap->num;i++) {
557 for (j=0; j<nodemap->num; j++) {
558 /* we dont need to push to ourselves */
559 if (nodemap->nodes[j].pnn == pnn) {
562 /* dont push to nodes that are unavailable */
563 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
566 ret = ctdb_ctrl_copydb(ctdb, CONTROL_TIMEOUT(), pnn, nodemap->nodes[j].pnn,
567 dbmap->dbids[i], CTDB_LMASTER_ANY, mem_ctx);
569 DEBUG(0, (__location__ " Unable to copy db from node %u to node %u\n",
570 pnn, nodemap->nodes[j].pnn));
/* Push our vnnmap out to every active node.
   NOTE(review): the failure message prints `pnn` (our node) rather than
   nodemap->nodes[j].pnn (the node that actually failed) — likely a bug worth
   fixing when the full source is available. */
581 ensure all nodes have the same vnnmap we do
583 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
584 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
588 /* push the new vnn map out to all the nodes */
589 for (j=0; j<nodemap->num; j++) {
590 /* dont push to nodes that are unavailable */
591 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
595 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
597 DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/*
 * Message handler for administrator ban requests.  Validates the payload,
 * then acts only if this node is the current recmaster (so bans are applied
 * exactly once, by the master).
 */
607 handler for when the admin bans a node
609 static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
610 TDB_DATA data, void *private_data)
612 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
613 struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
614 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
618 if (data.dsize != sizeof(*b)) {
619 DEBUG(0,("Bad data in ban_handler\n"));
620 talloc_free(mem_ctx);
624 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
626 DEBUG(0,(__location__ " Failed to find the recmaster\n"));
627 talloc_free(mem_ctx);
631 if (recmaster != ctdb->pnn) {
632 DEBUG(0,("We are not the recmaster - ignoring ban request\n"));
633 talloc_free(mem_ctx);
637 DEBUG(0,("Node %u has been banned for %u seconds by the administrator\n",
638 b->pnn, b->ban_time));
639 ctdb_ban_node(rec, b->pnn, b->ban_time);
640 talloc_free(mem_ctx);
/*
 * Message handler for administrator unban requests; mirror of ban_handler.
 * Only the current recmaster acts on the request.
 */
644 handler for when the admin unbans a node
646 static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
647 TDB_DATA data, void *private_data)
649 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
650 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
655 if (data.dsize != sizeof(uint32_t)) {
656 DEBUG(0,("Bad data in unban_handler\n"));
657 talloc_free(mem_ctx);
660 pnn = *(uint32_t *)data.dptr;
662 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
664 DEBUG(0,(__location__ " Failed to find the recmaster\n"));
665 talloc_free(mem_ctx);
669 if (recmaster != ctdb->pnn) {
670 DEBUG(0,("We are not the recmaster - ignoring unban request\n"));
671 talloc_free(mem_ctx);
675 DEBUG(0,("Node %u has been unbanned by the administrator\n", pnn));
676 ctdb_unban_node(rec, pnn);
677 talloc_free(mem_ctx);
/* Timed-event callback for ctdb_wait_timeout.
   NOTE(review): the line setting *timed_out is not visible in this extract —
   presumably `*timed_out = 1;` — confirm against the full source. */
683 called when ctdb_wait_timeout should finish
685 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
686 struct timeval yt, void *p)
688 uint32_t *timed_out = (uint32_t *)p;
/* Block for `secs` seconds while still servicing the event loop: arm a timer
   that flips timed_out, then loop on event_loop_once until it fires
   (the loop condition is not visible in this extract). */
693 wait for a given number of seconds
695 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
697 uint32_t timed_out = 0;
698 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
700 event_loop_once(ctdb->ev);
/* Pick a fresh random generation id, retrying until it differs from
   INVALID_GENERATION (loop structure not fully visible in this extract). */
704 /* Create a new random generation ip.
705 The generation id can not be the INVALID_GENERATION id
707 static uint32_t new_generation(void)
712 generation = random();
714 if (generation != INVALID_GENERATION) {
/*
 * Main recovery sequence, run only on the recmaster.  Phases:
 *   1. culprit accounting — ban a node that keeps causing recoveries;
 *   2. take the recovery lock and freeze the cluster (recmode ACTIVE);
 *   3. bump the generation id locally only, so an aborted recovery leaves the
 *      cluster inconsistent and recovery is re-entered;
 *   4. reconcile databases (create missing remote/local, pull, push);
 *   5. build and distribute a new vnnmap of active nodes, reassert recmaster,
 *      repoint dmaster, resync flags, vacuum;
 *   6. optional ip takeover run, thaw (recmode NORMAL), broadcast RECONFIGURE;
 *   7. suppress further recoveries for rerecovery_timeout.
 */
723 we are the recmaster, and recovery is needed - start a recovery run
725 static int do_recovery(struct ctdb_recoverd *rec,
726 TALLOC_CTX *mem_ctx, uint32_t pnn, uint32_t num_active,
727 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
730 struct ctdb_context *ctdb = rec->ctdb;
733 struct ctdb_dbid_map *dbmap;
735 /* if recovery fails, force it again */
736 rec->need_recovery = true;
738 if (rec->last_culprit != culprit ||
739 timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
740 /* either a new node is the culprit, or we've decide to forgive them */
741 rec->last_culprit = culprit;
742 rec->first_recover_time = timeval_current();
743 rec->culprit_counter = 0;
745 rec->culprit_counter++;
747 if (rec->culprit_counter > 2*nodemap->num) {
748 DEBUG(0,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
749 culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
750 ctdb->tunable.recovery_ban_period));
751 ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);
754 if (!ctdb_recovery_lock(ctdb, true)) {
755 DEBUG(0,("Unable to get recovery lock - aborting recovery\n"));
759 /* set recovery mode to active on all nodes */
760 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
762 DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
766 DEBUG(0, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
768 /* pick a new generation number */
769 generation = new_generation();
771 /* change the vnnmap on this node to use the new generation
772 number but not on any other nodes.
773 this guarantees that if we abort the recovery prematurely
774 for some reason (a node stops responding?)
775 that we can just return immediately and we will reenter
776 recovery shortly again.
777 I.e. we deliberately leave the cluster with an inconsistent
778 generation id to allow us to abort recovery at any stage and
779 just restart it from scratch.
781 vnnmap->generation = generation;
782 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
784 DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", pnn));
788 /* get a list of all databases */
789 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
791 DEBUG(0, (__location__ " Unable to get dbids from node :%u\n", pnn));
797 /* verify that all other nodes have all our databases */
798 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
800 DEBUG(0, (__location__ " Unable to create missing remote databases\n"));
804 /* verify that we have all the databases any other node has */
805 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
807 DEBUG(0, (__location__ " Unable to create missing local databases\n"));
/* run the remote check a second time: attaching locally above may have added
   databases that other nodes still lack */
813 /* verify that all other nodes have all our databases */
814 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
816 DEBUG(0, (__location__ " Unable to create missing remote databases\n"));
821 DEBUG(1, (__location__ " Recovery - created remote databases\n"));
823 /* pull all remote databases onto the local node */
824 ret = pull_all_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
826 DEBUG(0, (__location__ " Unable to pull remote databases\n"));
830 DEBUG(1, (__location__ " Recovery - pulled remote databases\n"));
832 /* push all local databases to the remote nodes */
833 ret = push_all_local_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
835 DEBUG(0, (__location__ " Unable to push local databases\n"));
839 DEBUG(1, (__location__ " Recovery - pushed remote databases\n"));
841 /* build a new vnn map with all the currently active and
843 generation = new_generation();
844 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
845 CTDB_NO_MEMORY(ctdb, vnnmap);
846 vnnmap->generation = generation;
847 vnnmap->size = num_active;
848 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
849 for (i=j=0;i<nodemap->num;i++) {
850 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
851 vnnmap->map[j++] = nodemap->nodes[i].pnn;
857 /* update to the new vnnmap on all nodes */
858 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
860 DEBUG(0, (__location__ " Unable to update vnnmap on all nodes\n"));
864 DEBUG(1, (__location__ " Recovery - updated vnnmap\n"));
866 /* update recmaster to point to us for all nodes */
867 ret = set_recovery_master(ctdb, nodemap, pnn);
869 DEBUG(0, (__location__ " Unable to set recovery master\n"));
873 DEBUG(1, (__location__ " Recovery - updated recmaster\n"));
875 /* repoint all local and remote database records to the local
876 node as being dmaster
878 ret = update_dmaster_on_all_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
880 DEBUG(0, (__location__ " Unable to update dmaster on all databases\n"));
884 DEBUG(1, (__location__ " Recovery - updated dmaster on all databases\n"));
887 update all nodes to have the same flags that we have
889 ret = update_flags_on_all_nodes(ctdb, nodemap);
891 DEBUG(0, (__location__ " Unable to update flags on all nodes\n"));
895 DEBUG(1, (__location__ " Recovery - updated flags\n"));
898 run a vacuum operation on empty records
900 ret = vacuum_all_databases(ctdb, nodemap, dbmap);
902 DEBUG(0, (__location__ " Unable to vacuum all databases\n"));
906 DEBUG(1, (__location__ " Recovery - vacuumed all databases\n"));
909 if enabled, tell nodes to takeover their public IPs
912 rec->need_takeover_run = false;
913 ret = ctdb_takeover_run(ctdb, nodemap);
915 DEBUG(0, (__location__ " Unable to setup public takeover addresses\n"));
918 DEBUG(1, (__location__ " Recovery - done takeover\n"));
922 /* disable recovery mode */
923 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
925 DEBUG(0, (__location__ " Unable to set recovery mode to normal on cluster\n"));
929 /* send a message to all clients telling them that the cluster
930 has been reconfigured */
931 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
933 DEBUG(0, (__location__ " Recovery complete\n"));
935 rec->need_recovery = false;
937 /* We just finished a recovery successfully.
938 We now wait for rerecovery_timeout before we allow
939 another recovery to take place.
941 DEBUG(0, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
942 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
943 DEBUG(0, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
/* Payload broadcast during recmaster elections.  Winner is decided by, in
   order: most connected nodes, then priority_time, then pnn
   (see ctdb_election_win).  A pnn member is used but not visible here. */
950 elections are won by first checking the number of connected nodes, then
951 the priority time, then the pnn
953 struct election_message {
954 uint32_t num_connected;
955 struct timeval priority_time;
/* Fill `em` with this node's election credentials: our pnn, our
   priority_time, and a count of non-disconnected nodes from the nodemap. */
960 form this nodes election data
962 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
965 struct ctdb_node_map *nodemap;
966 struct ctdb_context *ctdb = rec->ctdb;
970 em->pnn = rec->ctdb->pnn;
971 em->priority_time = rec->priority_time;
973 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
978 for (i=0;i<nodemap->num;i++) {
979 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
983 talloc_free(nodemap);
/* Compare our election data against `em` and decide whether we should win:
   connectivity first, then priority_time, then pnn as the final tie-break.
   (The return statements between comparisons are not visible in this
   extract.) */
987 see if the given election data wins
989 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
991 struct election_message myem;
994 ctdb_election_data(rec, &myem);
996 /* try to use the most connected node */
997 cmp = (int)myem.num_connected - (int)em->num_connected;
999 /* then the longest running node */
1001 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1005 cmp = (int)myem.pnn - (int)em->pnn;
/*
 * Start an election round: optimistically set ourselves as recmaster on the
 * local node, then broadcast our election credentials to all nodes so any
 * better candidate can respond.
 */
1012 send out an election request
1014 static int send_election_request(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t pnn)
1017 TDB_DATA election_data;
1018 struct election_message emsg;
1020 struct ctdb_context *ctdb = rec->ctdb;
1022 srvid = CTDB_SRVID_RECOVERY;
1024 ctdb_election_data(rec, &emsg);
1026 election_data.dsize = sizeof(struct election_message);
1027 election_data.dptr = (unsigned char *)&emsg;
1030 /* first we assume we will win the election and set
1031 recoverymaster to be ourself on the current node
1033 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1035 DEBUG(0, (__location__ " failed to send recmaster election request\n"));
1040 /* send an election message to all active nodes */
1041 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
/* Clear the BANNED flag on every connected node that currently has it set. */
1047 this function will unban all nodes in the cluster
1049 static void unban_all_nodes(struct ctdb_context *ctdb)
1052 struct ctdb_node_map *nodemap;
1053 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1055 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1057 DEBUG(0,(__location__ " failed to get nodemap to unban all nodes\n"));
1061 for (i=0;i<nodemap->num;i++) {
1062 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1063 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1064 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1068 talloc_free(tmp_ctx);
/*
 * Message handler for recmaster election broadcasts.  If our credentials beat
 * the sender's, counter with our own election request; otherwise concede:
 * release the recovery lock if we hold it, unban everyone, record the sender
 * as recmaster, and reset our local ban bookkeeping.
 */
1072 handler for recovery master elections
1074 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1075 TDB_DATA data, void *private_data)
1077 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1079 struct election_message *em = (struct election_message *)data.dptr;
1080 TALLOC_CTX *mem_ctx;
1082 mem_ctx = talloc_new(ctdb);
1084 /* someone called an election. check their election data
1085 and if we disagree and we would rather be the elected node,
1086 send a new election message to all other nodes
1088 if (ctdb_election_win(rec, em)) {
1089 ret = send_election_request(rec, mem_ctx, ctdb_get_pnn(ctdb));
1091 DEBUG(0, (__location__ " failed to initiate recmaster election"));
1093 talloc_free(mem_ctx);
1094 /*unban_all_nodes(ctdb);*/
1098 /* release the recmaster lock */
1099 if (em->pnn != ctdb->pnn &&
1100 ctdb->recovery_lock_fd != -1) {
1101 close(ctdb->recovery_lock_fd);
1102 ctdb->recovery_lock_fd = -1;
1103 unban_all_nodes(ctdb);
1106 /* ok, let that guy become recmaster then */
1107 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1109 DEBUG(0, (__location__ " failed to send recmaster election request"));
1110 talloc_free(mem_ctx);
/* forget all local ban state — the new recmaster owns ban decisions now */
1114 /* release any bans */
1115 rec->last_culprit = (uint32_t)-1;
1116 talloc_free(rec->banned_nodes);
1117 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1118 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1120 talloc_free(mem_ctx);
/*
 * Force a recmaster election: freeze the cluster to stop internode traffic,
 * broadcast our election request, then wait election_timeout seconds for
 * competing responses to arrive.
 */
1126 force the start of the election process
1128 static void force_election(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t pnn,
1129 struct ctdb_node_map *nodemap)
1132 struct ctdb_context *ctdb = rec->ctdb;
1134 /* set all nodes to recovery mode to stop all internode traffic */
1135 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1137 DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
1141 ret = send_election_request(rec, mem_ctx, pnn);
1143 DEBUG(0, (__location__ " failed to initiate recmaster election"));
1147 /* wait for a few seconds to collect all responses */
1148 ctdb_wait_timeout(ctdb, ctdb->tunable.election_timeout);
/*
 * Message handler for NODE_FLAGS_CHANGED.  Validates the payload against the
 * current nodemap, ignores remote attempts to change the locally-managed
 * DISCONNECTED flag, records the new flags, and — when we are the recmaster
 * and the cluster is in normal mode — schedules an ip takeover run if the
 * DISABLED flag changed.
 */
1154 handler for when a node changes its flags
1156 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1157 TDB_DATA data, void *private_data)
1160 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1161 struct ctdb_node_map *nodemap=NULL;
1162 TALLOC_CTX *tmp_ctx;
1163 uint32_t changed_flags;
1165 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1167 if (data.dsize != sizeof(*c)) {
1168 DEBUG(0,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1172 tmp_ctx = talloc_new(ctdb);
1173 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1175 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1177 for (i=0;i<nodemap->num;i++) {
1178 if (nodemap->nodes[i].pnn == c->pnn) break;
1181 if (i == nodemap->num) {
1182 DEBUG(0,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1183 talloc_free(tmp_ctx);
1187 changed_flags = c->old_flags ^ c->new_flags;
1189 /* Dont let messages from remote nodes change the DISCONNECTED flag.
1190 This flag is handled locally based on whether the local node
1191 can communicate with the node or not.
1193 c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
1194 if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) {
1195 c->new_flags |= NODE_FLAGS_DISCONNECTED;
1198 if (nodemap->nodes[i].flags != c->new_flags) {
1199 DEBUG(0,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1202 nodemap->nodes[i].flags = c->new_flags;
1204 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1205 CTDB_CURRENT_NODE, &ctdb->recovery_master);
1208 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1209 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
1213 ctdb->recovery_master == ctdb->pnn &&
1214 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL &&
1216 /* Only do the takeover run if the perm disabled or unhealthy
1217 flags changed since these will cause an ip failover but not
1219 If the node became disconnected or banned this will also
1220 lead to an ip address failover but that is handled
1223 if (changed_flags & NODE_FLAGS_DISABLED) {
1224 rec->need_takeover_run = true;
1228 talloc_free(tmp_ctx);
/* Shared state for the async recmode verification pass.
   NOTE(review): a `count` member (outstanding replies) is used by
   verify_recmode but is not visible in this extract. */
1233 struct verify_recmode_normal_data {
1235 enum monitor_result status;
/*
 * Async completion callback for one getrecmode control: a transport failure
 * downgrades status to MONITOR_FAILED (without masking a stronger verdict);
 * a node reporting non-NORMAL recmode forces MONITOR_RECOVERY_NEEDED.
 */
1238 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1240 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private, struct verify_recmode_normal_data);
1243 /* one more node has responded with recmode data*/
1246 /* if we failed to get the recmode, then return an error and let
1247 the main loop try again.
1249 if (state->state != CTDB_CONTROL_DONE) {
1250 if (rmdata->status == MONITOR_OK) {
1251 rmdata->status = MONITOR_FAILED;
1256 /* if we got a response, then the recmode will be stored in the
1259 if (state->status != CTDB_RECOVERY_NORMAL) {
1260 DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
1261 rmdata->status = MONITOR_RECOVERY_NEEDED;
/*
 * Ask every active node for its recovery mode asynchronously and pump the
 * event loop until all replies are in.  Returns MONITOR_OK only if every
 * node reported CTDB_RECOVERY_NORMAL.
 */
1268 /* verify that all nodes are in normal recovery mode */
1269 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1271 struct verify_recmode_normal_data *rmdata;
1272 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1273 struct ctdb_client_control_state *state;
1274 enum monitor_result status;
1277 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1278 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1280 rmdata->status = MONITOR_OK;
1282 /* loop over all active nodes and send an async getrecmode call to
1284 for (j=0; j<nodemap->num; j++) {
1285 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1288 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1290 nodemap->nodes[j].pnn);
1291 if (state == NULL) {
1292 /* we failed to send the control, treat this as
1293 an error and try again next iteration
1295 DEBUG(0,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1296 talloc_free(mem_ctx);
1297 return MONITOR_FAILED;
1300 /* set up the callback functions */
1301 state->async.fn = verify_recmode_normal_callback;
1302 state->async.private = rmdata;
1304 /* one more control to wait for to complete */
1309 /* now wait for up to the maximum number of seconds allowed
1310 or until all nodes we expect a response from has replied
1312 while (rmdata->count > 0) {
1313 event_loop_once(ctdb->ev);
1316 status = rmdata->status;
1317 talloc_free(mem_ctx);
/* Shared state for the async "verify recmaster" fan-out: carries the pnn
 * this node believes is recmaster (checked against each reply) and the
 * aggregated status.  NOTE(review): listing elides lines — the struct also
 * appears to hold a pending-reply counter and the expected pnn (used below
 * as rmdata->count / rmdata->pnn); confirm against the full source. */
1322 struct verify_recmaster_data {
1325 enum monitor_result status;
/* Completion callback for one async GET_RECMASTER control sent by
 * verify_recmaster().  Downgrades the shared status:
 *  - control did not complete -> MONITOR_FAILED (only if still MONITOR_OK);
 *  - node names a different recmaster than rmdata->pnn (us)
 *    -> MONITOR_ELECTION_NEEDED, prompting a new election.
 * NOTE(review): listing elides the count decrement / early-return lines. */
1328 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1330 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private, struct verify_recmaster_data);
1333 /* one more node has responded with recmaster data*/
1336 /* if we failed to get the recmaster, then return an error and let
1337 the main loop try again.
1339 if (state->state != CTDB_CONTROL_DONE) {
1340 if (rmdata->status == MONITOR_OK) {
1341 rmdata->status = MONITOR_FAILED;
1346 /* if we got a response, then the recmaster will be stored in the
1349 if (state->status != rmdata->pnn) {
1350 DEBUG(0,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
1351 rmdata->status = MONITOR_ELECTION_NEEDED;
1358 /* verify that all nodes agree that we are the recmaster */
/* Fans out an async GET_RECMASTER control to every active node and waits
 * (via the event loop) for all replies.  `pnn` is this node's pnn, i.e. the
 * recmaster candidate the replies are compared against.
 * Returns:
 *   MONITOR_OK              - every active node agrees we are recmaster
 *   MONITOR_ELECTION_NEEDED - some node disagrees; caller forces an election
 *   MONITOR_FAILED          - a control could not be sent or did not complete
 * Mirrors verify_recmode() structurally; all allocations hang off a local
 * talloc context freed on every exit path. */
1359 static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
1361 struct verify_recmaster_data *rmdata;
1362 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1363 struct ctdb_client_control_state *state;
1364 enum monitor_result status;
1367 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1368 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1371 rmdata->status = MONITOR_OK;
1373 /* loop over all active nodes and send an async getrecmaster call to
1375 for (j=0; j<nodemap->num; j++) {
1376 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1379 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
1381 nodemap->nodes[j].pnn);
1382 if (state == NULL) {
1383 /* we failed to send the control, treat this as
1384 an error and try again next iteration
1386 DEBUG(0,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
1387 talloc_free(mem_ctx);
1388 return MONITOR_FAILED;
1391 /* set up the callback functions */
1392 state->async.fn = verify_recmaster_callback;
1393 state->async.private = rmdata;
1395 /* one more control to wait for to complete */
1400 /* now wait for up to the maximum number of seconds allowed
1401 or until all nodes we expect a response from has replied
1403 while (rmdata->count > 0) {
1404 event_loop_once(ctdb->ev);
/* copy status out before freeing the context that owns rmdata */
1407 status = rmdata->status;
1408 talloc_free(mem_ctx);
1414 the main monitoring loop
/* The recovery daemon's main loop (never returns in normal operation).
 * One-time setup: allocate the ctdb_recoverd state, the per-pnn banned_nodes
 * table, and register message handlers for elections, flag changes, bans and
 * unbans.  Then, roughly once per recover_interval, it:
 *   1. refreshes tunables, local pnn, vnnmap and nodemap;
 *   2. overlays local ban state onto the nodemap flags and counts active nodes;
 *   3. checks who the recmaster is, forcing an election if unset or if the
 *      recmaster is missing/inactive; non-recmaster nodes stop here;
 *   4. as recmaster: refreshes each active node's public-ip list, verifies
 *      all nodes agree we are recmaster and are in normal recovery mode,
 *      cross-checks every remote nodemap and vnnmap (count, pnns, flags,
 *      generation, size, contents) against our own, triggering do_recovery()
 *      on any disagreement;
 *   5. runs an IP takeover pass when need_takeover_run is set.
 * NOTE(review): this listing elides many lines (per-iteration `continue`s,
 * `goto again`-style retries, closing braces, i/j/ret declarations) — the
 * comments below describe only what the visible lines establish. */
1416 static void monitor_cluster(struct ctdb_context *ctdb)
1418 uint32_t pnn, num_active, recmaster;
1419 TALLOC_CTX *mem_ctx=NULL;
1420 struct ctdb_node_map *nodemap=NULL;
1421 struct ctdb_node_map *remote_nodemap=NULL;
1422 struct ctdb_vnn_map *vnnmap=NULL;
1423 struct ctdb_vnn_map *remote_vnnmap=NULL;
1425 struct ctdb_recoverd *rec;
1427 rec = talloc_zero(ctdb, struct ctdb_recoverd);
1428 CTDB_NO_MEMORY_FATAL(ctdb, rec);
/* one ban_state slot per possible pnn; NULL means "not banned" */
1431 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1432 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1434 rec->priority_time = timeval_current();
1436 /* register a message port for recovery elections */
1437 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
1439 /* and one for when nodes are disabled/enabled */
1440 ctdb_set_message_handler(ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, monitor_handler, rec);
1442 /* and one for when nodes are banned */
1443 ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
1445 /* and one for when nodes are unbanned */
1446 ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);
/* top of the per-iteration retry path: drop last iteration's temp context
 * and start a fresh one so per-pass allocations cannot accumulate */
1450 talloc_free(mem_ctx);
1453 mem_ctx = talloc_new(ctdb);
1455 DEBUG(0,("Failed to create temporary context\n"));
1459 /* we only check for recovery once every second */
1460 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
1462 /* get relevant tunables */
1463 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
1465 DEBUG(0,("Failed to get tunables - retrying\n"));
1469 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
1470 if (pnn == (uint32_t)-1) {
1471 DEBUG(0,("Failed to get local pnn - retrying\n"));
1475 /* get the vnnmap */
1476 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
1478 DEBUG(0, (__location__ " Unable to get vnnmap from node %u\n", pnn));
1483 /* get number of nodes */
1484 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &nodemap);
1486 DEBUG(0, (__location__ " Unable to get nodemap from node %u\n", pnn));
1491 /* count how many active nodes there are */
/* overlay our local ban table onto the freshly-fetched nodemap flags */
1493 for (i=0; i<nodemap->num; i++) {
1494 if (rec->banned_nodes[nodemap->nodes[i].pnn] != NULL) {
1495 nodemap->nodes[i].flags |= NODE_FLAGS_BANNED;
1497 nodemap->nodes[i].flags &= ~NODE_FLAGS_BANNED;
1499 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
1505 /* check which node is the recovery master */
1506 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &recmaster);
1508 DEBUG(0, (__location__ " Unable to get recmaster from node %u\n", pnn));
/* (uint32_t)-1 means no recmaster has ever been elected */
1512 if (recmaster == (uint32_t)-1) {
1513 DEBUG(0,(__location__ " Initial recovery master set - forcing election\n"));
1514 force_election(rec, mem_ctx, pnn, nodemap);
1518 /* verify that the recmaster node is still active */
1519 for (j=0; j<nodemap->num; j++) {
1520 if (nodemap->nodes[j].pnn==recmaster) {
/* j == num means the loop above found no matching entry */
1525 if (j == nodemap->num) {
1526 DEBUG(0, ("Recmaster node %u not in list. Force reelection\n", recmaster));
1527 force_election(rec, mem_ctx, pnn, nodemap);
1531 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1532 DEBUG(0, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
1533 force_election(rec, mem_ctx, pnn, nodemap);
1538 /* if we are not the recmaster then we do not need to check
1539 if recovery is needed
1541 if (pnn != recmaster) {
/* ---- from here on we are the recmaster ---- */
1546 /* update the list of public ips that a node can handle for
1549 for (j=0; j<nodemap->num; j++) {
1550 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1553 /* release any existing data */
1554 if (ctdb->nodes[j]->public_ips) {
1555 talloc_free(ctdb->nodes[j]->public_ips);
1556 ctdb->nodes[j]->public_ips = NULL;
1558 /* grab a new shiny list of public ips from the node */
1559 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
1560 ctdb->nodes[j]->pnn,
1562 &ctdb->nodes[j]->public_ips)) {
1563 DEBUG(0,("Failed to read public ips from node : %u\n",
1564 ctdb->nodes[j]->pnn));
1570 /* verify that all active nodes agree that we are the recmaster */
1571 switch (verify_recmaster(ctdb, nodemap, pnn)) {
1572 case MONITOR_RECOVERY_NEEDED:
1573 /* can not happen */
1575 case MONITOR_ELECTION_NEEDED:
1576 force_election(rec, mem_ctx, pnn, nodemap);
1580 case MONITOR_FAILED:
/* a previously-started recovery that never completed is resumed first */
1585 if (rec->need_recovery) {
1586 /* a previous recovery didn't finish */
1587 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1591 /* verify that all active nodes are in normal mode
1592 and not in recovery mode
1594 switch (verify_recmode(ctdb, nodemap)) {
1595 case MONITOR_RECOVERY_NEEDED:
1596 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1598 case MONITOR_FAILED:
1600 case MONITOR_ELECTION_NEEDED:
1601 /* can not happen */
1608 /* get the nodemap for all active remote nodes and verify
1609 they are the same as for this node
1611 for (j=0; j<nodemap->num; j++) {
1612 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1615 if (nodemap->nodes[j].pnn == pnn) {
1619 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1620 mem_ctx, &remote_nodemap);
1622 DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n",
1623 nodemap->nodes[j].pnn));
1627 /* if the nodes disagree on how many nodes there are
1628 then this is a good reason to try recovery
1630 if (remote_nodemap->num != nodemap->num) {
1631 DEBUG(0, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
1632 nodemap->nodes[j].pnn, remote_nodemap->num, nodemap->num));
1633 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1637 /* if the nodes disagree on which nodes exist and are
1638 active, then that is also a good reason to do recovery
1640 for (i=0;i<nodemap->num;i++) {
1641 if (remote_nodemap->nodes[i].pnn != nodemap->nodes[i].pnn) {
1642 DEBUG(0, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
1643 nodemap->nodes[j].pnn, i,
1644 remote_nodemap->nodes[i].pnn, nodemap->nodes[i].pnn));
1645 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
1646 vnnmap, nodemap->nodes[j].pnn);
/* only the INACTIVE bit is compared here; other flag differences are
 * reconciled below by adopting the remote node's own flags */
1649 if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) !=
1650 (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
1651 DEBUG(0, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n",
1652 nodemap->nodes[j].pnn, i,
1653 remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags));
1654 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
1655 vnnmap, nodemap->nodes[j].pnn);
1660 /* update our nodemap flags according to the other
1661 server - this gets the NODE_FLAGS_DISABLED
1662 flag. Note that the remote node is authoritative
1663 for its flags (except CONNECTED, which we know
1664 matches in this code) */
1665 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1666 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1667 rec->need_takeover_run = true;
1672 /* there better be the same number of lmasters in the vnn map
1673 as there are active nodes or we will have to do a recovery
1675 if (vnnmap->size != num_active) {
1676 DEBUG(0, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
1677 vnnmap->size, num_active));
1678 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
1682 /* verify that all active nodes in the nodemap also exist in
1685 for (j=0; j<nodemap->num; j++) {
1686 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1689 if (nodemap->nodes[j].pnn == pnn) {
1693 for (i=0; i<vnnmap->size; i++) {
1694 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
/* i == size means the pnn was not found anywhere in the vnnmap */
1698 if (i == vnnmap->size) {
1699 DEBUG(0, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
1700 nodemap->nodes[j].pnn));
1701 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1707 /* verify that all other nodes have the same vnnmap
1708 and are from the same generation
1710 for (j=0; j<nodemap->num; j++) {
1711 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1714 if (nodemap->nodes[j].pnn == pnn) {
1718 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1719 mem_ctx, &remote_vnnmap);
1721 DEBUG(0, (__location__ " Unable to get vnnmap from remote node %u\n",
1722 nodemap->nodes[j].pnn));
1726 /* verify the vnnmap generation is the same */
1727 if (vnnmap->generation != remote_vnnmap->generation) {
1728 DEBUG(0, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
1729 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
1730 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1734 /* verify the vnnmap size is the same */
1735 if (vnnmap->size != remote_vnnmap->size) {
1736 DEBUG(0, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
1737 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
1738 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1742 /* verify the vnnmap is the same */
1743 for (i=0;i<vnnmap->size;i++) {
1744 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
1745 DEBUG(0, (__location__ " Remote node %u has different vnnmap.\n",
1746 nodemap->nodes[j].pnn));
1747 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
1748 vnnmap, nodemap->nodes[j].pnn);
1754 /* we might need to change who has what IP assigned */
/* flag is cleared before the attempt; a failed takeover run escalates
 * straight to a full recovery */
1755 if (rec->need_takeover_run) {
1756 rec->need_takeover_run = false;
1757 ret = ctdb_takeover_run(ctdb, nodemap);
1759 DEBUG(0, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
1760 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
1761 vnnmap, nodemap->nodes[j].pnn);
1770 event handler for when the main ctdbd dies
/* fd-event handler attached (in ctdb_start_recoverd) to the read end of the
 * pipe shared with the main ctdbd: when the parent dies the pipe closes,
 * this fires, and the recovery daemon logs and terminates so it cannot
 * outlive the main daemon.  NOTE(review): the exit call itself is on a line
 * elided from this listing. */
1772 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
1773 uint16_t flags, void *private_data)
1775 DEBUG(0,("recovery daemon parent died - exiting\n"));
1782 startup the recovery daemon as a child of the main ctdb daemon
1784 int ctdb_start_recoverd(struct ctdb_context *ctdb)
1790 if (pipe(fd) != 0) {
1806 /* shutdown the transport */
1807 ctdb->methods->shutdown(ctdb);
1809 /* get a new event context */
1810 talloc_free(ctdb->ev);
1811 ctdb->ev = event_context_init(ctdb);
1813 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
1814 ctdb_recoverd_parent, &fd[0]);
1816 close(ctdb->daemon.sd);
1817 ctdb->daemon.sd = -1;
1819 srandom(getpid() ^ time(NULL));
1821 /* initialise ctdb */
1822 ret = ctdb_socket_connect(ctdb);
1824 DEBUG(0, (__location__ " Failed to init ctdb\n"));
1828 monitor_cluster(ctdb);
1830 DEBUG(0,("ERROR: ctdb_recoverd finished!?\n"));