4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
27 #include "../include/ctdb.h"
28 #include "../include/ctdb_private.h"
32 struct ctdb_recoverd *rec;
/*
  private state of the recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	/* node that caused the most recent recovery, and how often/when */
	uint32_t last_culprit;
	uint32_t culprit_counter;
	struct timeval first_recover_time;
	/* per-pnn ban state; NULL entry means the node is not banned */
	struct ban_state **banned_nodes;
	/* when this daemon started - used as election priority (older wins) */
	struct timeval priority_time;
	/* an ip takeover run is pending */
	bool need_takeover_run;
	/* a recovery is pending (set while a recovery is in progress so a
	   failed recovery is retried) */
	bool need_recovery;
};
50 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
51 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
56 static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn)
58 struct ctdb_context *ctdb = rec->ctdb;
60 if (!ctdb_validate_pnn(ctdb, pnn)) {
61 DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", pnn));
65 if (rec->banned_nodes[pnn] == NULL) {
69 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, 0, NODE_FLAGS_BANNED);
71 talloc_free(rec->banned_nodes[pnn]);
72 rec->banned_nodes[pnn] = NULL;
77 called when a ban has timed out
79 static void ctdb_ban_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
81 struct ban_state *state = talloc_get_type(p, struct ban_state);
82 struct ctdb_recoverd *rec = state->rec;
83 uint32_t pnn = state->banned_node;
85 DEBUG(0,("Node %u is now unbanned\n", pnn));
86 ctdb_unban_node(rec, pnn);
90 ban a node for a period of time
92 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
94 struct ctdb_context *ctdb = rec->ctdb;
96 if (!ctdb_validate_pnn(ctdb, pnn)) {
97 DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", pnn));
101 if (pnn == ctdb->pnn) {
102 DEBUG(0,("self ban - lowering our election priority\n"));
103 /* banning ourselves - lower our election priority */
104 rec->priority_time = timeval_current();
107 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0);
109 rec->banned_nodes[pnn] = talloc(rec, struct ban_state);
110 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]);
112 rec->banned_nodes[pnn]->rec = rec;
113 rec->banned_nodes[pnn]->banned_node = pnn;
116 event_add_timed(ctdb->ev, rec->banned_nodes[pnn],
117 timeval_current_ofs(ban_time, 0),
118 ctdb_ban_timeout, rec->banned_nodes[pnn]);
122 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
125 struct freeze_node_data {
127 enum monitor_result status;
131 static void freeze_node_callback(struct ctdb_client_control_state *state)
133 struct freeze_node_data *fndata = talloc_get_type(state->async.private, struct freeze_node_data);
136 /* one more node has responded to our freeze node*/
139 /* if we failed to freeze the node, we must trigger another recovery */
140 if ( (state->state != CTDB_CONTROL_DONE) || (state->status != 0) ) {
141 DEBUG(0, (__location__ " Failed to freeze node:%u. recovery failed\n", state->c->hdr.destnode));
142 fndata->status = MONITOR_RECOVERY_NEEDED;
150 /* freeze all nodes */
151 static enum monitor_result freeze_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
153 struct freeze_node_data *fndata;
154 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
155 struct ctdb_client_control_state *state;
156 enum monitor_result status;
159 fndata = talloc(mem_ctx, struct freeze_node_data);
160 CTDB_NO_MEMORY_FATAL(ctdb, fndata);
162 fndata->status = MONITOR_OK;
164 /* loop over all active nodes and send an async freeze call to
166 for (j=0; j<nodemap->num; j++) {
167 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
170 state = ctdb_ctrl_freeze_send(ctdb, mem_ctx,
172 nodemap->nodes[j].pnn);
174 /* we failed to send the control, treat this as
175 an error and try again next iteration
177 DEBUG(0,("Failed to call ctdb_ctrl_freeze_send during recovery\n"));
178 talloc_free(mem_ctx);
179 return MONITOR_RECOVERY_NEEDED;
182 /* set up the callback functions */
183 state->async.fn = freeze_node_callback;
184 state->async.private = fndata;
186 /* one more control to wait for to complete */
191 /* now wait for up to the maximum number of seconds allowed
192 or until all nodes we expect a response from has replied
194 while (fndata->count > 0) {
195 event_loop_once(ctdb->ev);
198 status = fndata->status;
199 talloc_free(mem_ctx);
205 change recovery mode on all nodes
207 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
211 /* freeze all nodes */
212 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
213 ret = freeze_all_nodes(ctdb, nodemap);
214 if (ret != MONITOR_OK) {
215 DEBUG(0, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
221 /* set recovery mode to active on all nodes */
222 for (j=0; j<nodemap->num; j++) {
223 /* dont change it for nodes that are unavailable */
224 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
228 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, rec_mode);
230 DEBUG(0, (__location__ " Unable to set recmode on node %u\n", nodemap->nodes[j].pnn));
234 if (rec_mode == CTDB_RECOVERY_NORMAL) {
235 ret = ctdb_ctrl_thaw(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn);
237 DEBUG(0, (__location__ " Unable to thaw node %u\n", nodemap->nodes[j].pnn));
247 change recovery master on all node
249 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
253 /* set recovery master to pnn on all nodes */
254 for (j=0; j<nodemap->num; j++) {
255 /* dont change it for nodes that are unavailable */
256 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
260 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, pnn);
262 DEBUG(0, (__location__ " Unable to set recmaster on node %u\n", nodemap->nodes[j].pnn));
272 ensure all other nodes have attached to any databases that we have
274 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
275 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
278 struct ctdb_dbid_map *remote_dbmap;
280 /* verify that all other nodes have all our databases */
281 for (j=0; j<nodemap->num; j++) {
282 /* we dont need to ourself ourselves */
283 if (nodemap->nodes[j].pnn == pnn) {
286 /* dont check nodes that are unavailable */
287 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
291 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
292 mem_ctx, &remote_dbmap);
294 DEBUG(0, (__location__ " Unable to get dbids from node %u\n", pnn));
298 /* step through all local databases */
299 for (db=0; db<dbmap->num;db++) {
303 for (i=0;i<remote_dbmap->num;i++) {
304 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
308 /* the remote node already have this database */
309 if (i!=remote_dbmap->num) {
312 /* ok so we need to create this database */
313 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
316 DEBUG(0, (__location__ " Unable to get dbname from node %u\n", pnn));
319 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
320 mem_ctx, name, dbmap->dbs[db].persistent);
322 DEBUG(0, (__location__ " Unable to create remote db:%s\n", name));
333 ensure we are attached to any databases that anyone else is attached to
335 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
336 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
339 struct ctdb_dbid_map *remote_dbmap;
341 /* verify that we have all database any other node has */
342 for (j=0; j<nodemap->num; j++) {
343 /* we dont need to ourself ourselves */
344 if (nodemap->nodes[j].pnn == pnn) {
347 /* dont check nodes that are unavailable */
348 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
352 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
353 mem_ctx, &remote_dbmap);
355 DEBUG(0, (__location__ " Unable to get dbids from node %u\n", pnn));
359 /* step through all databases on the remote node */
360 for (db=0; db<remote_dbmap->num;db++) {
363 for (i=0;i<(*dbmap)->num;i++) {
364 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
368 /* we already have this db locally */
369 if (i!=(*dbmap)->num) {
372 /* ok so we need to create this database and
375 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
376 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
378 DEBUG(0, (__location__ " Unable to get dbname from node %u\n",
379 nodemap->nodes[j].pnn));
382 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
383 remote_dbmap->dbs[db].persistent);
385 DEBUG(0, (__location__ " Unable to create local db:%s\n", name));
388 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
390 DEBUG(0, (__location__ " Unable to reread dbmap on node %u\n", pnn));
401 pull all the remote database contents into ours
403 static int pull_all_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
404 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
408 /* pull all records from all other nodes across onto this node
409 (this merges based on rsn)
411 for (i=0;i<dbmap->num;i++) {
412 for (j=0; j<nodemap->num; j++) {
413 /* we dont need to merge with ourselves */
414 if (nodemap->nodes[j].pnn == pnn) {
417 /* dont merge from nodes that are unavailable */
418 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
421 ret = ctdb_ctrl_copydb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
422 pnn, dbmap->dbs[i].dbid, CTDB_LMASTER_ANY, mem_ctx);
424 DEBUG(0, (__location__ " Unable to copy db from node %u to node %u\n",
425 nodemap->nodes[j].pnn, pnn));
436 change the dmaster on all databases to point to us
438 static int update_dmaster_on_all_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
439 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
443 /* update dmaster to point to this node for all databases/nodes */
444 for (i=0;i<dbmap->num;i++) {
445 for (j=0; j<nodemap->num; j++) {
446 /* dont repoint nodes that are unavailable */
447 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
450 ret = ctdb_ctrl_setdmaster(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
451 ctdb, dbmap->dbs[i].dbid, pnn);
453 DEBUG(0, (__location__ " Unable to set dmaster for node %u db:0x%08x\n",
454 nodemap->nodes[j].pnn, dbmap->dbs[i].dbid));
465 update flags on all active nodes
467 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
470 for (i=0;i<nodemap->num;i++) {
471 struct ctdb_node_flag_change c;
474 c.pnn = nodemap->nodes[i].pnn;
475 c.old_flags = nodemap->nodes[i].flags;
476 c.new_flags = nodemap->nodes[i].flags;
478 data.dptr = (uint8_t *)&c;
479 data.dsize = sizeof(c);
481 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
482 CTDB_SRVID_NODE_FLAGS_CHANGED, data);
491 static int vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb_node_map *nodemap)
496 /* find max rsn on our local node for this db */
497 ret = ctdb_ctrl_get_max_rsn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, db_id, &max_rsn);
502 /* set rsn on non-empty records to max_rsn+1 */
503 for (i=0;i<nodemap->num;i++) {
504 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
507 ret = ctdb_ctrl_set_rsn_nonempty(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn,
510 DEBUG(0,(__location__ " Failed to set rsn on node %u to %llu\n",
511 nodemap->nodes[i].pnn, (unsigned long long)max_rsn+1));
516 /* delete records with rsn < max_rsn+1 on all nodes */
517 for (i=0;i<nodemap->num;i++) {
518 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
521 ret = ctdb_ctrl_delete_low_rsn(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn,
524 DEBUG(0,(__location__ " Failed to delete records on node %u with rsn below %llu\n",
525 nodemap->nodes[i].pnn, (unsigned long long)max_rsn+1));
536 vacuum all attached databases
538 static int vacuum_all_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
539 struct ctdb_dbid_map *dbmap)
543 /* update dmaster to point to this node for all databases/nodes */
544 for (i=0;i<dbmap->num;i++) {
545 if (vacuum_db(ctdb, dbmap->dbs[i].dbid, nodemap) != 0) {
554 push out all our database contents to all other nodes
556 static int push_all_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
557 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
561 /* push all records out to the nodes again */
562 for (i=0;i<dbmap->num;i++) {
563 for (j=0; j<nodemap->num; j++) {
564 /* we dont need to push to ourselves */
565 if (nodemap->nodes[j].pnn == pnn) {
568 /* dont push to nodes that are unavailable */
569 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
572 ret = ctdb_ctrl_copydb(ctdb, CONTROL_TIMEOUT(), pnn, nodemap->nodes[j].pnn,
573 dbmap->dbs[i].dbid, CTDB_LMASTER_ANY, mem_ctx);
575 DEBUG(0, (__location__ " Unable to copy db from node %u to node %u\n",
576 pnn, nodemap->nodes[j].pnn));
587 ensure all nodes have the same vnnmap we do
589 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
590 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
594 /* push the new vnn map out to all the nodes */
595 for (j=0; j<nodemap->num; j++) {
596 /* dont push to nodes that are unavailable */
597 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
601 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
603 DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", pnn));
613 handler for when the admin bans a node
615 static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
616 TDB_DATA data, void *private_data)
618 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
619 struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
620 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
624 if (data.dsize != sizeof(*b)) {
625 DEBUG(0,("Bad data in ban_handler\n"));
626 talloc_free(mem_ctx);
630 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
632 DEBUG(0,(__location__ " Failed to find the recmaster\n"));
633 talloc_free(mem_ctx);
637 if (recmaster != ctdb->pnn) {
638 DEBUG(0,("We are not the recmaster - ignoring ban request\n"));
639 talloc_free(mem_ctx);
643 DEBUG(0,("Node %u has been banned for %u seconds by the administrator\n",
644 b->pnn, b->ban_time));
645 ctdb_ban_node(rec, b->pnn, b->ban_time);
646 talloc_free(mem_ctx);
650 handler for when the admin unbans a node
652 static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
653 TDB_DATA data, void *private_data)
655 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
656 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
661 if (data.dsize != sizeof(uint32_t)) {
662 DEBUG(0,("Bad data in unban_handler\n"));
663 talloc_free(mem_ctx);
666 pnn = *(uint32_t *)data.dptr;
668 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
670 DEBUG(0,(__location__ " Failed to find the recmaster\n"));
671 talloc_free(mem_ctx);
675 if (recmaster != ctdb->pnn) {
676 DEBUG(0,("We are not the recmaster - ignoring unban request\n"));
677 talloc_free(mem_ctx);
681 DEBUG(0,("Node %u has been unbanned by the administrator\n", pnn));
682 ctdb_unban_node(rec, pnn);
683 talloc_free(mem_ctx);
/*
  timed-event callback: called when ctdb_wait_timeout should finish -
  sets the caller's flag so its event loop exits
 */
static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
			      struct timeval yt, void *p)
{
	uint32_t *timed_out = (uint32_t *)p;
	(*timed_out) = 1;
}
699 wait for a given number of seconds
701 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
703 uint32_t timed_out = 0;
704 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
706 event_loop_once(ctdb->ev);
710 /* Create a new random generation ip.
711 The generation id can not be the INVALID_GENERATION id
713 static uint32_t new_generation(void)
718 generation = random();
720 if (generation != INVALID_GENERATION) {
729 remember the trouble maker
731 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
733 struct ctdb_context *ctdb = rec->ctdb;
735 if (rec->last_culprit != culprit ||
736 timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
737 /* either a new node is the culprit, or we've decide to forgive them */
738 rec->last_culprit = culprit;
739 rec->first_recover_time = timeval_current();
740 rec->culprit_counter = 0;
742 rec->culprit_counter++;
746 we are the recmaster, and recovery is needed - start a recovery run
748 static int do_recovery(struct ctdb_recoverd *rec,
749 TALLOC_CTX *mem_ctx, uint32_t pnn, uint32_t num_active,
750 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
753 struct ctdb_context *ctdb = rec->ctdb;
756 struct ctdb_dbid_map *dbmap;
758 /* if recovery fails, force it again */
759 rec->need_recovery = true;
761 ctdb_set_culprit(rec, culprit);
763 if (rec->culprit_counter > 2*nodemap->num) {
764 DEBUG(0,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
765 culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
766 ctdb->tunable.recovery_ban_period));
767 ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);
770 if (!ctdb_recovery_lock(ctdb, true)) {
771 ctdb_set_culprit(rec, pnn);
772 DEBUG(0,("Unable to get recovery lock - aborting recovery\n"));
776 /* set recovery mode to active on all nodes */
777 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
779 DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
783 DEBUG(0, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
785 /* pick a new generation number */
786 generation = new_generation();
788 /* change the vnnmap on this node to use the new generation
789 number but not on any other nodes.
790 this guarantees that if we abort the recovery prematurely
791 for some reason (a node stops responding?)
792 that we can just return immediately and we will reenter
793 recovery shortly again.
794 I.e. we deliberately leave the cluster with an inconsistent
795 generation id to allow us to abort recovery at any stage and
796 just restart it from scratch.
798 vnnmap->generation = generation;
799 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
801 DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", pnn));
805 /* get a list of all databases */
806 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
808 DEBUG(0, (__location__ " Unable to get dbids from node :%u\n", pnn));
814 /* verify that all other nodes have all our databases */
815 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
817 DEBUG(0, (__location__ " Unable to create missing remote databases\n"));
821 /* verify that we have all the databases any other node has */
822 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
824 DEBUG(0, (__location__ " Unable to create missing local databases\n"));
830 /* verify that all other nodes have all our databases */
831 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
833 DEBUG(0, (__location__ " Unable to create missing remote databases\n"));
838 DEBUG(1, (__location__ " Recovery - created remote databases\n"));
840 /* pull all remote databases onto the local node */
841 ret = pull_all_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
843 DEBUG(0, (__location__ " Unable to pull remote databases\n"));
847 DEBUG(1, (__location__ " Recovery - pulled remote databases\n"));
849 /* push all local databases to the remote nodes */
850 ret = push_all_local_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
852 DEBUG(0, (__location__ " Unable to push local databases\n"));
856 DEBUG(1, (__location__ " Recovery - pushed remote databases\n"));
858 /* build a new vnn map with all the currently active and
860 generation = new_generation();
861 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
862 CTDB_NO_MEMORY(ctdb, vnnmap);
863 vnnmap->generation = generation;
864 vnnmap->size = num_active;
865 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
866 for (i=j=0;i<nodemap->num;i++) {
867 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
868 vnnmap->map[j++] = nodemap->nodes[i].pnn;
874 /* update to the new vnnmap on all nodes */
875 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
877 DEBUG(0, (__location__ " Unable to update vnnmap on all nodes\n"));
881 DEBUG(1, (__location__ " Recovery - updated vnnmap\n"));
883 /* update recmaster to point to us for all nodes */
884 ret = set_recovery_master(ctdb, nodemap, pnn);
886 DEBUG(0, (__location__ " Unable to set recovery master\n"));
890 DEBUG(1, (__location__ " Recovery - updated recmaster\n"));
892 /* repoint all local and remote database records to the local
893 node as being dmaster
895 ret = update_dmaster_on_all_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
897 DEBUG(0, (__location__ " Unable to update dmaster on all databases\n"));
901 DEBUG(1, (__location__ " Recovery - updated dmaster on all databases\n"));
904 update all nodes to have the same flags that we have
906 ret = update_flags_on_all_nodes(ctdb, nodemap);
908 DEBUG(0, (__location__ " Unable to update flags on all nodes\n"));
912 DEBUG(1, (__location__ " Recovery - updated flags\n"));
915 run a vacuum operation on empty records
917 ret = vacuum_all_databases(ctdb, nodemap, dbmap);
919 DEBUG(0, (__location__ " Unable to vacuum all databases\n"));
923 DEBUG(1, (__location__ " Recovery - vacuumed all databases\n"));
926 if enabled, tell nodes to takeover their public IPs
929 rec->need_takeover_run = false;
930 ret = ctdb_takeover_run(ctdb, nodemap);
932 DEBUG(0, (__location__ " Unable to setup public takeover addresses\n"));
935 DEBUG(1, (__location__ " Recovery - done takeover\n"));
938 for (i=0;i<dbmap->num;i++) {
939 DEBUG(0,("Recovered database with db_id 0x%08x\n", dbmap->dbs[i].dbid));
942 /* disable recovery mode */
943 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
945 DEBUG(0, (__location__ " Unable to set recovery mode to normal on cluster\n"));
949 /* send a message to all clients telling them that the cluster
950 has been reconfigured */
951 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
953 DEBUG(0, (__location__ " Recovery complete\n"));
955 rec->need_recovery = false;
957 /* We just finished a recovery successfully.
958 We now wait for rerecovery_timeout before we allow
959 another recovery to take place.
961 DEBUG(0, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
962 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
963 DEBUG(0, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;		/* nodes this candidate can see */
	struct timeval priority_time;	/* daemon start time - older wins */
	uint32_t pnn;			/* candidate node number - tie breaker */
};
980 form this nodes election data
982 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
985 struct ctdb_node_map *nodemap;
986 struct ctdb_context *ctdb = rec->ctdb;
990 em->pnn = rec->ctdb->pnn;
991 em->priority_time = rec->priority_time;
993 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
998 for (i=0;i<nodemap->num;i++) {
999 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1000 em->num_connected++;
1003 talloc_free(nodemap);
1007 see if the given election data wins
1009 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1011 struct election_message myem;
1014 ctdb_election_data(rec, &myem);
1016 /* try to use the most connected node */
1017 cmp = (int)myem.num_connected - (int)em->num_connected;
1019 /* then the longest running node */
1021 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1025 cmp = (int)myem.pnn - (int)em->pnn;
1032 send out an election request
1034 static int send_election_request(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t pnn)
1037 TDB_DATA election_data;
1038 struct election_message emsg;
1040 struct ctdb_context *ctdb = rec->ctdb;
1042 srvid = CTDB_SRVID_RECOVERY;
1044 ctdb_election_data(rec, &emsg);
1046 election_data.dsize = sizeof(struct election_message);
1047 election_data.dptr = (unsigned char *)&emsg;
1050 /* first we assume we will win the election and set
1051 recoverymaster to be ourself on the current node
1053 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1055 DEBUG(0, (__location__ " failed to send recmaster election request\n"));
1060 /* send an election message to all active nodes */
1061 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1067 this function will unban all nodes in the cluster
1069 static void unban_all_nodes(struct ctdb_context *ctdb)
1072 struct ctdb_node_map *nodemap;
1073 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1075 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1077 DEBUG(0,(__location__ " failed to get nodemap to unban all nodes\n"));
1081 for (i=0;i<nodemap->num;i++) {
1082 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1083 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1084 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1088 talloc_free(tmp_ctx);
1092 handler for recovery master elections
1094 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1095 TDB_DATA data, void *private_data)
1097 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1099 struct election_message *em = (struct election_message *)data.dptr;
1100 TALLOC_CTX *mem_ctx;
1102 mem_ctx = talloc_new(ctdb);
1104 /* someone called an election. check their election data
1105 and if we disagree and we would rather be the elected node,
1106 send a new election message to all other nodes
1108 if (ctdb_election_win(rec, em)) {
1109 ret = send_election_request(rec, mem_ctx, ctdb_get_pnn(ctdb));
1111 DEBUG(0, (__location__ " failed to initiate recmaster election"));
1113 talloc_free(mem_ctx);
1114 /*unban_all_nodes(ctdb);*/
1118 /* release the recmaster lock */
1119 if (em->pnn != ctdb->pnn &&
1120 ctdb->recovery_lock_fd != -1) {
1121 close(ctdb->recovery_lock_fd);
1122 ctdb->recovery_lock_fd = -1;
1123 unban_all_nodes(ctdb);
1126 /* ok, let that guy become recmaster then */
1127 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1129 DEBUG(0, (__location__ " failed to send recmaster election request"));
1130 talloc_free(mem_ctx);
1134 /* release any bans */
1135 rec->last_culprit = (uint32_t)-1;
1136 talloc_free(rec->banned_nodes);
1137 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1138 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1140 talloc_free(mem_ctx);
1146 force the start of the election process
1148 static void force_election(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t pnn,
1149 struct ctdb_node_map *nodemap)
1152 struct ctdb_context *ctdb = rec->ctdb;
1154 /* set all nodes to recovery mode to stop all internode traffic */
1155 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1157 DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
1161 ret = send_election_request(rec, mem_ctx, pnn);
1163 DEBUG(0, (__location__ " failed to initiate recmaster election"));
1167 /* wait for a few seconds to collect all responses */
1168 ctdb_wait_timeout(ctdb, ctdb->tunable.election_timeout);
1174 handler for when a node changes its flags
1176 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1177 TDB_DATA data, void *private_data)
1180 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1181 struct ctdb_node_map *nodemap=NULL;
1182 TALLOC_CTX *tmp_ctx;
1183 uint32_t changed_flags;
1185 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1187 if (data.dsize != sizeof(*c)) {
1188 DEBUG(0,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1192 tmp_ctx = talloc_new(ctdb);
1193 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1195 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1197 for (i=0;i<nodemap->num;i++) {
1198 if (nodemap->nodes[i].pnn == c->pnn) break;
1201 if (i == nodemap->num) {
1202 DEBUG(0,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1203 talloc_free(tmp_ctx);
1207 changed_flags = c->old_flags ^ c->new_flags;
1209 /* Dont let messages from remote nodes change the DISCONNECTED flag.
1210 This flag is handled locally based on whether the local node
1211 can communicate with the node or not.
1213 c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
1214 if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) {
1215 c->new_flags |= NODE_FLAGS_DISCONNECTED;
1218 if (nodemap->nodes[i].flags != c->new_flags) {
1219 DEBUG(0,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1222 nodemap->nodes[i].flags = c->new_flags;
1224 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1225 CTDB_CURRENT_NODE, &ctdb->recovery_master);
1228 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1229 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
1233 ctdb->recovery_master == ctdb->pnn &&
1234 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL &&
1236 /* Only do the takeover run if the perm disabled or unhealthy
1237 flags changed since these will cause an ip failover but not
1239 If the node became disconnected or banned this will also
1240 lead to an ip address failover but that is handled
1243 if (changed_flags & NODE_FLAGS_DISABLED) {
1244 rec->need_takeover_run = true;
1248 talloc_free(tmp_ctx);
1253 struct verify_recmode_normal_data {
1255 enum monitor_result status;
1258 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1260 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private, struct verify_recmode_normal_data);
1263 /* one more node has responded with recmode data*/
1266 /* if we failed to get the recmode, then return an error and let
1267 the main loop try again.
1269 if (state->state != CTDB_CONTROL_DONE) {
1270 if (rmdata->status == MONITOR_OK) {
1271 rmdata->status = MONITOR_FAILED;
1276 /* if we got a response, then the recmode will be stored in the
1279 if (state->status != CTDB_RECOVERY_NORMAL) {
1280 DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
1281 rmdata->status = MONITOR_RECOVERY_NEEDED;
/* verify that all nodes are in normal recovery mode */
/*
  Sends an async GETRECMODE control to every active node, then pumps
  the event loop until all replies have arrived (the callback above
  decrements the outstanding-reply counter).  Returns the aggregate
  monitor_result: MONITOR_OK, MONITOR_FAILED (a control could not be
  sent or completed), or MONITOR_RECOVERY_NEEDED (some node was not in
  CTDB_RECOVERY_NORMAL).  All allocations hang off a temporary talloc
  context that is freed on every exit path.
 */
static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
struct verify_recmode_normal_data *rmdata;
TALLOC_CTX *mem_ctx = talloc_new(ctdb);
struct ctdb_client_control_state *state;
enum monitor_result status;
rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
/* optimistic default; callbacks downgrade it */
rmdata->status = MONITOR_OK;
/* loop over all active nodes and send an async getrecmode call to
for (j=0; j<nodemap->num; j++) {
/* skip banned/disconnected nodes - they cannot answer */
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
nodemap->nodes[j].pnn);
if (state == NULL) {
/* we failed to send the control, treat this as
an error and try again next iteration
DEBUG(0,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
talloc_free(mem_ctx);
return MONITOR_FAILED;
/* set up the callback functions */
state->async.fn = verify_recmode_normal_callback;
state->async.private = rmdata;
/* one more control to wait for to complete */
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
while (rmdata->count > 0) {
event_loop_once(ctdb->ev);
/* snapshot the verdict before freeing the context that owns rmdata */
status = rmdata->status;
talloc_free(mem_ctx);
/* Shared state for the batch of async GETRECMASTER controls sent by
   verify_recmaster().  NOTE(review): the outstanding-reply counter and
   the "pnn" field (our own node number, compared against each reply in
   the callback) are not visible in this chunk — confirm against the
   full file. */
struct verify_recmaster_data {
	/* aggregate verdict: MONITOR_OK unless a node disagrees that we
	   are recmaster (MONITOR_ELECTION_NEEDED) or a control fails */
	enum monitor_result status;
/*
  Async completion callback for one GETRECMASTER control sent by
  verify_recmaster().  Downgrades the shared status:
    - control failed                         -> MONITOR_FAILED
    - node reports a different recmaster pnn -> MONITOR_ELECTION_NEEDED
 */
static void verify_recmaster_callback(struct ctdb_client_control_state *state)
struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private, struct verify_recmaster_data);
/* one more node has responded with recmaster data*/
/* if we failed to get the recmaster, then return an error and let
the main loop try again.
if (state->state != CTDB_CONTROL_DONE) {
/* keep any stronger verdict already recorded by another reply */
if (rmdata->status == MONITOR_OK) {
rmdata->status = MONITOR_FAILED;
/* if we got a response, then the recmaster will be stored in the
if (state->status != rmdata->pnn) {
DEBUG(0,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
rmdata->status = MONITOR_ELECTION_NEEDED;
/* verify that all nodes agree that we are the recmaster */
/*
  Sends an async GETRECMASTER control to every active node and pumps
  the event loop until all replies arrive.  "pnn" is this node's own
  number; any node reporting a different recmaster triggers
  MONITOR_ELECTION_NEEDED.  Returns the aggregate monitor_result;
  the temporary talloc context is freed on every exit path.
  Structure mirrors verify_recmode() above.
 */
static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
struct verify_recmaster_data *rmdata;
TALLOC_CTX *mem_ctx = talloc_new(ctdb);
struct ctdb_client_control_state *state;
enum monitor_result status;
rmdata = talloc(mem_ctx, struct verify_recmaster_data);
CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
/* optimistic default; callbacks downgrade it */
rmdata->status = MONITOR_OK;
/* loop over all active nodes and send an async getrecmaster call to
for (j=0; j<nodemap->num; j++) {
/* skip banned/disconnected nodes */
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
nodemap->nodes[j].pnn);
if (state == NULL) {
/* we failed to send the control, treat this as
an error and try again next iteration
DEBUG(0,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
talloc_free(mem_ctx);
return MONITOR_FAILED;
/* set up the callback functions */
state->async.fn = verify_recmaster_callback;
state->async.private = rmdata;
/* one more control to wait for to complete */
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
while (rmdata->count > 0) {
event_loop_once(ctdb->ev);
/* snapshot the verdict before freeing the context that owns rmdata */
status = rmdata->status;
talloc_free(mem_ctx);
the main monitoring loop
/*
  The recovery daemon's main loop.  Allocates the recoverd state,
  registers message handlers (elections, flag changes, ban/unban), then
  repeatedly — once per recover_interval — pulls the local tunables,
  pnn, vnnmap and nodemap, checks recmaster liveness and consensus,
  verifies public-IP assignment and cross-node nodemap/vnnmap
  consistency, and triggers force_election()/do_recovery()/
  ctdb_takeover_run() when anything disagrees.  Never returns in normal
  operation.
  NOTE(review): the allocated "rec" shadows the file-scope "rec"
  declared near the top of the file — presumably intentional; confirm.
 */
static void monitor_cluster(struct ctdb_context *ctdb)
uint32_t pnn, num_active, recmaster;
TALLOC_CTX *mem_ctx=NULL;
struct ctdb_node_map *nodemap=NULL;
struct ctdb_node_map *remote_nodemap=NULL;
struct ctdb_vnn_map *vnnmap=NULL;
struct ctdb_vnn_map *remote_vnnmap=NULL;
struct ctdb_recoverd *rec;
struct ctdb_all_public_ips *ips;
rec = talloc_zero(ctdb, struct ctdb_recoverd);
CTDB_NO_MEMORY_FATAL(ctdb, rec);
/* one ban_state slot per configured node; NULL == not banned */
rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
rec->priority_time = timeval_current();
/* register a message port for recovery elections */
ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
/* and one for when nodes are disabled/enabled */
ctdb_set_message_handler(ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, monitor_handler, rec);
/* and one for when nodes are banned */
ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
/* and one for when nodes are unbanned */
ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);
/* top of the per-iteration loop: drop last iteration's temporary
   allocations and start a fresh talloc context */
talloc_free(mem_ctx);
mem_ctx = talloc_new(ctdb);
DEBUG(0,("Failed to create temporary context\n"));
/* we only check for recovery once every second */
ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
/* get relevant tunables */
ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
DEBUG(0,("Failed to get tunables - retrying\n"));
pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
if (pnn == (uint32_t)-1) {
DEBUG(0,("Failed to get local pnn - retrying\n"));
/* get the vnnmap */
ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
DEBUG(0, (__location__ " Unable to get vnnmap from node %u\n", pnn));
/* get number of nodes */
ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &nodemap);
DEBUG(0, (__location__ " Unable to get nodemap from node %u\n", pnn));
/* count how many active nodes there are */
/* also overlay our local ban knowledge onto the freshly fetched
   nodemap flags before counting */
for (i=0; i<nodemap->num; i++) {
if (rec->banned_nodes[nodemap->nodes[i].pnn] != NULL) {
nodemap->nodes[i].flags |= NODE_FLAGS_BANNED;
nodemap->nodes[i].flags &= ~NODE_FLAGS_BANNED;
if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
/* check which node is the recovery master */
ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &recmaster);
DEBUG(0, (__location__ " Unable to get recmaster from node %u\n", pnn));
/* (uint32_t)-1 means no recmaster has ever been elected */
if (recmaster == (uint32_t)-1) {
DEBUG(0,(__location__ " Initial recovery master set - forcing election\n"));
force_election(rec, mem_ctx, pnn, nodemap);
/* verify that the recmaster node is still active */
for (j=0; j<nodemap->num; j++) {
if (nodemap->nodes[j].pnn==recmaster) {
/* loop fell through: the recmaster is not in our nodemap at all */
if (j == nodemap->num) {
DEBUG(0, ("Recmaster node %u not in list. Force reelection\n", recmaster));
force_election(rec, mem_ctx, pnn, nodemap);
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
DEBUG(0, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
force_election(rec, mem_ctx, pnn, nodemap);
/* verify that the public ip address allocation is consistent */
if (ctdb->vnn != NULL) {
ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
/* NOTE(review): message uses loop index i, not a node pnn —
   looks like the wrong variable; verify against full file */
DEBUG(0, ("Unable to get public ips from node %u\n", i));
for (j=0; j<ips->num; j++) {
/* verify that we have the ip addresses we should have
and we dont have ones we shouldnt have.
if we find an inconsistency we set recmode to
active on the local node and wait for the recmaster
to do a full blown recovery
if (ips->ips[j].pnn == pnn) {
if (!ctdb_sys_have_ip(ips->ips[j].sin)) {
DEBUG(0,("Public address '%s' is missing and we should serve this ip\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
/* freeze first so no database writes happen while we
   wait for the recmaster to run a full recovery */
ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
DEBUG(0,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
DEBUG(0,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
/* converse check: an ip assigned to someone else that we still hold */
if (ctdb_sys_have_ip(ips->ips[j].sin)) {
DEBUG(0,("We are still serving a public address '%s' that we should not be serving.\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
DEBUG(0,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
DEBUG(0,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
/* if we are not the recmaster then we do not need to check
if recovery is needed
if (pnn != recmaster) {
/* ---- everything below runs only on the recmaster ---- */
/* update the list of public ips that a node can handle for
for (j=0; j<nodemap->num; j++) {
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* release any existing data */
if (ctdb->nodes[j]->public_ips) {
talloc_free(ctdb->nodes[j]->public_ips);
ctdb->nodes[j]->public_ips = NULL;
/* grab a new shiny list of public ips from the node */
if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
ctdb->nodes[j]->pnn,
&ctdb->nodes[j]->public_ips)) {
DEBUG(0,("Failed to read public ips from node : %u\n",
ctdb->nodes[j]->pnn));
/* verify that all active nodes agree that we are the recmaster */
switch (verify_recmaster(ctdb, nodemap, pnn)) {
case MONITOR_RECOVERY_NEEDED:
/* can not happen */
case MONITOR_ELECTION_NEEDED:
force_election(rec, mem_ctx, pnn, nodemap);
case MONITOR_FAILED:
if (rec->need_recovery) {
/* a previous recovery didn't finish */
/* NOTE(review): culprit argument reuses loop index j left over
   from the loop above — presumably "blame the last node we
   polled"; confirm this is intentional */
do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
/* verify that all active nodes are in normal mode
and not in recovery mode
switch (verify_recmode(ctdb, nodemap)) {
case MONITOR_RECOVERY_NEEDED:
do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
case MONITOR_FAILED:
case MONITOR_ELECTION_NEEDED:
/* can not happen */
/* get the nodemap for all active remote nodes and verify
they are the same as for this node
for (j=0; j<nodemap->num; j++) {
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* no need to ask ourselves */
if (nodemap->nodes[j].pnn == pnn) {
ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
mem_ctx, &remote_nodemap);
DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n",
nodemap->nodes[j].pnn));
/* if the nodes disagree on how many nodes there are
then this is a good reason to try recovery
if (remote_nodemap->num != nodemap->num) {
DEBUG(0, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
nodemap->nodes[j].pnn, remote_nodemap->num, nodemap->num));
do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
/* if the nodes disagree on which nodes exist and are
active, then that is also a good reason to do recovery
for (i=0;i<nodemap->num;i++) {
if (remote_nodemap->nodes[i].pnn != nodemap->nodes[i].pnn) {
DEBUG(0, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
nodemap->nodes[j].pnn, i,
remote_nodemap->nodes[i].pnn, nodemap->nodes[i].pnn));
do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
vnnmap, nodemap->nodes[j].pnn);
/* only the INACTIVE portion of the flags must match here;
   other flags are reconciled below */
if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) !=
(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
DEBUG(0, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n",
nodemap->nodes[j].pnn, i,
remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags));
do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
vnnmap, nodemap->nodes[j].pnn);
/* update our nodemap flags according to the other
server - this gets the NODE_FLAGS_DISABLED
flag. Note that the remote node is authoritative
for its flags (except CONNECTED, which we know
matches in this code) */
if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
rec->need_takeover_run = true;
/* there better be the same number of lmasters in the vnn map
as there are active nodes or we will have to do a recovery
if (vnnmap->size != num_active) {
DEBUG(0, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
vnnmap->size, num_active));
do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
/* verify that all active nodes in the nodemap also exist in
for (j=0; j<nodemap->num; j++) {
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
if (nodemap->nodes[j].pnn == pnn) {
/* linear scan of the vnnmap for this node's pnn */
for (i=0; i<vnnmap->size; i++) {
if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
if (i == vnnmap->size) {
DEBUG(0, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
nodemap->nodes[j].pnn));
do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
/* verify that all other nodes have the same vnnmap
and are from the same generation
for (j=0; j<nodemap->num; j++) {
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
if (nodemap->nodes[j].pnn == pnn) {
ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
mem_ctx, &remote_vnnmap);
DEBUG(0, (__location__ " Unable to get vnnmap from remote node %u\n",
nodemap->nodes[j].pnn));
/* verify the vnnmap generation is the same */
if (vnnmap->generation != remote_vnnmap->generation) {
DEBUG(0, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
/* verify the vnnmap size is the same */
if (vnnmap->size != remote_vnnmap->size) {
DEBUG(0, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
/* verify the vnnmap is the same */
for (i=0;i<vnnmap->size;i++) {
if (remote_vnnmap->map[i] != vnnmap->map[i]) {
DEBUG(0, (__location__ " Remote node %u has different vnnmap.\n",
nodemap->nodes[j].pnn));
do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
vnnmap, nodemap->nodes[j].pnn);
/* we might need to change who has what IP assigned */
if (rec->need_takeover_run) {
/* clear the flag first so a failure below re-enters via recovery,
   not via another takeover run */
rec->need_takeover_run = false;
ret = ctdb_takeover_run(ctdb, nodemap);
DEBUG(0, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
/* NOTE(review): j here is the terminal value of the previous
   loop — see the earlier note on culprit selection; confirm */
do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
vnnmap, nodemap->nodes[j].pnn);
event handler for when the main ctdbd dies
/*
  fd event handler on the pipe shared with the parent ctdbd: the pipe
  becomes readable (EOF) when the parent exits, at which point the
  recovery daemon logs and terminates.  (The exit call itself falls in
  a gap of this chunk — presumably _exit(); confirm in full file.)
 */
static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
uint16_t flags, void *private_data)
DEBUG(0,("recovery daemon parent died - exiting\n"));
1846 startup the recovery daemon as a child of the main ctdb daemon
1848 int ctdb_start_recoverd(struct ctdb_context *ctdb)
1854 if (pipe(fd) != 0) {
1870 /* shutdown the transport */
1871 ctdb->methods->shutdown(ctdb);
1873 /* get a new event context */
1874 talloc_free(ctdb->ev);
1875 ctdb->ev = event_context_init(ctdb);
1877 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
1878 ctdb_recoverd_parent, &fd[0]);
1880 close(ctdb->daemon.sd);
1881 ctdb->daemon.sd = -1;
1883 srandom(getpid() ^ time(NULL));
1885 /* initialise ctdb */
1886 ret = ctdb_socket_connect(ctdb);
1888 DEBUG(0, (__location__ " Failed to init ctdb\n"));
1892 monitor_cluster(ctdb);
1894 DEBUG(0,("ERROR: ctdb_recoverd finished!?\n"));