4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
26 #include "../include/ctdb.h"
27 #include "../include/ctdb_private.h"
31 struct ctdb_recoverd *rec;
36 private state of recovery daemon
/*
  private state of the recovery daemon
  NOTE(review): this view of the struct is truncated - the closing
  brace (and possibly further members) is not visible here.
 */
38 struct ctdb_recoverd {
39 struct ctdb_context *ctdb; /* owning ctdb context */
40 uint32_t last_culprit; /* pnn of the node that caused the previous recovery */
41 uint32_t culprit_counter; /* how many recoveries this culprit has caused in a row */
42 struct timeval first_recover_time; /* when this culprit first triggered a recovery */
43 struct ban_state **banned_nodes; /* per-pnn ban state; NULL entry == node not banned */
44 struct timeval priority_time; /* election priority; an earlier time wins elections */
/* timeouts for client controls and for the monitoring loop, derived from
   tunables. NOTE: both macros expand to an expression referencing a local
   variable named `ctdb`, which must be in scope at every use site. */
47 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
48 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/*
  unban a node: clear NODE_FLAGS_BANNED on the node and drop our local
  ban-state record for that pnn.
 */
53 static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn)
55 struct ctdb_context *ctdb = rec->ctdb;
/* refuse to act on an out-of-range node number */
57 if (!ctdb_validate_pnn(ctdb, pnn)) {
/* NOTE(review): the message says "ctdb_ban_node" but this is the unban
   path - looks like a copy/paste slip in the string */
58 DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* nothing to do if the node is not currently banned */
62 if (rec->banned_nodes[pnn] == NULL) {
/* clear the BANNED flag on the node itself */
66 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, 0, NODE_FLAGS_BANNED);
/* freeing the ban state also releases anything talloc-parented to it
   (e.g. the ban-timeout event set up in ctdb_ban_node) */
68 talloc_free(rec->banned_nodes[pnn]);
69 rec->banned_nodes[pnn] = NULL;
74 called when a ban has timed out
/*
  timed-event callback fired when a node's ban period has expired;
  recovers the ban_state from the opaque pointer and lifts the ban.
 */
76 static void ctdb_ban_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
78 struct ban_state *state = talloc_get_type(p, struct ban_state);
79 struct ctdb_recoverd *rec = state->rec;
80 uint32_t pnn = state->banned_node;
82 DEBUG(0,("Node %u is now unbanned\n", pnn));
/* ctdb_unban_node frees state (it is rec->banned_nodes[pnn]), so state
   must not be touched after this call */
83 ctdb_unban_node(rec, pnn);
87 ban a node for a period of time
/*
  ban a node for ban_time seconds: set NODE_FLAGS_BANNED on it, record
  ban state locally and schedule a timed event to lift the ban.
 */
89 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
91 struct ctdb_context *ctdb = rec->ctdb;
/* refuse to act on an out-of-range node number */
93 if (!ctdb_validate_pnn(ctdb, pnn)) {
94 DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", pnn));
98 if (pnn == ctdb->pnn) {
99 DEBUG(0,("self ban - lowering our election priority\n"));
100 /* banning ourselves - lower our election priority */
/* a later priority_time loses elections (see ctdb_election_win) */
101 rec->priority_time = timeval_current();
104 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0);
/* NOTE(review): if the node is already banned this overwrites the old
   ban_state pointer; the old state (and its pending timeout event) stays
   allocated under rec until rec is freed - confirm intended */
106 rec->banned_nodes[pnn] = talloc(rec, struct ban_state);
107 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]);
109 rec->banned_nodes[pnn]->rec = rec;
110 rec->banned_nodes[pnn]->banned_node = pnn;
/* the event is parented to the ban state, so freeing the state in
   ctdb_unban_node cancels the timeout automatically */
113 event_add_timed(ctdb->ev, rec->banned_nodes[pnn],
114 timeval_current_ofs(ban_time, 0),
115 ctdb_ban_timeout, rec->banned_nodes[pnn]);
119 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/* shared state for the async freeze fan-out in freeze_all_nodes.
   NOTE(review): truncated in this view - a pending-reply counter member
   (used as fndata->count elsewhere) is not visible here. */
122 struct freeze_node_data {
124 enum monitor_result status; /* aggregate outcome across all replies */
/*
  completion callback for one async freeze control; records a failure in
  the shared freeze_node_data so the caller can trigger another recovery.
 */
128 static void freeze_node_callback(struct ctdb_client_control_state *state)
130 struct freeze_node_data *fndata = talloc_get_type(state->async.private, struct freeze_node_data);
133 /* one more node has responded to our freeze node*/
136 /* if we failed to freeze the node, we must trigger another recovery */
137 if ( (state->state != CTDB_CONTROL_DONE) || (state->status != 0) ) {
138 DEBUG(0, (__location__ " Failed to freeze node:%u. recovery failed\n", state->c->hdr.destnode));
139 fndata->status = MONITOR_RECOVERY_NEEDED;
147 /* freeze all nodes */
/*
  freeze all active nodes by fanning out async freeze controls and
  pumping the event loop until every outstanding reply has arrived.
  Returns MONITOR_OK on success, MONITOR_RECOVERY_NEEDED otherwise.
 */
148 static enum monitor_result freeze_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
150 struct freeze_node_data *fndata;
151 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
152 struct ctdb_client_control_state *state;
153 enum monitor_result status;
156 fndata = talloc(mem_ctx, struct freeze_node_data);
157 CTDB_NO_MEMORY_FATAL(ctdb, fndata);
159 fndata->status = MONITOR_OK;
161 /* loop over all active nodes and send an async freeze call to
/* inactive (banned/disconnected) nodes are skipped */
163 for (j=0; j<nodemap->num; j++) {
164 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
167 state = ctdb_ctrl_freeze_send(ctdb, mem_ctx,
169 nodemap->nodes[j].pnn);
171 /* we failed to send the control, treat this as
172 an error and try again next iteration
174 DEBUG(0,("Failed to call ctdb_ctrl_freeze_send during recovery\n"));
175 talloc_free(mem_ctx);
176 return MONITOR_RECOVERY_NEEDED;
179 /* set up the callback functions */
180 state->async.fn = freeze_node_callback;
181 state->async.private = fndata;
183 /* one more control to wait for to complete */
188 /* now wait for up to the maximum number of seconds allowed
189 or until all nodes we expect a response from has replied
/* each freeze_node_callback decrements fndata->count */
191 while (fndata->count > 0) {
192 event_loop_once(ctdb->ev);
195 status = fndata->status;
196 talloc_free(mem_ctx);
202 change recovery mode on all nodes
/*
  set the recovery mode on all active nodes. Entering ACTIVE mode first
  freezes every node; leaving it (NORMAL) thaws each node after setting
  the mode. Returns 0 on success, -1 on any failure.
 */
204 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
208 /* freeze all nodes */
209 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
210 ret = freeze_all_nodes(ctdb, nodemap);
211 if (ret != MONITOR_OK) {
212 DEBUG(0, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
218 /* set recovery mode to active on all nodes */
219 for (j=0; j<nodemap->num; j++) {
220 /* dont change it for nodes that are unavailable */
221 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
225 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, rec_mode);
227 DEBUG(0, (__location__ " Unable to set recmode on node %u\n", nodemap->nodes[j].pnn));
/* when returning to normal mode, also thaw the node so frozen
   databases become usable again */
231 if (rec_mode == CTDB_RECOVERY_NORMAL) {
232 ret = ctdb_ctrl_thaw(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn);
234 DEBUG(0, (__location__ " Unable to thaw node %u\n", nodemap->nodes[j].pnn));
244 change recovery master on all node
/*
  tell every active node that `pnn` is the recovery master.
  Returns 0 on success, -1 on any failure.
 */
246 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
250 /* set recovery master to pnn on all nodes */
251 for (j=0; j<nodemap->num; j++) {
252 /* dont change it for nodes that are unavailable */
253 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
257 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, pnn);
259 DEBUG(0, (__location__ " Unable to set recmaster on node %u\n", nodemap->nodes[j].pnn));
269 ensure all other nodes have attached to any databases that we have
/*
  make sure every other active node is attached to every database that
  the local node (pnn) has; creates any missing remote database by name.
  Returns 0 on success, -1 on failure.
 */
271 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
272 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
275 struct ctdb_dbid_map *remote_dbmap;
277 /* verify that all other nodes have all our databases */
278 for (j=0; j<nodemap->num; j++) {
279 /* we dont need to ourself ourselves */
280 if (nodemap->nodes[j].pnn == pnn) {
283 /* dont check nodes that are unavailable */
284 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
288 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
289 mem_ctx, &remote_dbmap);
291 DEBUG(0, (__location__ " Unable to get dbids from node %u\n", pnn));
295 /* step through all local databases */
296 for (db=0; db<dbmap->num;db++) {
/* linear search for this dbid in the remote node's map */
300 for (i=0;i<remote_dbmap->num;i++) {
301 if (dbmap->dbids[db] == remote_dbmap->dbids[i]) {
305 /* the remote node already have this database */
306 if (i!=remote_dbmap->num) {
309 /* ok so we need to create this database */
/* look up the db name locally, then create it remotely by name */
310 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbids[db], mem_ctx, &name);
312 DEBUG(0, (__location__ " Unable to get dbname from node %u\n", pnn));
315 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, name);
317 DEBUG(0, (__location__ " Unable to create remote db:%s\n", name));
328 ensure we are attached to any databases that anyone else is attached to
/*
  make sure the local node (pnn) is attached to every database that any
  other active node has. dbmap is passed by reference because it is
  re-read at the end to pick up any databases created here.
  Returns 0 on success, -1 on failure.
 */
330 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
331 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
334 struct ctdb_dbid_map *remote_dbmap;
336 /* verify that we have all database any other node has */
337 for (j=0; j<nodemap->num; j++) {
338 /* we dont need to ourself ourselves */
339 if (nodemap->nodes[j].pnn == pnn) {
342 /* dont check nodes that are unavailable */
343 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
347 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
348 mem_ctx, &remote_dbmap);
350 DEBUG(0, (__location__ " Unable to get dbids from node %u\n", pnn));
354 /* step through all databases on the remote node */
355 for (db=0; db<remote_dbmap->num;db++) {
/* linear search for the remote dbid in our local map */
358 for (i=0;i<(*dbmap)->num;i++) {
359 if (remote_dbmap->dbids[db] == (*dbmap)->dbids[i]) {
363 /* we already have this db locally */
364 if (i!=(*dbmap)->num) {
367 /* ok so we need to create this database and
/* look up the name on the remote node, then attach locally */
370 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
371 remote_dbmap->dbids[db], mem_ctx, &name);
373 DEBUG(0, (__location__ " Unable to get dbname from node %u\n",
374 nodemap->nodes[j].pnn));
377 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name);
379 DEBUG(0, (__location__ " Unable to create local db:%s\n", name));
/* refresh the caller's dbmap so it includes the new database */
382 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
384 DEBUG(0, (__location__ " Unable to reread dbmap on node %u\n", pnn));
395 pull all the remote database contents into ours
/*
  pull the contents of every database from every other active node onto
  the local node (pnn); records are merged by rsn (highest wins).
  Returns 0 on success, -1 on failure.
 */
397 static int pull_all_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
398 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
402 /* pull all records from all other nodes across onto this node
403 (this merges based on rsn)
405 for (i=0;i<dbmap->num;i++) {
406 for (j=0; j<nodemap->num; j++) {
407 /* we dont need to merge with ourselves */
408 if (nodemap->nodes[j].pnn == pnn) {
411 /* dont merge from nodes that are unavailable */
412 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* copy db i from remote node j to the local node */
415 ret = ctdb_ctrl_copydb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
416 pnn, dbmap->dbids[i], CTDB_LMASTER_ANY, mem_ctx);
418 DEBUG(0, (__location__ " Unable to copy db from node %u to node %u\n",
419 nodemap->nodes[j].pnn, pnn));
430 change the dmaster on all databases to point to us
/*
  repoint the dmaster of every record in every database on every active
  node to the local node (pnn). Returns 0 on success, -1 on failure.
 */
432 static int update_dmaster_on_all_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
433 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
437 /* update dmaster to point to this node for all databases/nodes */
438 for (i=0;i<dbmap->num;i++) {
439 for (j=0; j<nodemap->num; j++) {
440 /* dont repoint nodes that are unavailable */
441 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
444 ret = ctdb_ctrl_setdmaster(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, ctdb, dbmap->dbids[i], pnn);
446 DEBUG(0, (__location__ " Unable to set dmaster for node %u db:0x%08x\n", nodemap->nodes[j].pnn, dbmap->dbids[i]));
457 update flags on all active nodes
/*
  broadcast the flags of every node (as we see them) to all connected
  nodes via NODE_FLAGS_CHANGED messages, so the cluster converges on a
  single view of node flags.
 */
459 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
462 for (i=0;i<nodemap->num;i++) {
463 struct ctdb_node_flag_change c;
/* old_flags == new_flags: this is a resync, not a real change */
466 c.pnn = nodemap->nodes[i].pnn;
467 c.old_flags = nodemap->nodes[i].flags;
468 c.new_flags = nodemap->nodes[i].flags;
470 data.dptr = (uint8_t *)&c;
471 data.dsize = sizeof(c);
473 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
474 CTDB_SRVID_NODE_FLAGS_CHANGED, data);
/*
  vacuum one database across the cluster: bump the rsn of all non-empty
  records above the current local max, then delete every record whose
  rsn stayed below that threshold (i.e. the empty ones).
  Returns 0 on success, -1 on failure.
 */
483 static int vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb_node_map *nodemap)
488 /* find max rsn on our local node for this db */
489 ret = ctdb_ctrl_get_max_rsn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, db_id, &max_rsn);
494 /* set rsn on non-empty records to max_rsn+1 */
495 for (i=0;i<nodemap->num;i++) {
496 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
499 ret = ctdb_ctrl_set_rsn_nonempty(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn,
502 DEBUG(0,(__location__ " Failed to set rsn on node %u to %llu\n",
503 nodemap->nodes[i].pnn, (unsigned long long)max_rsn+1));
508 /* delete records with rsn < max_rsn+1 on all nodes */
509 for (i=0;i<nodemap->num;i++) {
510 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
513 ret = ctdb_ctrl_delete_low_rsn(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn,
516 DEBUG(0,(__location__ " Failed to delete records on node %u with rsn below %llu\n",
517 nodemap->nodes[i].pnn, (unsigned long long)max_rsn+1));
528 vacuum all attached databases
/*
  run vacuum_db over every attached database.
  Returns 0 on success, -1 if any database fails to vacuum.
 */
530 static int vacuum_all_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
531 struct ctdb_dbid_map *dbmap)
/* NOTE(review): the comment below is copy/pasted from the dmaster
   update loop - this loop vacuums each database, it does not touch
   dmaster */
535 /* update dmaster to point to this node for all databases/nodes */
536 for (i=0;i<dbmap->num;i++) {
537 if (vacuum_db(ctdb, dbmap->dbids[i], nodemap) != 0) {
546 push out all our database contents to all other nodes
/*
  push the contents of every local database out to every other active
  node (the inverse of pull_all_remote_databases).
  Returns 0 on success, -1 on failure.
 */
548 static int push_all_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
549 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
553 /* push all records out to the nodes again */
554 for (i=0;i<dbmap->num;i++) {
555 for (j=0; j<nodemap->num; j++) {
556 /* we dont need to push to ourselves */
557 if (nodemap->nodes[j].pnn == pnn) {
560 /* dont push to nodes that are unavailable */
561 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* copy db i from the local node to remote node j */
564 ret = ctdb_ctrl_copydb(ctdb, CONTROL_TIMEOUT(), pnn, nodemap->nodes[j].pnn,
565 dbmap->dbids[i], CTDB_LMASTER_ANY, mem_ctx);
567 DEBUG(0, (__location__ " Unable to copy db from node %u to node %u\n",
568 pnn, nodemap->nodes[j].pnn));
579 ensure all nodes have the same vnnmap we do
/*
  push the given vnnmap to every active node so the whole cluster agrees
  on the new generation and vnn layout. Returns 0 on success, -1 on failure.
 */
581 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
582 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
586 /* push the new vnn map out to all the nodes */
587 for (j=0; j<nodemap->num; j++) {
588 /* dont push to nodes that are unavailable */
589 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
593 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
/* NOTE(review): this prints the local pnn, not the node that actually
   failed (nodemap->nodes[j].pnn) - misleading diagnostic */
595 DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", pnn));
605 handler for when the admin bans a node
/*
  message handler for an administrator-initiated ban request. Only the
  current recmaster acts on it; everyone else ignores the message.
  Payload is a struct ctdb_ban_info (pnn + ban_time).
 */
607 static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
608 TDB_DATA data, void *private_data)
610 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
611 struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
612 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
/* validate payload size before touching the fields */
616 if (data.dsize != sizeof(*b)) {
617 DEBUG(0,("Bad data in ban_handler\n"));
618 talloc_free(mem_ctx);
622 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
624 DEBUG(0,(__location__ " Failed to find the recmaster\n"));
625 talloc_free(mem_ctx);
/* only the recmaster performs bans; other nodes drop the request */
629 if (recmaster != ctdb->pnn) {
630 DEBUG(0,("We are not the recmaster - ignoring ban request\n"));
631 talloc_free(mem_ctx);
635 DEBUG(0,("Node %u has been banned for %u seconds by the administrator\n",
636 b->pnn, b->ban_time));
637 ctdb_ban_node(rec, b->pnn, b->ban_time);
638 talloc_free(mem_ctx);
642 handler for when the admin unbans a node
/*
  message handler for an administrator-initiated unban request. Only the
  current recmaster acts on it. Payload is a single uint32_t pnn.
 */
644 static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
645 TDB_DATA data, void *private_data)
647 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
648 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
/* validate payload size before dereferencing */
653 if (data.dsize != sizeof(uint32_t)) {
654 DEBUG(0,("Bad data in unban_handler\n"));
655 talloc_free(mem_ctx);
658 pnn = *(uint32_t *)data.dptr;
660 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
662 DEBUG(0,(__location__ " Failed to find the recmaster\n"));
663 talloc_free(mem_ctx);
/* only the recmaster performs unbans; other nodes drop the request */
667 if (recmaster != ctdb->pnn) {
668 DEBUG(0,("We are not the recmaster - ignoring unban request\n"));
669 talloc_free(mem_ctx);
673 DEBUG(0,("Node %u has been unbanned by the administrator\n", pnn));
674 ctdb_unban_node(rec, pnn);
675 talloc_free(mem_ctx);
681 called when ctdb_wait_timeout should finish
/*
  timed-event callback for ctdb_wait_timeout; presumably sets the
  timed_out flag it is handed (the assignment line is not visible in
  this view - TODO confirm against the full source).
 */
683 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
684 struct timeval yt, void *p)
686 uint32_t *timed_out = (uint32_t *)p;
691 wait for a given number of seconds
/*
  block for `secs` seconds while still servicing events: schedules a
  timed event that flips timed_out and pumps the event loop meanwhile
  (the surrounding loop condition is not visible in this view).
 */
693 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
695 uint32_t timed_out = 0;
696 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
698 event_loop_once(ctdb->ev);
702 /* Create a new random generation ip.
703 The generation id can not be the INVALID_GENERATION id
702 /* Create a new random generation ip.
703 The generation id can not be the INVALID_GENERATION id
/* loops drawing random() values until one differs from INVALID_GENERATION */
705 static uint32_t new_generation(void)
710 generation = random();
712 if (generation != INVALID_GENERATION) {
721 we are the recmaster, and recovery is needed - start a recovery run
/*
  main recovery driver, run by the recmaster when a recovery is needed.
  Sequence: culprit accounting/banning -> take the recovery lock ->
  freeze cluster (recmode ACTIVE) -> bump generation locally -> ensure
  all nodes share all databases -> pull then push database contents ->
  build and distribute a new vnnmap -> reassert recmaster, dmaster and
  node flags -> vacuum -> IP takeover -> thaw (recmode NORMAL) ->
  notify clients and suppress re-recovery for a grace period.
  Returns 0 on success, -1 on failure.
 */
723 static int do_recovery(struct ctdb_recoverd *rec,
724 TALLOC_CTX *mem_ctx, uint32_t pnn, uint32_t num_active,
725 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
728 struct ctdb_context *ctdb = rec->ctdb;
731 struct ctdb_dbid_map *dbmap;
/* culprit accounting: a node repeatedly causing recoveries gets banned */
733 if (rec->last_culprit != culprit ||
734 timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
735 /* either a new node is the culprit, or we've decide to forgive them */
736 rec->last_culprit = culprit;
737 rec->first_recover_time = timeval_current();
738 rec->culprit_counter = 0;
740 rec->culprit_counter++;
742 if (rec->culprit_counter > 2*nodemap->num) {
743 DEBUG(0,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
744 culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
745 ctdb->tunable.recovery_ban_period));
746 ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);
/* the recovery lock serialises recoveries across the cluster */
749 if (!ctdb_recovery_lock(ctdb, true)) {
750 DEBUG(0,("Unable to get recovery lock - aborting recovery\n"));
754 /* set recovery mode to active on all nodes */
755 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
757 DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
761 DEBUG(0, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
763 /* pick a new generation number */
764 generation = new_generation();
766 /* change the vnnmap on this node to use the new generation
767 number but not on any other nodes.
768 this guarantees that if we abort the recovery prematurely
769 for some reason (a node stops responding?)
770 that we can just return immediately and we will reenter
771 recovery shortly again.
772 I.e. we deliberately leave the cluster with an inconsistent
773 generation id to allow us to abort recovery at any stage and
774 just restart it from scratch.
776 vnnmap->generation = generation;
777 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
779 DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", pnn));
783 /* get a list of all databases */
784 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
786 DEBUG(0, (__location__ " Unable to get dbids from node :%u\n", pnn));
792 /* verify that all other nodes have all our databases */
793 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
795 DEBUG(0, (__location__ " Unable to create missing remote databases\n"));
799 /* verify that we have all the databases any other node has */
800 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
802 DEBUG(0, (__location__ " Unable to create missing local databases\n"));
/* second remote pass: propagate any databases we just attached locally */
808 /* verify that all other nodes have all our databases */
809 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
811 DEBUG(0, (__location__ " Unable to create missing remote databases\n"));
816 DEBUG(1, (__location__ " Recovery - created remote databases\n"));
818 /* pull all remote databases onto the local node */
819 ret = pull_all_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
821 DEBUG(0, (__location__ " Unable to pull remote databases\n"));
825 DEBUG(1, (__location__ " Recovery - pulled remote databases\n"));
827 /* push all local databases to the remote nodes */
828 ret = push_all_local_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
830 DEBUG(0, (__location__ " Unable to push local databases\n"));
834 DEBUG(1, (__location__ " Recovery - pushed remote databases\n"));
836 /* build a new vnn map with all the currently active and
/* fresh generation again so the distributed map is distinct from the
   interim local-only one set above */
838 generation = new_generation();
839 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
840 CTDB_NO_MEMORY(ctdb, vnnmap);
841 vnnmap->generation = generation;
842 vnnmap->size = num_active;
843 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
844 for (i=j=0;i<nodemap->num;i++) {
845 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
846 vnnmap->map[j++] = nodemap->nodes[i].pnn;
852 /* update to the new vnnmap on all nodes */
853 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
855 DEBUG(0, (__location__ " Unable to update vnnmap on all nodes\n"));
859 DEBUG(1, (__location__ " Recovery - updated vnnmap\n"));
861 /* update recmaster to point to us for all nodes */
862 ret = set_recovery_master(ctdb, nodemap, pnn);
864 DEBUG(0, (__location__ " Unable to set recovery master\n"));
868 DEBUG(1, (__location__ " Recovery - updated recmaster\n"));
870 /* repoint all local and remote database records to the local
871 node as being dmaster
873 ret = update_dmaster_on_all_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
875 DEBUG(0, (__location__ " Unable to update dmaster on all databases\n"));
879 DEBUG(1, (__location__ " Recovery - updated dmaster on all databases\n"));
882 update all nodes to have the same flags that we have
884 ret = update_flags_on_all_nodes(ctdb, nodemap);
886 DEBUG(0, (__location__ " Unable to update flags on all nodes\n"));
890 DEBUG(1, (__location__ " Recovery - updated flags\n"));
893 run a vacuum operation on empty records
895 ret = vacuum_all_databases(ctdb, nodemap, dbmap);
897 DEBUG(0, (__location__ " Unable to vacuum all databases\n"));
901 DEBUG(1, (__location__ " Recovery - vacuumed all databases\n"));
904 if enabled, tell nodes to takeover their public IPs
907 ret = ctdb_takeover_run(ctdb, nodemap);
909 DEBUG(0, (__location__ " Unable to setup public takeover addresses\n"));
912 DEBUG(1, (__location__ " Recovery - done takeover\n"));
916 /* disable recovery mode */
917 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
919 DEBUG(0, (__location__ " Unable to set recovery mode to normal on cluster\n"));
923 /* send a message to all clients telling them that the cluster
924 has been reconfigured */
925 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
927 DEBUG(0, (__location__ " Recovery complete\n"));
929 /* We just finished a recovery successfully.
930 We now wait for rerecovery_timeout before we allow
931 another recovery to take place.
933 DEBUG(0, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
934 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
935 DEBUG(0, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
942 elections are won by first checking the number of connected nodes, then
943 the priority time, then the pnn
/* payload broadcast during recmaster elections. Comparison order (see
   ctdb_election_win): num_connected desc, priority_time asc, then pnn.
   NOTE(review): truncated in this view - the pnn member (used as
   em->pnn elsewhere) and closing brace are not visible here. */
945 struct election_message {
946 uint32_t num_connected; /* how many nodes this sender can reach */
947 struct timeval priority_time; /* sender's start/priority time; earlier wins */
952 form this nodes election data
/*
  fill in this node's election data: our pnn, our priority time, and a
  count of nodes we are currently connected to.
 */
954 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
957 struct ctdb_node_map *nodemap;
958 struct ctdb_context *ctdb = rec->ctdb;
962 em->pnn = rec->ctdb->pnn;
963 em->priority_time = rec->priority_time;
/* nodemap is talloc'd under rec and explicitly freed below */
965 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
/* count every node that is not flagged DISCONNECTED */
970 for (i=0;i<nodemap->num;i++) {
971 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
975 talloc_free(nodemap);
979 see if the given election data wins
942 elections are won by first checking the number of connected nodes, then
943 the priority time, then the pnn
/* returns true if our own election data beats the data in *em */
981 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
983 struct election_message myem;
986 ctdb_election_data(rec, &myem);
988 /* try to use the most connected node */
989 cmp = (int)myem.num_connected - (int)em->num_connected;
991 /* then the longest running node */
/* earlier priority_time wins, so compare theirs against ours */
993 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break: prefer the higher pnn */
997 cmp = (int)myem.pnn - (int)em->pnn;
1004 send out an election request
/*
  start (or respond to) a recmaster election: optimistically set
  ourselves as recmaster on the local node, then broadcast our election
  data to all nodes. Returns 0 on success, -1 on failure.
 */
1006 static int send_election_request(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t pnn)
1009 TDB_DATA election_data;
1010 struct election_message emsg;
1012 struct ctdb_context *ctdb = rec->ctdb;
1014 srvid = CTDB_SRVID_RECOVERY;
1016 ctdb_election_data(rec, &emsg);
1018 election_data.dsize = sizeof(struct election_message);
1019 election_data.dptr = (unsigned char *)&emsg;
1022 /* first we assume we will win the election and set
1023 recoverymaster to be ourself on the current node
1025 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1027 DEBUG(0, (__location__ " failed to send recmaster election request\n"));
1032 /* send an election message to all active nodes */
1033 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1039 this function will unban all nodes in the cluster
/*
  clear the BANNED flag on every connected node in the cluster.
  Best-effort: failures to fetch the nodemap are logged and abandoned.
 */
1041 static void unban_all_nodes(struct ctdb_context *ctdb)
1044 struct ctdb_node_map *nodemap;
1045 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1047 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1049 DEBUG(0,(__location__ " failed to get nodemap to unban all nodes\n"));
/* only reachable (connected) nodes that carry the BANNED flag */
1053 for (i=0;i<nodemap->num;i++) {
1054 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1055 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1056 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1060 talloc_free(tmp_ctx);
1064 handler for recovery master elections
/*
  message handler for incoming recmaster election messages. If our own
  election data beats the sender's we counter with our own election
  request; otherwise we concede: release the recovery lock if we held
  it, acknowledge the sender as recmaster, and reset our ban state.
 */
1066 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1067 TDB_DATA data, void *private_data)
1069 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1071 struct election_message *em = (struct election_message *)data.dptr;
1072 TALLOC_CTX *mem_ctx;
1074 mem_ctx = talloc_new(ctdb);
1076 /* someone called an election. check their election data
1077 and if we disagree and we would rather be the elected node,
1078 send a new election message to all other nodes
1080 if (ctdb_election_win(rec, em)) {
1081 ret = send_election_request(rec, mem_ctx, ctdb_get_pnn(ctdb));
1083 DEBUG(0, (__location__ " failed to initiate recmaster election"));
1085 talloc_free(mem_ctx);
1086 /*unban_all_nodes(ctdb);*/
/* we lost: if another node won and we hold the recovery lock file,
   release it and lift all bans we imposed as the old recmaster */
1090 /* release the recmaster lock */
1091 if (em->pnn != ctdb->pnn &&
1092 ctdb->recovery_lock_fd != -1) {
1093 close(ctdb->recovery_lock_fd);
1094 ctdb->recovery_lock_fd = -1;
1095 unban_all_nodes(ctdb);
1098 /* ok, let that guy become recmaster then */
1099 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1101 DEBUG(0, (__location__ " failed to send recmaster election request"));
1102 talloc_free(mem_ctx);
1106 /* release any bans */
/* reset culprit tracking and reallocate an all-NULL ban table */
1107 rec->last_culprit = (uint32_t)-1;
1108 talloc_free(rec->banned_nodes);
1109 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1110 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1112 talloc_free(mem_ctx);
1118 force the start of the election process
/*
  force a recmaster election: freeze the cluster (recmode ACTIVE) to
  stop internode traffic, broadcast our election request, then wait
  election_timeout seconds for other nodes' responses to arrive.
 */
1120 static void force_election(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t pnn,
1121 struct ctdb_node_map *nodemap)
1124 struct ctdb_context *ctdb = rec->ctdb;
1126 /* set all nodes to recovery mode to stop all internode traffic */
1127 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1129 DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
1133 ret = send_election_request(rec, mem_ctx, pnn);
1135 DEBUG(0, (__location__ " failed to initiate recmaster election"));
1139 /* wait for a few seconds to collect all responses */
1140 ctdb_wait_timeout(ctdb, ctdb->tunable.election_timeout);
1146 handler for when a node changes its flags
/*
  message handler for NODE_FLAGS_CHANGED broadcasts: update our local
  view of a node's flags, and - if we are the recmaster, the cluster is
  in normal mode, and a DISABLED-type flag changed - run an IP takeover
  so public addresses follow the new flags. Falls back to forcing a
  recovery if the takeover run fails.
 */
1148 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1149 TDB_DATA data, void *private_data)
1152 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1153 struct ctdb_node_map *nodemap=NULL;
1154 TALLOC_CTX *tmp_ctx;
1155 uint32_t changed_flags;
/* validate payload size before using the struct */
1158 if (data.dsize != sizeof(*c)) {
/* NOTE(review): missing space after __location__ - message runs into
   the location string (same on the non-existant-node message below) */
1159 DEBUG(0,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1163 tmp_ctx = talloc_new(ctdb);
1164 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1166 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
/* locate the node this change refers to */
1168 for (i=0;i<nodemap->num;i++) {
1169 if (nodemap->nodes[i].pnn == c->pnn) break;
1172 if (i == nodemap->num) {
1173 DEBUG(0,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1174 talloc_free(tmp_ctx);
1178 changed_flags = c->old_flags ^ c->new_flags;
1180 /* Dont let messages from remote nodes change the DISCONNECTED flag.
1181 This flag is handled locally based on whether the local node
1182 can communicate with the node or not.
1184 c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
1185 if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) {
1186 c->new_flags |= NODE_FLAGS_DISCONNECTED;
1189 if (nodemap->nodes[i].flags != c->new_flags) {
1190 DEBUG(0,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1193 nodemap->nodes[i].flags = c->new_flags;
/* refresh our cached view of recmaster and recmode before deciding
   whether to run a takeover */
1195 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1196 CTDB_CURRENT_NODE, &ctdb->recovery_master);
1199 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1200 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
1204 ctdb->recovery_master == ctdb->pnn &&
1205 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL &&
1207 /* Only do the takeover run if the perm disabled or unhealthy
1208 flags changed since these will cause an ip failover but not
1210 If the node became disconnected or banned this will also
1211 lead to an ip address failover but that is handled
1214 if (changed_flags & NODE_FLAGS_DISABLED) {
1215 ret = ctdb_takeover_run(ctdb, nodemap);
/* takeover failed: escalate by putting ourselves into recovery */
1217 DEBUG(0, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
1218 ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(),
1219 ctdb->pnn, CTDB_RECOVERY_ACTIVE);
1221 /* send a message to all clients telling them that the
1222 cluster has been reconfigured */
1223 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1227 talloc_free(tmp_ctx);
/* shared state for the async recmode verification fan-out.
   NOTE(review): truncated in this view - a pending-reply counter member
   (used as rmdata->count elsewhere) is not visible here. */
1232 struct verify_recmode_normal_data {
1234 enum monitor_result status; /* aggregate outcome across all replies */
/*
  completion callback for one async getrecmode control: flags a failed
  control as MONITOR_FAILED (unless a stronger verdict is already set),
  and flags any node not in NORMAL mode as needing recovery.
 */
1237 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1239 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private, struct verify_recmode_normal_data);
1242 /* one more node has responded with recmode data*/
1245 /* if we failed to get the recmode, then return an error and let
1246 the main loop try again.
1248 if (state->state != CTDB_CONTROL_DONE) {
1249 if (rmdata->status == MONITOR_OK) {
1250 rmdata->status = MONITOR_FAILED;
1255 /* if we got a response, then the recmode will be stored in the
/* the control's status field carries the node's recovery mode */
1258 if (state->status != CTDB_RECOVERY_NORMAL) {
1259 DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
1260 rmdata->status = MONITOR_RECOVERY_NEEDED;
1267 /* verify that all nodes are in normal recovery mode */
/*
  verify that every active node is in NORMAL recovery mode, using async
  getrecmode controls and pumping the event loop until all replies are
  in. Returns MONITOR_OK, MONITOR_RECOVERY_NEEDED or MONITOR_FAILED.
 */
1268 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1270 struct verify_recmode_normal_data *rmdata;
1271 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1272 struct ctdb_client_control_state *state;
1273 enum monitor_result status;
1276 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1277 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1279 rmdata->status = MONITOR_OK;
1281 /* loop over all active nodes and send an async getrecmode call to
1283 for (j=0; j<nodemap->num; j++) {
1284 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1287 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1289 nodemap->nodes[j].pnn);
1290 if (state == NULL) {
1291 /* we failed to send the control, treat this as
1292 an error and try again next iteration
1294 DEBUG(0,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1295 talloc_free(mem_ctx);
1296 return MONITOR_FAILED;
1299 /* set up the callback functions */
1300 state->async.fn = verify_recmode_normal_callback;
1301 state->async.private = rmdata;
1303 /* one more control to wait for to complete */
1308 /* now wait for up to the maximum number of seconds allowed
1309 or until all nodes we expect a response from has replied
/* each callback decrements rmdata->count */
1311 while (rmdata->count > 0) {
1312 event_loop_once(ctdb->ev);
1315 status = rmdata->status;
1316 talloc_free(mem_ctx);
1321 struct verify_recmaster_data {
1324 enum monitor_result status;
1327 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1329 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private, struct verify_recmaster_data);
1332 /* one more node has responded with recmaster data*/
1335 /* if we failed to get the recmaster, then return an error and let
1336 the main loop try again.
1338 if (state->state != CTDB_CONTROL_DONE) {
1339 if (rmdata->status == MONITOR_OK) {
1340 rmdata->status = MONITOR_FAILED;
1345 /* if we got a response, then the recmaster will be stored in the
1348 if (state->status != rmdata->pnn) {
1349 DEBUG(0,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
1350 rmdata->status = MONITOR_ELECTION_NEEDED;
1357 /* verify that all nodes agree that we are the recmaster */
1358 static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
1360 struct verify_recmaster_data *rmdata;
1361 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1362 struct ctdb_client_control_state *state;
1363 enum monitor_result status;
1366 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1367 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1370 rmdata->status = MONITOR_OK;
1372 /* loop over all active nodes and send an async getrecmaster call to
1374 for (j=0; j<nodemap->num; j++) {
1375 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1378 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
1380 nodemap->nodes[j].pnn);
1381 if (state == NULL) {
1382 /* we failed to send the control, treat this as
1383 an error and try again next iteration
1385 DEBUG(0,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
1386 talloc_free(mem_ctx);
1387 return MONITOR_FAILED;
1390 /* set up the callback functions */
1391 state->async.fn = verify_recmaster_callback;
1392 state->async.private = rmdata;
1394 /* one more control to wait for to complete */
1399 /* now wait for up to the maximum number of seconds allowed
1400 or until all nodes we expect a response from has replied
1402 while (rmdata->count > 0) {
1403 event_loop_once(ctdb->ev);
1406 status = rmdata->status;
1407 talloc_free(mem_ctx);
1413 the main monitoring loop
1415 static void monitor_cluster(struct ctdb_context *ctdb)
1417 uint32_t pnn, num_active, recmaster;
1418 TALLOC_CTX *mem_ctx=NULL;
1419 struct ctdb_node_map *nodemap=NULL;
1420 struct ctdb_node_map *remote_nodemap=NULL;
1421 struct ctdb_vnn_map *vnnmap=NULL;
1422 struct ctdb_vnn_map *remote_vnnmap=NULL;
1424 bool need_takeover_run;
1425 struct ctdb_recoverd *rec;
1427 rec = talloc_zero(ctdb, struct ctdb_recoverd);
1428 CTDB_NO_MEMORY_FATAL(ctdb, rec);
1431 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1432 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1434 rec->priority_time = timeval_current();
1436 /* register a message port for recovery elections */
1437 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
1439 /* and one for when nodes are disabled/enabled */
1440 ctdb_set_message_handler(ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, monitor_handler, rec);
1442 /* and one for when nodes are banned */
1443 ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
1445 /* and one for when nodes are unbanned */
1446 ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);
1449 need_takeover_run = false;
1452 talloc_free(mem_ctx);
1455 mem_ctx = talloc_new(ctdb);
1457 DEBUG(0,("Failed to create temporary context\n"));
1461 /* we only check for recovery once every second */
1462 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
1464 /* get relevant tunables */
1465 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
1467 DEBUG(0,("Failed to get tunables - retrying\n"));
1471 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
1472 if (pnn == (uint32_t)-1) {
1473 DEBUG(0,("Failed to get local pnn - retrying\n"));
1477 /* get the vnnmap */
1478 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
1480 DEBUG(0, (__location__ " Unable to get vnnmap from node %u\n", pnn));
1485 /* get number of nodes */
1486 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &nodemap);
1488 DEBUG(0, (__location__ " Unable to get nodemap from node %u\n", pnn));
1493 /* count how many active nodes there are */
1495 for (i=0; i<nodemap->num; i++) {
1496 if (rec->banned_nodes[nodemap->nodes[i].pnn] != NULL) {
1497 nodemap->nodes[i].flags |= NODE_FLAGS_BANNED;
1499 nodemap->nodes[i].flags &= ~NODE_FLAGS_BANNED;
1501 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
1507 /* check which node is the recovery master */
1508 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &recmaster);
1510 DEBUG(0, (__location__ " Unable to get recmaster from node %u\n", pnn));
1514 if (recmaster == (uint32_t)-1) {
1515 DEBUG(0,(__location__ " Initial recovery master set - forcing election\n"));
1516 force_election(rec, mem_ctx, pnn, nodemap);
1520 /* verify that the recmaster node is still active */
1521 for (j=0; j<nodemap->num; j++) {
1522 if (nodemap->nodes[j].pnn==recmaster) {
1527 if (j == nodemap->num) {
1528 DEBUG(0, ("Recmaster node %u not in list. Force reelection\n", recmaster));
1529 force_election(rec, mem_ctx, pnn, nodemap);
1533 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1534 DEBUG(0, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
1535 force_election(rec, mem_ctx, pnn, nodemap);
1540 /* if we are not the recmaster then we do not need to check
1541 if recovery is needed
1543 if (pnn!=recmaster) {
1548 /* update the list of public ips that a node can handle for
1551 for (j=0; j<nodemap->num; j++) {
1552 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1555 /* release any existing data */
1556 if (ctdb->nodes[j]->public_ips) {
1557 talloc_free(ctdb->nodes[j]->public_ips);
1558 ctdb->nodes[j]->public_ips = NULL;
1560 /* grab a new shiny list of public ips from the node */
1561 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
1562 ctdb->nodes[j]->pnn,
1564 &ctdb->nodes[j]->public_ips)) {
1565 DEBUG(0,("Failed to read public ips from node : %u\n",
1566 ctdb->nodes[j]->pnn));
1572 /* verify that all active nodes agree that we are the recmaster */
1573 switch (verify_recmaster(ctdb, nodemap, pnn)) {
1574 case MONITOR_RECOVERY_NEEDED:
1575 /* can not happen */
1577 case MONITOR_ELECTION_NEEDED:
1578 force_election(rec, mem_ctx, pnn, nodemap);
1582 case MONITOR_FAILED:
1587 /* verify that all active nodes are in normal mode
1588 and not in recovery mode
1590 switch (verify_recmode(ctdb, nodemap)) {
1591 case MONITOR_RECOVERY_NEEDED:
1592 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1594 case MONITOR_FAILED:
1596 case MONITOR_ELECTION_NEEDED:
1597 /* can not happen */
1604 /* get the nodemap for all active remote nodes and verify
1605 they are the same as for this node
1607 for (j=0; j<nodemap->num; j++) {
1608 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1611 if (nodemap->nodes[j].pnn == pnn) {
1615 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1616 mem_ctx, &remote_nodemap);
1618 DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n",
1619 nodemap->nodes[j].pnn));
1623 /* if the nodes disagree on how many nodes there are
1624 then this is a good reason to try recovery
1626 if (remote_nodemap->num != nodemap->num) {
1627 DEBUG(0, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
1628 nodemap->nodes[j].pnn, remote_nodemap->num, nodemap->num));
1629 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1633 /* if the nodes disagree on which nodes exist and are
1634 active, then that is also a good reason to do recovery
1636 for (i=0;i<nodemap->num;i++) {
1637 if (remote_nodemap->nodes[i].pnn != nodemap->nodes[i].pnn) {
1638 DEBUG(0, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
1639 nodemap->nodes[j].pnn, i,
1640 remote_nodemap->nodes[i].pnn, nodemap->nodes[i].pnn));
1641 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
1642 vnnmap, nodemap->nodes[j].pnn);
1645 if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) !=
1646 (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
1647 DEBUG(0, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n",
1648 nodemap->nodes[j].pnn, i,
1649 remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags));
1650 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
1651 vnnmap, nodemap->nodes[j].pnn);
1656 /* update our nodemap flags according to the other
1657 server - this gets the NODE_FLAGS_DISABLED
1658 flag. Note that the remote node is authoritative
1659 for its flags (except CONNECTED, which we know
1660 matches in this code) */
1661 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1662 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1663 need_takeover_run = true;
1668 /* there better be the same number of lmasters in the vnn map
1669 as there are active nodes or we will have to do a recovery
1671 if (vnnmap->size != num_active) {
1672 DEBUG(0, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
1673 vnnmap->size, num_active));
1674 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
1678 /* verify that all active nodes in the nodemap also exist in
1681 for (j=0; j<nodemap->num; j++) {
1682 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1685 if (nodemap->nodes[j].pnn == pnn) {
1689 for (i=0; i<vnnmap->size; i++) {
1690 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
1694 if (i == vnnmap->size) {
1695 DEBUG(0, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
1696 nodemap->nodes[j].pnn));
1697 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1703 /* verify that all other nodes have the same vnnmap
1704 and are from the same generation
1706 for (j=0; j<nodemap->num; j++) {
1707 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1710 if (nodemap->nodes[j].pnn == pnn) {
1714 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1715 mem_ctx, &remote_vnnmap);
1717 DEBUG(0, (__location__ " Unable to get vnnmap from remote node %u\n",
1718 nodemap->nodes[j].pnn));
1722 /* verify the vnnmap generation is the same */
1723 if (vnnmap->generation != remote_vnnmap->generation) {
1724 DEBUG(0, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
1725 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
1726 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1730 /* verify the vnnmap size is the same */
1731 if (vnnmap->size != remote_vnnmap->size) {
1732 DEBUG(0, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
1733 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
1734 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
1738 /* verify the vnnmap is the same */
1739 for (i=0;i<vnnmap->size;i++) {
1740 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
1741 DEBUG(0, (__location__ " Remote node %u has different vnnmap.\n",
1742 nodemap->nodes[j].pnn));
1743 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
1744 vnnmap, nodemap->nodes[j].pnn);
1750 /* we might need to change who has what IP assigned */
1751 if (need_takeover_run && ctdb->vnn) {
1752 ret = ctdb_takeover_run(ctdb, nodemap);
1754 DEBUG(0, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
1755 ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(),
1756 ctdb->pnn, CTDB_RECOVERY_ACTIVE);
1765 event handler for when the main ctdbd dies
1767 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
1768 uint16_t flags, void *private_data)
1770 DEBUG(0,("recovery daemon parent died - exiting\n"));
1777 startup the recovery daemon as a child of the main ctdb daemon
1779 int ctdb_start_recoverd(struct ctdb_context *ctdb)
1785 if (pipe(fd) != 0) {
1801 /* shutdown the transport */
1802 ctdb->methods->shutdown(ctdb);
1804 /* get a new event context */
1805 talloc_free(ctdb->ev);
1806 ctdb->ev = event_context_init(ctdb);
1808 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
1809 ctdb_recoverd_parent, &fd[0]);
1811 close(ctdb->daemon.sd);
1812 ctdb->daemon.sd = -1;
1814 srandom(getpid() ^ time(NULL));
1816 /* initialise ctdb */
1817 ret = ctdb_socket_connect(ctdb);
1819 DEBUG(0, (__location__ " Failed to init ctdb\n"));
1823 monitor_cluster(ctdb);
1825 DEBUG(0,("ERROR: ctdb_recoverd finished!?\n"));