4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
/* global handle to this process's recovery-daemon state (one per recoverd) */
struct ctdb_recoverd *rec;
/*
  private state of the recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;         /* main ctdb context */
	uint32_t last_culprit;             /* pnn of the node last blamed for causing a recovery */
	uint32_t culprit_counter;          /* how many recoveries the current culprit has caused */
	struct timeval first_recover_time; /* when the current culprit first triggered a recovery */
	struct ban_state **banned_nodes;   /* per-pnn ban state; NULL entry == node not banned */
	struct timeval priority_time;      /* election priority; an older time wins the election */
	bool need_takeover_run;            /* a public-IP takeover run is still pending */
	/* NOTE(review): this copy looks truncated — fields such as need_recovery
	   and node_flags are referenced later in this file but are missing here */
	struct timed_event *send_election_te;  /* pending timer for broadcasting election requests */
	struct timed_event *election_timeout;  /* non-NULL while an election is still in progress */
/* timeout applied to each individual control sent during recovery
   (uses the local 'ctdb' variable of the enclosing function) */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
/* interval between monitoring rounds of the cluster */
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/*
  completion callback for controls sent through async_add(): receives the
  reply and records success/failure in the shared async_data accumulator
 */
static void async_callback(struct ctdb_client_control_state *state)
	struct async_data *data = talloc_get_type(state->async.private_data, struct async_data);

	/* one more node has responded with recmode data */

	/* if we failed to push the db, then return an error and let
	   the main loop try again.
	*/
	if (state->state != CTDB_CONTROL_DONE) {
		DEBUG(0,("Async operation failed with state %d\n", state->state));

	/* clear the callback so ctdb_control_recv() does not re-enter us */
	state->async.fn = NULL;

	ret = ctdb_control_recv(state->ctdb, state, data, NULL, &res, NULL);
	if ((ret != 0) || (res != 0)) {
		DEBUG(0,("Async operation failed with ret=%d res=%d\n", ret, (int)res));
/*
  attach the shared completion callback to a freshly sent control and
  account for one more outstanding reply in the async_data accumulator
 */
static void async_add(struct async_data *data, struct ctdb_client_control_state *state)
	/* set up the callback functions */
	state->async.fn = async_callback;
	state->async.private_data = data;

	/* one more control to wait for to complete */
	/* NOTE(review): the line incrementing the outstanding-reply counter
	   appears lost in this copy — restore from upstream */
/* wait for up to the maximum number of seconds allowed
   or until all nodes we expect a response from have replied
*/
static int async_wait(struct ctdb_context *ctdb, struct async_data *data)
	/* pump the event loop until every outstanding control completes */
	while (data->count > 0) {
		event_loop_once(ctdb->ev);

	/* report overall failure if any node failed to reply cleanly */
	if (data->fail_count != 0) {
		DEBUG(0,("Async wait failed - fail_count=%u\n", data->fail_count));
122 static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn)
124 struct ctdb_context *ctdb = rec->ctdb;
126 DEBUG(0,("Unbanning node %u\n", pnn));
128 if (!ctdb_validate_pnn(ctdb, pnn)) {
129 DEBUG(0,("Bad pnn %u in ctdb_unban_node\n", pnn));
133 /* If we are unbanning a different node then just pass the ban info on */
134 if (pnn != ctdb->pnn) {
138 DEBUG(0,("Unanning remote node %u. Passing the ban request on to the remote node.\n", pnn));
140 data.dptr = (uint8_t *)&pnn;
141 data.dsize = sizeof(uint32_t);
143 ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_UNBAN_NODE, data);
145 DEBUG(0,("Failed to unban node %u\n", pnn));
152 /* make sure we remember we are no longer banned in case
153 there is an election */
154 rec->node_flags &= ~NODE_FLAGS_BANNED;
156 DEBUG(0,("Clearing ban flag on node %u\n", pnn));
157 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, 0, NODE_FLAGS_BANNED);
159 if (rec->banned_nodes[pnn] == NULL) {
160 DEBUG(0,("No ban recorded for this node. ctdb_unban_node() request ignored\n"));
164 talloc_free(rec->banned_nodes[pnn]);
165 rec->banned_nodes[pnn] = NULL;
/*
  called when a ban has timed out — automatically unban the node
 */
static void ctdb_ban_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
	struct ban_state *state = talloc_get_type(p, struct ban_state);
	struct ctdb_recoverd *rec = state->rec;
	uint32_t pnn = state->banned_node;

	DEBUG(0,("Ban timeout. Node %u is now unbanned\n", pnn));
	ctdb_unban_node(rec, pnn);
/*
  ban a node for a period of time (in seconds).
  Remote bans are forwarded via CTDB_SRVID_BAN_NODE; a self-ban sets the
  BANNED flag, lowers our election priority, and schedules an automatic
  unban after ban_time seconds.
 */
static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
	struct ctdb_context *ctdb = rec->ctdb;

	DEBUG(0,("Banning node %u for %u seconds\n", pnn, ban_time));

	/* refuse to act on an invalid pnn */
	if (!ctdb_validate_pnn(ctdb, pnn)) {
		DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", pnn));

	/* bans can be switched off entirely via the enable_bans tunable */
	if (0 == ctdb->tunable.enable_bans) {
		DEBUG(0,("Bans are disabled - ignoring ban of node %u\n", pnn));

	/* If we are banning a different node then just pass the ban info on */
	if (pnn != ctdb->pnn) {
		struct ctdb_ban_info b;

		DEBUG(0,("Banning remote node %u for %u seconds. Passing the ban request on to the remote node.\n", pnn, ban_time));

		b.ban_time = ban_time;

		data.dptr = (uint8_t *)&b;
		data.dsize = sizeof(b);

		ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_BAN_NODE, data);
			DEBUG(0,("Failed to ban node %u\n", pnn));

	/* from here on we are banning ourselves */
	DEBUG(0,("self ban - lowering our election priority\n"));
	ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0);

	/* banning ourselves - lower our election priority */
	rec->priority_time = timeval_current();

	/* make sure we remember we are banned in case there is an
	   election */
	rec->node_flags |= NODE_FLAGS_BANNED;

	/* replace any existing ban record for this node */
	if (rec->banned_nodes[pnn] != NULL) {
		DEBUG(0,("Re-banning an already banned node. Remove previous ban and set a new ban.\n"));
		talloc_free(rec->banned_nodes[pnn]);
		rec->banned_nodes[pnn] = NULL;

	rec->banned_nodes[pnn] = talloc(rec->banned_nodes, struct ban_state);
	CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]);

	rec->banned_nodes[pnn]->rec = rec;
	rec->banned_nodes[pnn]->banned_node = pnn;

	/* schedule the automatic unban; the event is talloc-parented on the
	   ban state so freeing the state cancels the timer */
	event_add_timed(ctdb->ev, rec->banned_nodes[pnn],
			timeval_current_ofs(ban_time, 0),
			ctdb_ban_timeout, rec->banned_nodes[pnn]);
/* outcome of one monitoring round of the cluster */
enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/*
  perform a simple control on all active nodes. The control cannot return data.
  Sends the control asynchronously to every node not flagged INACTIVE
  (optionally skipping ourselves) and waits for all replies.
 */
static int async_control_on_active_nodes(struct ctdb_context *ctdb, enum ctdb_controls opcode,
					 struct ctdb_node_map *nodemap, TDB_DATA data, bool include_self)
	struct async_data *async_data;
	struct ctdb_client_control_state *state;

	struct timeval timeout = CONTROL_TIMEOUT();

	async_data = talloc_zero(ctdb, struct async_data);
	CTDB_NO_MEMORY_FATAL(ctdb, async_data);

	/* loop over all active nodes and send an async control to each of them */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
		if (nodemap->nodes[j].pnn == ctdb->pnn && !include_self) {

		state = ctdb_control_send(ctdb, nodemap->nodes[j].pnn, 0, opcode,
					  0, data, async_data, NULL, &timeout, NULL);
			DEBUG(0,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
			talloc_free(async_data);

		async_add(async_data, state);

	/* wait for all of the controls sent above to complete */
	if (async_wait(ctdb, async_data) != 0) {
		DEBUG(0,(__location__ " Failed async control %u\n", (unsigned)opcode));
		talloc_free(async_data);

	talloc_free(async_data);
/*
  change recovery mode on all nodes.
  Entering ACTIVE freezes all nodes first; returning to NORMAL thaws them
  after the mode has been set.
 */
static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
	/* freeze all nodes */
	if (rec_mode == CTDB_RECOVERY_ACTIVE) {
		if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_FREEZE,
						  nodemap, tdb_null, true) != 0) {
			DEBUG(0, (__location__ " Unable to freeze nodes. Recovery failed.\n"));

	/* broadcast the new recovery mode to every active node */
	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&rec_mode;

	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_SET_RECMODE,
					  nodemap, data, true) != 0) {
		DEBUG(0, (__location__ " Unable to set recovery mode. Recovery failed.\n"));

	/* thaw all nodes once normal operation resumes */
	if (rec_mode == CTDB_RECOVERY_NORMAL) {
		if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_THAW,
						  nodemap, tdb_null, true) != 0) {
			DEBUG(0, (__location__ " Unable to thaw nodes. Recovery failed.\n"));
/*
  change recovery master on all nodes to the given pnn
 */
static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&pnn;

	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_SET_RECMASTER,
					  nodemap, data, true) != 0) {
		DEBUG(0, (__location__ " Unable to set recmaster. Recovery failed.\n"));
/*
  ensure all other nodes have attached to any databases that we have
 */
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					   uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that all other nodes have all our databases */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {

		/* dont check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
			DEBUG(0, (__location__ " Unable to get dbids from node %u\n", pnn));

		/* step through all local databases */
		for (db=0; db<dbmap->num;db++) {
			/* search for this db id on the remote node */
			for (i=0;i<remote_dbmap->num;i++) {
				if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
			/* the remote node already have this database */
			if (i!=remote_dbmap->num) {
			/* ok so we need to create this database */
			ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
				DEBUG(0, (__location__ " Unable to get dbname from node %u\n", pnn));
			ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					   mem_ctx, name, dbmap->dbs[db].persistent);
				DEBUG(0, (__location__ " Unable to create remote db:%s\n", name));
/*
  ensure we are attached to any databases that anyone else is attached to.
  On success *dbmap is re-read so the caller sees the updated local db list.
 */
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					  uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that we have all database any other node has */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {

		/* dont check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
			DEBUG(0, (__location__ " Unable to get dbids from node %u\n", pnn));

		/* step through all databases on the remote node */
		for (db=0; db<remote_dbmap->num;db++) {
			/* search for this db id in our local dbmap */
			for (i=0;i<(*dbmap)->num;i++) {
				if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
			/* we already have this db locally */
			if (i!=(*dbmap)->num) {
			/* ok so we need to create this database and
			   rebuild dbmap
			 */
			ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					    remote_dbmap->dbs[db].dbid, mem_ctx, &name);
				DEBUG(0, (__location__ " Unable to get dbname from node %u\n",
					  nodemap->nodes[j].pnn));
			ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
					   remote_dbmap->dbs[db].persistent);
				DEBUG(0, (__location__ " Unable to create local db:%s\n", name));
			/* re-read our dbmap so it reflects the new database */
			ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
				DEBUG(0, (__location__ " Unable to reread dbmap on node %u\n", pnn));
/*
  pull the remote database contents from one node into the recdb.
  Records are merged by rsn: an incoming record only replaces an existing
  one when it has a higher rsn (or equal rsn with a non-recmaster dmaster).
 */
static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
				    struct tdb_wrap *recdb, uint32_t dbid)
	struct ctdb_control_pulldb_reply *reply;
	struct ctdb_rec_data *rec;

	TALLOC_CTX *tmp_ctx = talloc_new(recdb);

	ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
			       CONTROL_TIMEOUT(), &outdata);
		DEBUG(0,(__location__ " Unable to copy db from node %u\n", srcnode));
		talloc_free(tmp_ctx);

	reply = (struct ctdb_control_pulldb_reply *)outdata.dptr;

	/* sanity-check the reply is at least big enough for its header */
	if (outdata.dsize < offsetof(struct ctdb_control_pulldb_reply, data)) {
		DEBUG(0,(__location__ " invalid data in pulldb reply\n"));
		talloc_free(tmp_ctx);

	/* walk the packed record list; each record is advanced by its length */
	rec = (struct ctdb_rec_data *)&reply->data[0];
	     rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
		struct ctdb_ltdb_header *hdr;

		/* key bytes are followed immediately by the data bytes */
		key.dptr = &rec->data[0];
		key.dsize = rec->keylen;
		data.dptr = &rec->data[key.dsize];
		data.dsize = rec->datalen;

		hdr = (struct ctdb_ltdb_header *)data.dptr;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(0,(__location__ " bad ltdb record\n"));
			talloc_free(tmp_ctx);

		/* fetch the existing record, if any */
		existing = tdb_fetch(recdb->tdb, key);

		if (existing.dptr != NULL) {
			struct ctdb_ltdb_header header;
			if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
				DEBUG(0,(__location__ " Bad record size %u from node %u\n",
					 (unsigned)existing.dsize, srcnode));
				talloc_free(tmp_ctx);
			header = *(struct ctdb_ltdb_header *)existing.dptr;
			/* keep the existing record unless the incoming one
			   wins the rsn/dmaster comparison */
			if (!(header.rsn < hdr->rsn ||
			      (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {

		if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
			DEBUG(0,(__location__ " Failed to store record\n"));
			talloc_free(tmp_ctx);

	talloc_free(tmp_ctx);
/*
  pull all the remote database contents into the recdb
 */
static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
				struct tdb_wrap *recdb, uint32_t dbid)
	/* pull all records from all other nodes across onto this node
	   (this merges based on rsn)
	*/
	for (j=0; j<nodemap->num; j++) {
		/* dont merge from nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
		if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
			DEBUG(0,(__location__ " Failed to pull remote database from node %u\n",
				 nodemap->nodes[j].pnn));
/*
  update flags on all active nodes by broadcasting a NODE_FLAGS_CHANGED
  message for every node (old and new flags are sent identical here)
 */
static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
	for (i=0;i<nodemap->num;i++) {
		struct ctdb_node_flag_change c;

		c.pnn = nodemap->nodes[i].pnn;
		c.old_flags = nodemap->nodes[i].flags;
		c.new_flags = nodemap->nodes[i].flags;

		data.dptr = (uint8_t *)&c;
		data.dsize = sizeof(c);

		/* best-effort broadcast; the send result is not checked here */
		ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
				  CTDB_SRVID_NODE_FLAGS_CHANGED, data);
/*
  ensure all nodes have the same vnnmap we do
 */
static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
				      uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
	/* push the new vnn map out to all the nodes */
	for (j=0; j<nodemap->num; j++) {
		/* dont push to nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {

		ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
			DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/*
  handler for when the admin bans a node: validates the message payload
  and applies the ban locally (only honoured when addressed to us)
 */
static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
			TDB_DATA data, void *private_data)
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);

	/* reject malformed payloads */
	if (data.dsize != sizeof(*b)) {
		DEBUG(0,("Bad data in ban_handler\n"));
		talloc_free(mem_ctx);

	/* bans are only applied by the node they target */
	if (b->pnn != ctdb->pnn) {
		DEBUG(0,("Got a ban request for pnn:%u but our pnn is %u. Ignoring ban request\n", b->pnn, ctdb->pnn));

	DEBUG(0,("Node %u has been banned for %u seconds\n",
		 b->pnn, b->ban_time));

	ctdb_ban_node(rec, b->pnn, b->ban_time);
	talloc_free(mem_ctx);
/*
  handler for when the admin unbans a node: validates the payload and
  clears the ban locally (only honoured when addressed to us)
 */
static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
			  TDB_DATA data, void *private_data)
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);

	/* payload must be exactly one pnn */
	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(0,("Bad data in unban_handler\n"));
		talloc_free(mem_ctx);

	pnn = *(uint32_t *)data.dptr;

	/* unbans are only applied by the node they target */
	if (pnn != ctdb->pnn) {
		DEBUG(0,("Got an unban request for pnn:%u but our pnn is %u. Ignoring unban request\n", pnn, ctdb->pnn));

	DEBUG(0,("Node %u has been unbanned.\n", pnn));
	ctdb_unban_node(rec, pnn);
	talloc_free(mem_ctx);
/*
  called when ctdb_wait_timeout should finish — flags completion through
  the pointed-to uint32_t so the waiting loop can exit
 */
static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
			      struct timeval yt, void *p)
	uint32_t *timed_out = (uint32_t *)p;
/*
  wait for a given number of seconds, pumping the event loop so other
  events keep being serviced while we wait
 */
static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
	uint32_t timed_out = 0;
	event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
		event_loop_once(ctdb->ev);
/*
  called when an election times out (ends) — clearing election_timeout
  is what ctdb_wait_election() polls for
 */
static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
				  struct timeval t, void *p)
	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
	rec->election_timeout = NULL;
/*
  wait for an election to finish. It finishes election_timeout seconds after
  the last election packet is received
 */
static void ctdb_wait_election(struct ctdb_recoverd *rec)
	struct ctdb_context *ctdb = rec->ctdb;
	/* spin the event loop until ctdb_election_timeout() clears the timer */
	while (rec->election_timeout) {
		event_loop_once(ctdb->ev);
/*
  remember the trouble maker: track which node keeps causing recoveries,
  resetting the count when the culprit changes or the grace period expires
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
	struct ctdb_context *ctdb = rec->ctdb;

	if (rec->last_culprit != culprit ||
	    timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
		DEBUG(0,("New recovery culprit %u\n", culprit));
		/* either a new node is the culprit, or we've decided to forgive them */
		rec->last_culprit = culprit;
		rec->first_recover_time = timeval_current();
		rec->culprit_counter = 0;
	rec->culprit_counter++;
/*
  Update our local flags from all remote connected nodes.
  This is only run when we are, or we believe we are, the recovery master.
  Returns a monitor_result value: MONITOR_FAILED when a remote nodemap
  cannot be fetched, MONITOR_ELECTION_NEEDED when BANNED flags diverge.
 */
static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);

	/* get the nodemap for all active remote nodes and verify
	   they are the same as for this node
	 */
	for (j=0; j<nodemap->num; j++) {
		struct ctdb_node_map *remote_nodemap=NULL;

		if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
		if (nodemap->nodes[j].pnn == ctdb->pnn) {

		ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					   mem_ctx, &remote_nodemap);
			DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			talloc_free(mem_ctx);
			return MONITOR_FAILED;

		/* the remote node's own view of its flags wins over ours */
		if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
			struct ctdb_node_flag_change c;

			/* We should tell our daemon about this so it
			   updates its flags or else we will log the same
			   message again in the next iteration of recovery.
			   Since we are the recovery master we can just as
			   well update the flags on all nodes.
			*/
			c.pnn = nodemap->nodes[j].pnn;
			c.old_flags = nodemap->nodes[j].flags;
			c.new_flags = remote_nodemap->nodes[j].flags;

			data.dptr = (uint8_t *)&c;
			data.dsize = sizeof(c);

			ctdb_send_message(ctdb, ctdb->pnn,
					  CTDB_SRVID_NODE_FLAGS_CHANGED,

			/* Update our local copy of the flags in the recovery
			   daemon's nodemap as well
			 */
			DEBUG(0,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
				 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
				 nodemap->nodes[j].flags));
			nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;

			/* If the BANNED flag has changed for the node
			   this is a good reason to do a new election.
			 */
			if ((c.old_flags ^ c.new_flags) & NODE_FLAGS_BANNED) {
				DEBUG(0,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n",
					 nodemap->nodes[j].pnn, c.new_flags,
				talloc_free(mem_ctx);
				return MONITOR_ELECTION_NEEDED;

		talloc_free(remote_nodemap);
	talloc_free(mem_ctx);
/* Create a new random generation id.
   The generation id can not be the INVALID_GENERATION id
*/
static uint32_t new_generation(void)
		/* keep drawing random values until we get a usable one */
		generation = random();

		if (generation != INVALID_GENERATION) {
/*
  create a temporary working database used to merge all remote copies
  during recovery; opened O_EXCL so a stale file causes a visible error
 */
static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
	struct tdb_wrap *recdb;

	/* open up the temporary recovery database */
	name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);

	/* TDB_NOLOCK: only this process touches the temp db during recovery */
	recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
			      TDB_NOLOCK, O_RDWR|O_CREAT|O_EXCL, 0600);
		DEBUG(0,(__location__ " Failed to create temp recovery database '%s'\n", name));
/*
  a traverse function for pulling all relevant records from recdb
  (state shared with traverse_recdb below)
 */
	struct ctdb_context *ctdb;
	struct ctdb_control_pulldb_reply *recdata; /* marshalled records being accumulated */
/*
  tdb traverse callback: marshal one recdb record into the outgoing
  pulldb reply blob, rewriting its dmaster to point at this node
 */
static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
	struct recdb_data *params = (struct recdb_data *)p;
	struct ctdb_rec_data *rec;
	struct ctdb_ltdb_header *hdr;

	/* skip empty records */
	if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {

	/* update the dmaster field to point to us */
	hdr = (struct ctdb_ltdb_header *)data.dptr;
	hdr->dmaster = params->ctdb->pnn;

	/* add the record to the blob ready to send to the nodes */
	rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
		params->failed = true;

	/* grow the blob to make room for this record */
	params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
	if (params->recdata == NULL) {
		/* NOTE(review): params->recdata is NULL here, so the ->count
		   dereference below would crash on OOM — latent bug */
		DEBUG(0,(__location__ " Failed to expand recdata to %u (%u records)\n",
			 rec->length + params->len, params->recdata->count));
		params->failed = true;

	/* append the marshalled record and bump the counters */
	params->recdata->count++;
	memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
	params->len += rec->length;
/*
  push the recdb database out to all nodes: marshal every record via
  traverse_recdb, then broadcast the blob with CTDB_CONTROL_PUSH_DB
 */
static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
			       struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
	struct recdb_data params;
	struct ctdb_control_pulldb_reply *recdata;

	recdata = talloc_zero(recdb, struct ctdb_control_pulldb_reply);
	CTDB_NO_MEMORY(ctdb, recdata);

	recdata->db_id = dbid;

	/* seed the traverse state; len starts at the header size */
	params.recdata = recdata;
	params.len = offsetof(struct ctdb_control_pulldb_reply, data);
	params.failed = false;

	if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
		DEBUG(0,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);

	/* also treat a mid-traverse marshalling failure as fatal */
		DEBUG(0,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);

	/* the traverse may have reallocated the blob — pick up the new pointer */
	recdata = params.recdata;

	outdata.dptr = (void *)recdata;
	outdata.dsize = params.len;

	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_PUSH_DB, nodemap, outdata, true) != 0) {
		DEBUG(0,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
		talloc_free(recdata);

	DEBUG(0, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
		  dbid, recdata->count));

	talloc_free(recdata);
/*
  go through a full recovery on one database: pull all remote copies
  into a temporary recdb (merged by rsn), wipe the database on every
  node inside the running transaction, then push the merged contents back
 */
static int recover_database(struct ctdb_recoverd *rec,
			    struct ctdb_node_map *nodemap,
			    uint32_t transaction_id)
	struct tdb_wrap *recdb;

	struct ctdb_context *ctdb = rec->ctdb;

	struct ctdb_control_wipe_database w;

	recdb = create_recdb(ctdb, mem_ctx);
	if (recdb == NULL) {

	/* pull all remote databases onto the recdb */
	ret = pull_remote_database(ctdb, nodemap, recdb, dbid);
		DEBUG(0, (__location__ " Unable to pull remote database 0x%x\n", dbid));

	DEBUG(0, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));

	/* wipe all the remote databases. This is safe as we are in a transaction */
	w.transaction_id = transaction_id;

	data.dptr = (void *)&w;
	data.dsize = sizeof(w);

	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_WIPE_DATABASE,
					  nodemap, data, true) != 0) {
		DEBUG(0, (__location__ " Unable to wipe database. Recovery failed.\n"));

	/* push out the correct database. This sets the dmaster and skips
	   the empty records */
	ret = push_recdb_database(ctdb, dbid, recdb, nodemap);

	/* all done with this database */
/*
  we are the recmaster, and recovery is needed - start a recovery run.
  Sequence: blame/ban the culprit, take the recovery lock, attach any
  missing databases everywhere, freeze the cluster, recover every database
  under a cluster-wide transaction, build and push a new vnnmap, re-assert
  ourselves as recmaster, sync flags, run IP takeover, thaw, and finally
  suppress re-recovery for the rerecovery timeout.
 */
static int do_recovery(struct ctdb_recoverd *rec,
		       TALLOC_CTX *mem_ctx, uint32_t pnn, uint32_t num_active,
		       struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t generation;
	struct ctdb_dbid_map *dbmap;

	DEBUG(0, (__location__ " Starting do_recovery\n"));

	/* if recovery fails, force it again */
	rec->need_recovery = true;

	ctdb_set_culprit(rec, culprit);

	/* ban a node that has caused too many recoveries in a short time */
	if (rec->culprit_counter > 2*nodemap->num) {
		DEBUG(0,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
			 culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
			 ctdb->tunable.recovery_ban_period));
		ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);

	/* only one recmaster may hold the recovery lock at a time */
	if (!ctdb_recovery_lock(ctdb, true)) {
		ctdb_set_culprit(rec, pnn);
		DEBUG(0,("Unable to get recovery lock - aborting recovery\n"));

	DEBUG(0, (__location__ " Recovery initiated due to problem with node %u\n", culprit));

	/* get a list of all databases */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
		DEBUG(0, (__location__ " Unable to get dbids from node :%u\n", pnn));

	/* we do the db creation before we set the recovery mode, so the freeze happens
	   on all databases we will be dealing with. */

	/* verify that we have all the databases any other node has */
	ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
		DEBUG(0, (__location__ " Unable to create missing local databases\n"));

	/* verify that all other nodes have all our databases */
	ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
		DEBUG(0, (__location__ " Unable to create missing remote databases\n"));
	DEBUG(0, (__location__ " Recovery - created remote databases\n"));

	/* set recovery mode to active on all nodes */
	ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
		DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));

	/* pick a new generation number */
	generation = new_generation();

	/* change the vnnmap on this node to use the new generation
	   number but not on any other nodes.
	   this guarantees that if we abort the recovery prematurely
	   for some reason (a node stops responding?)
	   that we can just return immediately and we will reenter
	   recovery shortly again.
	   I.e. we deliberately leave the cluster with an inconsistent
	   generation id to allow us to abort recovery at any stage and
	   just restart it from scratch.
	 */
	vnnmap->generation = generation;
	ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
		DEBUG(0, (__location__ " Unable to set vnnmap for node %u\n", pnn));

	/* start a cluster-wide transaction tagged with the new generation */
	data.dptr = (void *)&generation;
	data.dsize = sizeof(uint32_t);

	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_TRANSACTION_START,
					  nodemap, data, true) != 0) {
		DEBUG(0, (__location__ " Unable to start transactions. Recovery failed.\n"));

	DEBUG(0,(__location__ " started transactions on all nodes\n"));

	/* recover every database inside the transaction */
	for (i=0;i<dbmap->num;i++) {
		if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
			DEBUG(0, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));

	DEBUG(0, (__location__ " Recovery - starting database commits\n"));

	/* commit all the changes */
	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
					  nodemap, data, true) != 0) {
		DEBUG(0, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));

	DEBUG(0, (__location__ " Recovery - committed databases\n"));

	/* build a new vnn map with all the currently active and
	   unbanned nodes */
	generation = new_generation();
	vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
	CTDB_NO_MEMORY(ctdb, vnnmap);
	vnnmap->generation = generation;
	vnnmap->size = num_active;
	vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
	for (i=j=0;i<nodemap->num;i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
			vnnmap->map[j++] = nodemap->nodes[i].pnn;

	/* update to the new vnnmap on all nodes */
	ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
		DEBUG(0, (__location__ " Unable to update vnnmap on all nodes\n"));

	DEBUG(0, (__location__ " Recovery - updated vnnmap\n"));

	/* update recmaster to point to us for all nodes */
	ret = set_recovery_master(ctdb, nodemap, pnn);
		DEBUG(0, (__location__ " Unable to set recovery master\n"));

	DEBUG(0, (__location__ " Recovery - updated recmaster\n"));

	/*
	  update all nodes to have the same flags that we have
	 */
	ret = update_flags_on_all_nodes(ctdb, nodemap);
		DEBUG(0, (__location__ " Unable to update flags on all nodes\n"));

	DEBUG(0, (__location__ " Recovery - updated flags\n"));

	/*
	  if enabled, tell nodes to takeover their public IPs
	 */
		rec->need_takeover_run = false;
		ret = ctdb_takeover_run(ctdb, nodemap);
			DEBUG(0, (__location__ " Unable to setup public takeover addresses\n"));
		DEBUG(1, (__location__ " Recovery - done takeover\n"));

	/* disable recovery mode */
	ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
		DEBUG(0, (__location__ " Unable to set recovery mode to normal on cluster\n"));

	/* send a message to all clients telling them that the cluster
	   has been reconfigured */
	ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);

	DEBUG(0, (__location__ " Recovery complete\n"));

	rec->need_recovery = false;

	/* We just finished a recovery successfully.
	   We now wait for rerecovery_timeout before we allow
	   another recovery to take place.
	 */
	DEBUG(0, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
	ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
	DEBUG(0, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;           /* how many nodes this candidate can see */
	struct timeval priority_time;     /* candidate's start time; older wins */
	uint32_t node_flags;              /* candidate's flags (BANNED disqualifies) */
/*
  form this node's election data: our pnn, priority time, flags and a
  count of the nodes we are currently connected to
 */
static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
	struct ctdb_node_map *nodemap;
	struct ctdb_context *ctdb = rec->ctdb;

	em->pnn = rec->ctdb->pnn;
	em->priority_time = rec->priority_time;
	em->node_flags = rec->node_flags;

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
		DEBUG(0,(__location__ " unable to get election data\n"));

	/* count every node that is not disconnected from us */
	for (i=0;i<nodemap->num;i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
			em->num_connected++;
	talloc_free(nodemap);
/*
  see if the given election data wins against our own: bans disqualify,
  then most-connected wins, then oldest priority_time, then lowest pnn
 */
static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
	struct election_message myem;

	ctdb_election_data(rec, &myem);

	/* we cant win if we are banned */
	if (rec->node_flags & NODE_FLAGS_BANNED) {

	/* we will automatically win if the other node is banned */
	if (em->node_flags & NODE_FLAGS_BANNED) {

	/* try to use the most connected node */
	cmp = (int)myem.num_connected - (int)em->num_connected;

	/* then the longest running node */
	cmp = timeval_compare(&em->priority_time, &myem.priority_time);

	/* finally fall back to the lowest pnn */
	cmp = (int)myem.pnn - (int)em->pnn;
1330 send out an election request
/*
  Broadcast our election data on CTDB_SRVID_RECOVERY, after optimistically
  setting ourselves as recmaster on the local daemon.  pnn is this node's
  pnn.  Returns 0 on success (return lines not visible in this extraction).
  Note election_data.dptr points at the stack-local emsg; this is safe only
  because ctdb_send_message copies/serializes the payload before returning
  - TODO confirm against the messaging layer.
*/
1332 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1335 TDB_DATA election_data;
1336 struct election_message emsg;
1338 struct ctdb_context *ctdb = rec->ctdb;
1340 srvid = CTDB_SRVID_RECOVERY;
1342 ctdb_election_data(rec, &emsg);
1344 election_data.dsize = sizeof(struct election_message);
1345 election_data.dptr = (unsigned char *)&emsg;
1348 /* first we assume we will win the election and set
1349 recoverymaster to be ourself on the current node
1351 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1353 DEBUG(0, (__location__ " failed to send recmaster election request\n"));
1358 /* send an election message to all active nodes */
1359 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1365 this function will unban all nodes in the cluster
/*
  Clear the BANNED flag on every connected node in the cluster via
  ctdb_ctrl_modflags.  Disconnected nodes are skipped (we cannot reach
  them).  All temporary state hangs off tmp_ctx so one talloc_free
  releases the nodemap too.
*/
1367 static void unban_all_nodes(struct ctdb_context *ctdb)
1370 struct ctdb_node_map *nodemap;
1371 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1373 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1375 DEBUG(0,(__location__ " failed to get nodemap to unban all nodes\n"));
1379 for (i=0;i<nodemap->num;i++) {
1380 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1381 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
/* modflags(set=0, clear=BANNED): only clears the ban bit */
1382 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1386 talloc_free(tmp_ctx);
1391 we think we are winning the election - send a broadcast election request
/*
  One-shot timed-event callback (scheduled from election_handler with a
  500ms delay): rebroadcast our election request, then free and clear
  the one-shot timer handle so a new one can be scheduled.
*/
1393 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1395 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1398 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1400 DEBUG(0,("Failed to send election request!\n"));
/* clear the handle so election_handler can re-arm the timer */
1403 talloc_free(rec->send_election_te);
1404 rec->send_election_te = NULL;
1408 handler for recovery master elections
/*
  Message handler for CTDB_SRVID_RECOVERY election packets.
  Every packet extends the election window (rec->election_timeout).
  If our own election data beats the sender's we schedule a delayed
  rebroadcast of our candidacy; otherwise we concede: drop the
  recovery lock if we held it, unban everyone, and record the sender
  as recmaster on the local daemon.
  NOTE(review): several control-flow lines (returns, `if (ret != 0) {`
  guards, else branches) are missing from this extraction - the exact
  branch structure must be confirmed against upstream recoverd.c.
*/
1410 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1411 TDB_DATA data, void *private_data)
1413 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1415 struct election_message *em = (struct election_message *)data.dptr;
1416 TALLOC_CTX *mem_ctx;
1418 /* we got an election packet - update the timeout for the election */
1419 talloc_free(rec->election_timeout);
1420 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1421 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1422 ctdb_election_timeout, rec);
1424 mem_ctx = talloc_new(ctdb);
1426 /* someone called an election. check their election data
1427 and if we disagree and we would rather be the elected node,
1428 send a new election message to all other nodes
1430 if (ctdb_election_win(rec, em)) {
/* delay the rebroadcast 500ms so simultaneous winners don't storm */
1431 if (!rec->send_election_te) {
1432 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1433 timeval_current_ofs(0, 500000),
1434 election_send_request, rec);
1436 talloc_free(mem_ctx);
1437 /*unban_all_nodes(ctdb);*/
/* we lost: cancel any pending rebroadcast of our candidacy */
1442 talloc_free(rec->send_election_te);
1443 rec->send_election_te = NULL;
1445 /* release the recmaster lock */
1446 if (em->pnn != ctdb->pnn &&
1447 ctdb->recovery_lock_fd != -1) {
1448 close(ctdb->recovery_lock_fd);
1449 ctdb->recovery_lock_fd = -1;
1450 unban_all_nodes(ctdb);
1453 /* ok, let that guy become recmaster then */
1454 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1456 DEBUG(0, (__location__ " failed to send recmaster election request"));
1457 talloc_free(mem_ctx);
1461 /* release any bans */
/* freeing the banned_nodes array also destroys all ban_state children */
1462 rec->last_culprit = (uint32_t)-1;
1463 talloc_free(rec->banned_nodes);
1464 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1465 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1467 talloc_free(mem_ctx);
1473 force the start of the election process
/*
  Kick off a recmaster election: freeze the cluster (recovery mode
  ACTIVE stops internode traffic), arm/re-arm the election timeout,
  broadcast our candidacy, then block until the election settles
  (ctdb_wait_election).  Called whenever the monitor loop detects a
  missing, disconnected or inactive recmaster.
*/
1475 static void force_election(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t pnn,
1476 struct ctdb_node_map *nodemap)
1479 struct ctdb_context *ctdb = rec->ctdb;
1481 /* set all nodes to recovery mode to stop all internode traffic */
1482 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1484 DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n"));
1488 talloc_free(rec->election_timeout);
1489 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1490 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1491 ctdb_election_timeout, rec);
1493 ret = send_election_request(rec, pnn);
1495 DEBUG(0, (__location__ " failed to initiate recmaster election"));
1499 /* wait for a few seconds to collect all responses */
1500 ctdb_wait_election(rec);
1506 handler for when a node changes its flags
/*
  Message handler for CTDB_SRVID_NODE_FLAGS_CHANGED.  Validates the
  payload, syncs the changed flags into our local nodemap copy (while
  protecting the locally-managed DISCONNECTED bit from remote updates),
  and - if we are the recmaster in normal mode - schedules an IP
  takeover run when the DISABLED flag changed.
  NOTE(review): the guards around the getrecmaster/getrecmode calls and
  the full condition feeding the final if are missing from this
  extraction - confirm structure against upstream recoverd.c.
*/
1508 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1509 TDB_DATA data, void *private_data)
1512 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1513 struct ctdb_node_map *nodemap=NULL;
1514 TALLOC_CTX *tmp_ctx;
1515 uint32_t changed_flags;
1517 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* reject malformed messages before touching the payload */
1519 if (data.dsize != sizeof(*c)) {
1520 DEBUG(0,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1524 tmp_ctx = talloc_new(ctdb);
1525 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1527 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1529 DEBUG(0,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1530 talloc_free(tmp_ctx);
/* locate the node the message refers to */
1535 for (i=0;i<nodemap->num;i++) {
1536 if (nodemap->nodes[i].pnn == c->pnn) break;
1539 if (i == nodemap->num) {
1540 DEBUG(0,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1541 talloc_free(tmp_ctx);
1545 changed_flags = c->old_flags ^ c->new_flags;
1547 /* Dont let messages from remote nodes change the DISCONNECTED flag.
1548 This flag is handled locally based on whether the local node
1549 can communicate with the node or not.
1551 c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
1552 if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) {
1553 c->new_flags |= NODE_FLAGS_DISCONNECTED;
1556 if (nodemap->nodes[i].flags != c->new_flags) {
1557 DEBUG(0,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1560 nodemap->nodes[i].flags = c->new_flags;
/* refresh our cached view of who is recmaster and the recovery mode */
1562 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1563 CTDB_CURRENT_NODE, &ctdb->recovery_master);
1566 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1567 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
1571 ctdb->recovery_master == ctdb->pnn &&
1572 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL &&
1574 /* Only do the takeover run if the perm disabled or unhealthy
1575 flags changed since these will cause an ip failover but not
1577 If the node became disconnected or banned this will also
1578 lead to an ip address failover but that is handled
1581 if (changed_flags & NODE_FLAGS_DISABLED) {
/* flag only; the actual takeover run happens in the monitor loop */
1582 rec->need_takeover_run = true;
1586 talloc_free(tmp_ctx);
/*
  Shared state for the async recmode poll in verify_recmode().
  NOTE(review): a `uint32_t count` member (outstanding replies, waited
  on in verify_recmode) is referenced but its declaration line is not
  visible in this extraction - confirm against upstream.
*/
1591 struct verify_recmode_normal_data {
1593 enum monitor_result status; /* aggregate verdict; worst result wins */
/*
  Async completion callback for one node's getrecmode reply.
  Downgrades rmdata->status to MONITOR_FAILED on transport failure, or
  to MONITOR_RECOVERY_NEEDED if the node reports it is in recovery
  mode.  The outstanding-reply counter decrement is presumably in the
  lines elided from this extraction - confirm upstream.
*/
1596 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1598 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
1601 /* one more node has responded with recmode data*/
1604 /* if we failed to get the recmode, then return an error and let
1605 the main loop try again.
1607 if (state->state != CTDB_CONTROL_DONE) {
/* keep the first/worst status; don't overwrite an earlier failure */
1608 if (rmdata->status == MONITOR_OK) {
1609 rmdata->status = MONITOR_FAILED;
1614 /* if we got a response, then the recmode will be stored in the
1617 if (state->status != CTDB_RECOVERY_NORMAL) {
1618 DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
1619 rmdata->status = MONITOR_RECOVERY_NEEDED;
1626 /* verify that all nodes are in normal recovery mode */
/*
  Poll every active node's recovery mode asynchronously and pump the
  event loop until all replies are in, then return the aggregate
  monitor_result.  Skips INACTIVE (disconnected/banned/stopped) nodes.
  Returns MONITOR_FAILED immediately if a control could not even be
  sent.  All allocations hang off mem_ctx and are freed on every exit
  path visible here.
*/
1627 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1629 struct verify_recmode_normal_data *rmdata;
1630 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1631 struct ctdb_client_control_state *state;
1632 enum monitor_result status;
1635 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1636 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1638 rmdata->status = MONITOR_OK;
1640 /* loop over all active nodes and send an async getrecmode call to
1642 for (j=0; j<nodemap->num; j++) {
1643 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1646 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1648 nodemap->nodes[j].pnn);
1649 if (state == NULL) {
1650 /* we failed to send the control, treat this as
1651 an error and try again next iteration
1653 DEBUG(0,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1654 talloc_free(mem_ctx);
1655 return MONITOR_FAILED;
1658 /* set up the callback functions */
1659 state->async.fn = verify_recmode_normal_callback;
1660 state->async.private_data = rmdata;
1662 /* one more control to wait for to complete */
1667 /* now wait for up to the maximum number of seconds allowed
1668 or until all nodes we expect a response from has replied
/* rmdata->count is decremented by the callback as replies arrive */
1670 while (rmdata->count > 0) {
1671 event_loop_once(ctdb->ev);
/* copy status out before freeing rmdata along with mem_ctx */
1674 status = rmdata->status;
1675 talloc_free(mem_ctx);
/*
  Shared state for the async recmaster poll in verify_recmaster().
  NOTE(review): `count` (outstanding replies) and `pnn` (our pnn, used
  by the callback) members are referenced but their declaration lines
  are not visible in this extraction - confirm against upstream.
*/
1680 struct verify_recmaster_data {
1683 enum monitor_result status; /* aggregate verdict; worst result wins */
/*
  Async completion callback for one node's getrecmaster reply.
  Downgrades rmdata->status to MONITOR_FAILED on transport failure, or
  to MONITOR_ELECTION_NEEDED if the node does not agree that we
  (rmdata->pnn) are the recmaster.  The outstanding-reply counter
  decrement is presumably in the elided lines - confirm upstream.
*/
1686 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1688 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
1691 /* one more node has responded with recmaster data*/
1694 /* if we failed to get the recmaster, then return an error and let
1695 the main loop try again.
1697 if (state->state != CTDB_CONTROL_DONE) {
/* keep the first/worst status; don't overwrite an earlier failure */
1698 if (rmdata->status == MONITOR_OK) {
1699 rmdata->status = MONITOR_FAILED;
1704 /* if we got a response, then the recmaster will be stored in the
1707 if (state->status != rmdata->pnn) {
1708 DEBUG(0,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
1709 rmdata->status = MONITOR_ELECTION_NEEDED;
1716 /* verify that all nodes agree that we are the recmaster */
/*
  Poll every active node asynchronously for who it thinks is the
  recmaster, pump the event loop until all replies are in, and return
  the aggregate monitor_result (MONITOR_ELECTION_NEEDED when any node
  disagrees that pnn is recmaster).  Structure mirrors verify_recmode;
  skips INACTIVE nodes and fails fast if a control cannot be sent.
*/
1717 static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
1719 struct verify_recmaster_data *rmdata;
1720 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1721 struct ctdb_client_control_state *state;
1722 enum monitor_result status;
1725 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1726 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1729 rmdata->status = MONITOR_OK;
1731 /* loop over all active nodes and send an async getrecmaster call to
1733 for (j=0; j<nodemap->num; j++) {
1734 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1737 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
1739 nodemap->nodes[j].pnn);
1740 if (state == NULL) {
1741 /* we failed to send the control, treat this as
1742 an error and try again next iteration
1744 DEBUG(0,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
1745 talloc_free(mem_ctx);
1746 return MONITOR_FAILED;
1749 /* set up the callback functions */
1750 state->async.fn = verify_recmaster_callback;
1751 state->async.private_data = rmdata;
1753 /* one more control to wait for to complete */
1758 /* now wait for up to the maximum number of seconds allowed
1759 or until all nodes we expect a response from has replied
/* rmdata->count is decremented by the callback as replies arrive */
1761 while (rmdata->count > 0) {
1762 event_loop_once(ctdb->ev);
/* copy status out before freeing rmdata along with mem_ctx */
1765 status = rmdata->status;
1766 talloc_free(mem_ctx);
1772 the main monitoring loop
/*
  Main loop of the recovery daemon.  One-time setup (allocate the
  recoverd state, register message handlers), then an endless
  once-per-recover_interval cycle that:
    1. refreshes tunables, pnn, vnnmap, nodemap and the recmaster;
    2. forces an election when no/a dead recmaster is found;
    3. reconciles local vs recmaster ban state;
    4. verifies public-IP assignment, freezing locally on mismatch;
    5. (recmaster only) verifies everyone agrees we are recmaster,
       everyone is in NORMAL mode, the reclock fd is healthy, and all
       remote nodemaps/vnnmaps match ours - calling do_recovery() on
       any inconsistency - and finally runs a pending IP takeover.
  NOTE(review): this extraction is lossy - `again:`/`goto again`
  labels, `continue`s, returns and most `if (ret != 0) {` guards are
  missing; confirm exact control flow against upstream recoverd.c.
*/
1774 static void monitor_cluster(struct ctdb_context *ctdb)
1776 uint32_t pnn, num_active, recmaster;
1777 TALLOC_CTX *mem_ctx=NULL;
1778 struct ctdb_node_map *nodemap=NULL;
1779 struct ctdb_node_map *remote_nodemap=NULL;
1780 struct ctdb_vnn_map *vnnmap=NULL;
1781 struct ctdb_vnn_map *remote_vnnmap=NULL;
1783 struct ctdb_recoverd *rec;
1784 struct ctdb_all_public_ips *ips;
1787 DEBUG(0,("monitor_cluster starting\n"));
/* --- one-time setup: recoverd state + message handlers --- */
1789 rec = talloc_zero(ctdb, struct ctdb_recoverd);
1790 CTDB_NO_MEMORY_FATAL(ctdb, rec);
1793 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1794 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
/* priority_time is our election credential: earlier start wins ties */
1796 rec->priority_time = timeval_current();
1798 /* register a message port for recovery elections */
1799 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
1801 /* and one for when nodes are disabled/enabled */
1802 ctdb_set_message_handler(ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, monitor_handler, rec);
1804 /* and one for when nodes are banned */
1805 ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
1807 /* and one for when nodes are unbanned */
1808 ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);
/* --- top of per-iteration loop: fresh temp context each pass --- */
1812 talloc_free(mem_ctx);
1815 mem_ctx = talloc_new(ctdb);
1817 DEBUG(0,("Failed to create temporary context\n"));
1821 /* we only check for recovery once every second */
1822 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
1824 if (rec->election_timeout) {
1825 /* an election is in progress */
1830 /* We must check if we need to ban a node here but we want to do this
1831 as early as possible so we dont wait until we have pulled the node
1832 map from the local node. thats why we have the hardcoded value 20
1834 if (rec->culprit_counter > 20) {
1835 DEBUG(0,("Node %u has caused %u failures in %.0f seconds - banning it for %u seconds\n",
1836 rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
1837 ctdb->tunable.recovery_ban_period));
1838 ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
1841 /* get relevant tunables */
1842 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
1844 DEBUG(0,("Failed to get tunables - retrying\n"));
1848 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
1849 if (pnn == (uint32_t)-1) {
1850 DEBUG(0,("Failed to get local pnn - retrying\n"));
1854 /* get the vnnmap */
1855 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
1857 DEBUG(0, (__location__ " Unable to get vnnmap from node %u\n", pnn));
1862 /* get number of nodes */
1863 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &nodemap);
1865 DEBUG(0, (__location__ " Unable to get nodemap from node %u\n", pnn));
1869 /* check which node is the recovery master */
1870 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &recmaster);
1872 DEBUG(0, (__location__ " Unable to get recmaster from node %u\n", pnn));
/* (uint32_t)-1 means no recmaster has ever been chosen */
1876 if (recmaster == (uint32_t)-1) {
1877 DEBUG(0,(__location__ " Initial recovery master set - forcing election\n"));
1878 force_election(rec, mem_ctx, pnn, nodemap);
1882 /* check that we (recovery daemon) and the local ctdb daemon
1883 agrees on whether we are banned or not
/* daemon says banned, we have no ban record: recmaster unbans,
   non-recmaster re-bans to converge on the recmaster's view */
1885 if (nodemap->nodes[pnn].flags & NODE_FLAGS_BANNED) {
1886 if (rec->banned_nodes[pnn] == NULL) {
1887 if (recmaster == pnn) {
1888 DEBUG(0,("Local ctdb daemon on recmaster thinks this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
1890 ctdb_unban_node(rec, pnn);
1892 DEBUG(0,("Local ctdb daemon on non-recmaster thinks this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
1893 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1894 ctdb_set_culprit(rec, pnn);
/* mirror case: we have a ban record but the daemon flag is clear */
1899 if (rec->banned_nodes[pnn] != NULL) {
1900 if (recmaster == pnn) {
1901 DEBUG(0,("Local ctdb daemon on recmaster does not think this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
1903 ctdb_unban_node(rec, pnn);
1905 DEBUG(0,("Local ctdb daemon on non-recmaster does not think this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
1907 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1908 ctdb_set_culprit(rec, pnn);
1914 /* remember our own node flags */
1915 rec->node_flags = nodemap->nodes[pnn].flags;
1917 /* count how many active nodes there are */
1919 for (i=0; i<nodemap->num; i++) {
1920 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
1926 /* verify that the recmaster node is still active */
1927 for (j=0; j<nodemap->num; j++) {
1928 if (nodemap->nodes[j].pnn==recmaster) {
1933 if (j == nodemap->num) {
1934 DEBUG(0, ("Recmaster node %u not in list. Force reelection\n", recmaster));
1935 force_election(rec, mem_ctx, pnn, nodemap);
1939 /* if recovery master is disconnected we must elect a new recmaster */
1940 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1941 DEBUG(0, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
1942 force_election(rec, mem_ctx, pnn, nodemap);
1946 /* grap the nodemap from the recovery master to check if it is banned */
1947 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1948 mem_ctx, &remote_nodemap);
1950 DEBUG(0, (__location__ " Unable to get nodemap from recovery master %u\n",
1951 nodemap->nodes[j].pnn));
1956 if (remote_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1957 DEBUG(0, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
1958 force_election(rec, mem_ctx, pnn, nodemap);
1962 /* verify that the public ip address allocation is consistent */
1963 if (ctdb->vnn != NULL) {
1964 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
1966 DEBUG(0, ("Unable to get public ips from node %u\n", i));
1969 for (j=0; j<ips->num; j++) {
1970 /* verify that we have the ip addresses we should have
1971 and we dont have ones we shouldnt have.
1972 if we find an inconsistency we set recmode to
1973 active on the local node and wait for the recmaster
1974 to do a full blown recovery
1976 if (ips->ips[j].pnn == pnn) {
/* IP assigned to us but missing from our interfaces: freeze and
   go to ACTIVE so the recmaster runs a full recovery */
1977 if (!ctdb_sys_have_ip(ips->ips[j].sin)) {
1978 DEBUG(0,("Public address '%s' is missing and we should serve this ip\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
1979 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
1981 DEBUG(0,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
1984 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
1986 DEBUG(0,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
/* converse: we hold an IP assigned to someone else */
1991 if (ctdb_sys_have_ip(ips->ips[j].sin)) {
1992 DEBUG(0,("We are still serving a public address '%s' that we should not be serving.\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
1993 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
1995 DEBUG(0,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
1998 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2000 DEBUG(0,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2008 /* if we are not the recmaster then we do not need to check
2009 if recovery is needed
/* --- everything below this point runs on the recmaster only --- */
2011 if (pnn != recmaster) {
2016 /* ensure our local copies of flags are right */
2017 ret = update_local_flags(rec, nodemap);
2018 if (ret == MONITOR_ELECTION_NEEDED) {
2019 DEBUG(0,("update_local_flags() called for a re-election.\n"));
2020 force_election(rec, mem_ctx, pnn, nodemap);
2023 if (ret != MONITOR_OK) {
2024 DEBUG(0,("Unable to update local flags\n"));
2028 /* update the list of public ips that a node can handle for
2031 for (j=0; j<nodemap->num; j++) {
2032 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2035 /* release any existing data */
2036 if (ctdb->nodes[j]->public_ips) {
2037 talloc_free(ctdb->nodes[j]->public_ips);
2038 ctdb->nodes[j]->public_ips = NULL;
2040 /* grab a new shiny list of public ips from the node */
2041 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2042 ctdb->nodes[j]->pnn,
2044 &ctdb->nodes[j]->public_ips)) {
2045 DEBUG(0,("Failed to read public ips from node : %u\n",
2046 ctdb->nodes[j]->pnn));
2052 /* verify that all active nodes agree that we are the recmaster */
2053 switch (verify_recmaster(ctdb, nodemap, pnn)) {
2054 case MONITOR_RECOVERY_NEEDED:
2055 /* can not happen */
2057 case MONITOR_ELECTION_NEEDED:
2058 force_election(rec, mem_ctx, pnn, nodemap);
2062 case MONITOR_FAILED:
2067 if (rec->need_recovery) {
2068 /* a previous recovery didn't finish */
2069 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
2073 /* verify that all active nodes are in normal mode
2074 and not in recovery mode
2076 switch (verify_recmode(ctdb, nodemap)) {
2077 case MONITOR_RECOVERY_NEEDED:
2078 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
2080 case MONITOR_FAILED:
2082 case MONITOR_ELECTION_NEEDED:
2083 /* can not happen */
2089 /* we should have the reclock - check its not stale */
2090 if (ctdb->recovery_lock_fd == -1) {
2091 DEBUG(0,("recovery master doesn't have the recovery lock\n"));
2092 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
/* a failing read means the shared reclock file went stale/away */
2096 if (read(ctdb->recovery_lock_fd, &c, 1) == -1) {
2097 DEBUG(0,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2098 close(ctdb->recovery_lock_fd);
2099 ctdb->recovery_lock_fd = -1;
2100 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
2104 /* get the nodemap for all active remote nodes and verify
2105 they are the same as for this node
2107 for (j=0; j<nodemap->num; j++) {
2108 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2111 if (nodemap->nodes[j].pnn == pnn) {
2115 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2116 mem_ctx, &remote_nodemap);
2118 DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n",
2119 nodemap->nodes[j].pnn));
2123 /* if the nodes disagree on how many nodes there are
2124 then this is a good reason to try recovery
2126 if (remote_nodemap->num != nodemap->num) {
2127 DEBUG(0, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2128 nodemap->nodes[j].pnn, remote_nodemap->num, nodemap->num));
2129 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
2133 /* if the nodes disagree on which nodes exist and are
2134 active, then that is also a good reason to do recovery
2136 for (i=0;i<nodemap->num;i++) {
2137 if (remote_nodemap->nodes[i].pnn != nodemap->nodes[i].pnn) {
2138 DEBUG(0, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2139 nodemap->nodes[j].pnn, i,
2140 remote_nodemap->nodes[i].pnn, nodemap->nodes[i].pnn));
2141 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
2142 vnnmap, nodemap->nodes[j].pnn);
2145 if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) !=
2146 (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2147 DEBUG(0, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n",
2148 nodemap->nodes[j].pnn, i,
2149 remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags));
2150 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
2151 vnnmap, nodemap->nodes[j].pnn);
2159 /* there better be the same number of lmasters in the vnn map
2160 as there are active nodes or we will have to do a recovery
2162 if (vnnmap->size != num_active) {
2163 DEBUG(0, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
2164 vnnmap->size, num_active));
2165 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, ctdb->pnn);
2169 /* verify that all active nodes in the nodemap also exist in
2172 for (j=0; j<nodemap->num; j++) {
2173 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2176 if (nodemap->nodes[j].pnn == pnn) {
2180 for (i=0; i<vnnmap->size; i++) {
2181 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2185 if (i == vnnmap->size) {
2186 DEBUG(0, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2187 nodemap->nodes[j].pnn));
2188 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
2194 /* verify that all other nodes have the same vnnmap
2195 and are from the same generation
2197 for (j=0; j<nodemap->num; j++) {
2198 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2201 if (nodemap->nodes[j].pnn == pnn) {
2205 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2206 mem_ctx, &remote_vnnmap);
2208 DEBUG(0, (__location__ " Unable to get vnnmap from remote node %u\n",
2209 nodemap->nodes[j].pnn));
2213 /* verify the vnnmap generation is the same */
2214 if (vnnmap->generation != remote_vnnmap->generation) {
2215 DEBUG(0, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2216 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2217 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
2221 /* verify the vnnmap size is the same */
2222 if (vnnmap->size != remote_vnnmap->size) {
2223 DEBUG(0, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2224 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2225 do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
2229 /* verify the vnnmap is the same */
2230 for (i=0;i<vnnmap->size;i++) {
2231 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2232 DEBUG(0, (__location__ " Remote node %u has different vnnmap.\n",
2233 nodemap->nodes[j].pnn));
2234 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
2235 vnnmap, nodemap->nodes[j].pnn);
2241 /* we might need to change who has what IP assigned */
2242 if (rec->need_takeover_run) {
2243 rec->need_takeover_run = false;
2244 ret = ctdb_takeover_run(ctdb, nodemap);
2246 DEBUG(0, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
2247 do_recovery(rec, mem_ctx, pnn, num_active, nodemap,
2257 event handler for when the main ctdbd dies
/*
  fd event callback on the pipe shared with the parent ctdbd: the fd
  becomes readable (EOF) when the parent exits, so the recovery daemon
  logs and terminates (exit call is in a line elided from this
  extraction - confirm upstream).
*/
2259 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
2260 uint16_t flags, void *private_data)
2262 DEBUG(0,("recovery daemon parent died - exiting\n"));
2267 startup the recovery daemon as a child of the main ctdb daemon
/*
  Fork the recovery daemon.  Parent: records the child pid and returns
  (return lines elided).  Child: drops the inherited transport and
  daemon socket, builds a fresh event context, watches the pipe so it
  dies with the parent, reseeds random, connects back to ctdbd as a
  client, then runs monitor_cluster() forever - reaching the final
  DEBUG means monitor_cluster returned unexpectedly.
  Returns 0 on success, -1 on pipe/fork failure (presumed from the
  visible guards - confirm upstream).
*/
2269 int ctdb_start_recoverd(struct ctdb_context *ctdb)
2274 if (pipe(fd) != 0) {
2278 ctdb->recoverd_pid = fork();
2279 if (ctdb->recoverd_pid == -1) {
/* parent process: child pid recorded, nothing more to do here */
2283 if (ctdb->recoverd_pid != 0) {
2290 /* shutdown the transport */
2291 ctdb->methods->shutdown(ctdb);
2293 /* get a new event context */
2294 talloc_free(ctdb->ev);
2295 ctdb->ev = event_context_init(ctdb);
/* exit when the parent closes its end of the pipe */
2297 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2298 ctdb_recoverd_parent, &fd[0]);
2300 close(ctdb->daemon.sd);
2301 ctdb->daemon.sd = -1;
2303 srandom(getpid() ^ time(NULL));
2305 /* initialise ctdb */
2306 ret = ctdb_socket_connect(ctdb);
2308 DEBUG(0, (__location__ " Failed to init ctdb\n"));
2312 monitor_cluster(ctdb);
2314 DEBUG(0,("ERROR: ctdb_recoverd finished!?\n"));
2319 shutdown the recovery daemon
/*
  Terminate the recovery daemon child with SIGTERM.
  No-op if the daemon was never started (recoverd_pid == 0).
*/
2321 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
2323 if (ctdb->recoverd_pid == 0) {
2327 DEBUG(0,("Shutting down recovery daemon\n"));
2328 kill(ctdb->recoverd_pid, SIGTERM);