4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
35 struct ctdb_recoverd *rec;
40 private state of recovery daemon
/*
  Private state of the recovery daemon (one instance per recoverd).
  NOTE(review): this extract is missing lines — the closing brace and
  possibly further members of this struct are not visible here.
 */
42 struct ctdb_recoverd {
43 struct ctdb_context *ctdb;
46 uint32_t num_connected;
47 struct ctdb_node_map *nodemap;
48 uint32_t last_culprit;          /* last node blamed for forcing a recovery */
49 uint32_t culprit_counter;       /* how often last_culprit has been blamed recently */
50 struct timeval first_recover_time;
51 struct ban_state **banned_nodes; /* per-pnn ban state; NULL entry == not banned */
52 struct timeval priority_time;    /* used to order nodes in elections */
53 bool need_takeover_run;
56 struct timed_event *send_election_te;
57 struct timed_event *election_timeout; /* non-NULL while an election is in progress */
58 struct vacuum_info *vacuum_info;      /* list of in-flight vacuum fetch operations */
61 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
62 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/*
  Unban a node.  If pnn is a remote node, the unban request is forwarded
  to that node via CTDB_SRVID_UNBAN_NODE; otherwise we clear our own
  BANNED flag and drop any recorded ban state for ourselves.
  NOTE(review): this extract is missing lines (returns/closing braces),
  so the exact control flow cannot be fully confirmed here.
 */
68 static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn)
70 struct ctdb_context *ctdb = rec->ctdb;
72 DEBUG(DEBUG_NOTICE,("Unbanning node %u\n", pnn));
74 if (!ctdb_validate_pnn(ctdb, pnn)) {
75 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_unban_node\n", pnn));
79 /* If we are unbanning a different node then just pass the ban info on */
80 if (pnn != ctdb->pnn) {
/* NOTE(review): the DEBUG string below says "Unanning" — apparent typo
   for "Unbanning" in the original source (runtime string, left as-is) */
84 DEBUG(DEBUG_NOTICE,("Unanning remote node %u. Passing the ban request on to the remote node.\n", pnn));
86 data.dptr = (uint8_t *)&pnn;
87 data.dsize = sizeof(uint32_t);
89 ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_UNBAN_NODE, data);
91 DEBUG(DEBUG_ERR,("Failed to unban node %u\n", pnn));
98 /* make sure we remember we are no longer banned in case
99 there is an election */
100 rec->node_flags &= ~NODE_FLAGS_BANNED;
102 DEBUG(DEBUG_INFO,("Clearing ban flag on node %u\n", pnn));
103 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, 0, NODE_FLAGS_BANNED);
105 if (rec->banned_nodes[pnn] == NULL) {
106 DEBUG(DEBUG_INFO,("No ban recorded for this node. ctdb_unban_node() request ignored\n"));
/* freeing the ban state also cancels its pending timeout event
   (the timed event is a talloc child of the ban state) */
110 talloc_free(rec->banned_nodes[pnn]);
111 rec->banned_nodes[pnn] = NULL;
116 called when a ban has timed out
/*
  Timed-event callback fired when a ban has expired: unban the node.
 */
118 static void ctdb_ban_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
120 struct ban_state *state = talloc_get_type(p, struct ban_state);
121 struct ctdb_recoverd *rec = state->rec;
122 uint32_t pnn = state->banned_node;
124 DEBUG(DEBUG_NOTICE,("Ban timeout. Node %u is now unbanned\n", pnn));
125 ctdb_unban_node(rec, pnn);
129 ban a node for a period of time
/*
  Ban a node for ban_time seconds.  Remote bans are forwarded via
  CTDB_SRVID_BAN_NODE; a self-ban sets our BANNED flag, lowers our
  election priority and schedules a timed event to lift the ban.
  NOTE(review): this extract is missing lines (returns/closing braces),
  so the exact control flow cannot be fully confirmed here.
 */
131 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
133 struct ctdb_context *ctdb = rec->ctdb;
135 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
137 if (!ctdb_validate_pnn(ctdb, pnn)) {
138 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* bans can be disabled entirely through the EnableBans tunable */
142 if (0 == ctdb->tunable.enable_bans) {
143 DEBUG(DEBUG_INFO,("Bans are disabled - ignoring ban of node %u\n", pnn));
147 /* If we are banning a different node then just pass the ban info on */
148 if (pnn != ctdb->pnn) {
149 struct ctdb_ban_info b;
153 DEBUG(DEBUG_NOTICE,("Banning remote node %u for %u seconds. Passing the ban request on to the remote node.\n", pnn, ban_time));
156 b.ban_time = ban_time;
158 data.dptr = (uint8_t *)&b;
159 data.dsize = sizeof(b);
161 ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_BAN_NODE, data);
163 DEBUG(DEBUG_ERR,("Failed to ban node %u\n", pnn));
170 DEBUG(DEBUG_NOTICE,("self ban - lowering our election priority\n"));
171 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0);
173 /* banning ourselves - lower our election priority */
174 rec->priority_time = timeval_current();
176 /* make sure we remember we are banned in case there is an
178 rec->node_flags |= NODE_FLAGS_BANNED;
/* a re-ban replaces any existing ban state (and its pending timeout) */
180 if (rec->banned_nodes[pnn] != NULL) {
181 DEBUG(DEBUG_NOTICE,("Re-banning an already banned node. Remove previous ban and set a new ban.\n"));
182 talloc_free(rec->banned_nodes[pnn]);
183 rec->banned_nodes[pnn] = NULL;
186 rec->banned_nodes[pnn] = talloc(rec->banned_nodes, struct ban_state);
187 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]);
189 rec->banned_nodes[pnn]->rec = rec;
190 rec->banned_nodes[pnn]->banned_node = pnn;
/* the timed event is a child of the ban state, so freeing the ban
   state cancels the timeout */
193 event_add_timed(ctdb->ev, rec->banned_nodes[pnn],
194 timeval_current_ofs(ban_time, 0),
195 ctdb_ban_timeout, rec->banned_nodes[pnn]);
199 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
203 run the "recovered" eventscript on all nodes
/*
  Run the "recovered" eventscript on all active nodes by broadcasting
  the END_RECOVERY control.  'caller' is only used for error logging.
  Returns 0 on success (return statements are not visible in this
  extract).
 */
205 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
210 tmp_ctx = talloc_new(ctdb);
211 CTDB_NO_MEMORY(ctdb, tmp_ctx);
213 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
214 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
216 CONTROL_TIMEOUT(), false, tdb_null,
219 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
221 talloc_free(tmp_ctx);
225 talloc_free(tmp_ctx);
230 remember the trouble maker
/*
  Record 'culprit' as the node responsible for the current recovery.
  The blame counter resets when a different node becomes the culprit or
  when the RecoveryGracePeriod has elapsed since the first incident;
  otherwise each call increments culprit_counter by one.
 */
232 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
234 struct ctdb_context *ctdb = rec->ctdb;
236 if (rec->last_culprit != culprit ||
237 timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
238 DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
239 /* either a new node is the culprit, or we've decided to forgive them */
240 rec->last_culprit = culprit;
241 rec->first_recover_time = timeval_current();
242 rec->culprit_counter = 0;
244 rec->culprit_counter++;
248 remember the trouble maker
/*
  Same as ctdb_set_culprit() but adds 'count' to the blame counter in
  one step (used when a single failure implicates many operations).
 */
250 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
252 struct ctdb_context *ctdb = rec->ctdb;
254 if (rec->last_culprit != culprit ||
255 timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
256 DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
257 /* either a new node is the culprit, or we've decided to forgive them */
258 rec->last_culprit = culprit;
259 rec->first_recover_time = timeval_current();
260 rec->culprit_counter = 0;
262 rec->culprit_counter += count;
265 /* this callback is called for every node that failed to execute the
/* this callback is called for every node that failed to execute the
   "startrecovery" event — blame that node for the recovery failure */
268 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
270 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
272 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
274 ctdb_set_culprit(rec, node_pnn);
278 run the "startrecovery" eventscript on all nodes
/*
  Run the "startrecovery" eventscript on all active nodes by
  broadcasting the START_RECOVERY control; nodes that fail are blamed
  via startrecovery_fail_callback.  Returns 0 on success (return
  statements are not visible in this extract).
 */
280 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
284 struct ctdb_context *ctdb = rec->ctdb;
286 tmp_ctx = talloc_new(ctdb);
287 CTDB_NO_MEMORY(ctdb, tmp_ctx);
289 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
290 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
292 CONTROL_TIMEOUT(), false, tdb_null,
294 startrecovery_fail_callback,
296 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
297 talloc_free(tmp_ctx);
301 talloc_free(tmp_ctx);
/*
  Per-node callback for GET_CAPABILITIES: validate the reply payload
  (one uint32_t) and store the node's capability bits locally.
  (The "lenght" typo below is in a runtime log string — left as-is.)
 */
305 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
307 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
308 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
/* bounds-check the pnn before indexing the nodes array */
311 if (node_pnn < ctdb->num_nodes) {
312 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
317 update the node capabilities for all connected nodes
/*
  Refresh the cached capability bits for all active nodes by
  broadcasting GET_CAPABILITIES; replies are handled by
  async_getcap_callback.  Returns 0 on success (return statements are
  not visible in this extract).
 */
319 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
324 tmp_ctx = talloc_new(ctdb);
325 CTDB_NO_MEMORY(ctdb, tmp_ctx);
327 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
328 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
329 nodes, CONTROL_TIMEOUT(),
331 async_getcap_callback, NULL,
333 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
334 talloc_free(tmp_ctx);
338 talloc_free(tmp_ctx);
343 change recovery mode on all nodes
/*
  Set the recovery mode on all active nodes.  When entering
  CTDB_RECOVERY_ACTIVE the nodes are first frozen (FREEZE control),
  then SET_RECMODE is broadcast with the requested mode.  Returns 0 on
  success (return statements are not visible in this extract).
 */
345 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
351 tmp_ctx = talloc_new(ctdb);
352 CTDB_NO_MEMORY(ctdb, tmp_ctx);
354 /* freeze all nodes */
355 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
356 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
357 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
358 nodes, CONTROL_TIMEOUT(),
362 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
363 talloc_free(tmp_ctx);
/* rec_mode is passed by address as the control payload */
369 data.dsize = sizeof(uint32_t);
370 data.dptr = (unsigned char *)&rec_mode;
372 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
373 nodes, CONTROL_TIMEOUT(),
377 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
378 talloc_free(tmp_ctx);
382 talloc_free(tmp_ctx);
387 change recovery master on all node
/*
  Tell all active nodes that 'pnn' is the recovery master by
  broadcasting SET_RECMASTER.  Returns 0 on success (return statements
  are not visible in this extract).
 */
389 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
395 tmp_ctx = talloc_new(ctdb);
396 CTDB_NO_MEMORY(ctdb, tmp_ctx);
398 data.dsize = sizeof(uint32_t);
399 data.dptr = (unsigned char *)&pnn;
401 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
402 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
404 CONTROL_TIMEOUT(), false, data,
407 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
408 talloc_free(tmp_ctx);
412 talloc_free(tmp_ctx);
418 ensure all other nodes have attached to any databases that we have
/*
  Ensure every other active node is attached to every database we
  (node 'pnn') have: fetch each remote node's dbmap and create any of
  our databases it is missing.  Returns 0 on success (return statements
  are not visible in this extract).
 */
420 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
421 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
424 struct ctdb_dbid_map *remote_dbmap;
426 /* verify that all other nodes have all our databases */
427 for (j=0; j<nodemap->num; j++) {
428 /* we don't need to check ourselves */
429 if (nodemap->nodes[j].pnn == pnn) {
432 /* dont check nodes that are unavailable */
433 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
437 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
438 mem_ctx, &remote_dbmap);
440 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
444 /* step through all local databases */
445 for (db=0; db<dbmap->num;db++) {
/* linear scan; i == remote_dbmap->num afterwards means "not found" */
449 for (i=0;i<remote_dbmap->num;i++) {
450 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
454 /* the remote node already have this database */
455 if (i!=remote_dbmap->num) {
458 /* ok so we need to create this database */
459 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
462 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
465 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
466 mem_ctx, name, dbmap->dbs[db].persistent);
468 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
479 ensure we are attached to any databases that anyone else is attached to
/*
  Ensure we (node 'pnn') are attached to every database any other
  active node has, creating missing ones locally.  The local dbmap is
  re-read afterwards, which is why 'dbmap' is passed by reference.
  Returns 0 on success (return statements are not visible in this
  extract).
 */
481 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
482 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
485 struct ctdb_dbid_map *remote_dbmap;
487 /* verify that we have all database any other node has */
488 for (j=0; j<nodemap->num; j++) {
489 /* we don't need to check ourselves */
490 if (nodemap->nodes[j].pnn == pnn) {
493 /* dont check nodes that are unavailable */
494 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
498 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
499 mem_ctx, &remote_dbmap);
501 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
505 /* step through all databases on the remote node */
506 for (db=0; db<remote_dbmap->num;db++) {
/* linear scan; i == (*dbmap)->num afterwards means "not found" */
509 for (i=0;i<(*dbmap)->num;i++) {
510 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
514 /* we already have this db locally */
515 if (i!=(*dbmap)->num) {
518 /* ok so we need to create this database and
521 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
522 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
524 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
525 nodemap->nodes[j].pnn));
528 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
529 remote_dbmap->dbs[db].persistent);
531 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our view of the local dbmap after creating the db */
534 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
536 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
547 pull the remote database contents from one node into the recdb
/*
  Pull all records of database 'dbid' from node 'srcnode' and merge
  them into the temporary recovery tdb.  A pulled record replaces an
  existing one when its RSN is higher, or when the RSNs are equal and
  the existing record's dmaster is not the recovery master (RSN-based
  merge).  Returns 0 on success (return statements are not visible in
  this extract).
 */
549 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
550 struct tdb_wrap *recdb, uint32_t dbid)
554 struct ctdb_marshall_buffer *reply;
555 struct ctdb_rec_data *rec;
557 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
559 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
560 CONTROL_TIMEOUT(), &outdata);
562 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
563 talloc_free(tmp_ctx);
567 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the reply is at least large enough for the header */
569 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
570 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
571 talloc_free(tmp_ctx);
/* walk the marshalled records; each record is advanced by rec->length bytes */
575 rec = (struct ctdb_rec_data *)&reply->data[0];
579 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
581 struct ctdb_ltdb_header *hdr;
/* key bytes are followed immediately by the data bytes */
584 key.dptr = &rec->data[0];
585 key.dsize = rec->keylen;
586 data.dptr = &rec->data[key.dsize];
587 data.dsize = rec->datalen;
589 hdr = (struct ctdb_ltdb_header *)data.dptr;
591 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
592 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
593 talloc_free(tmp_ctx);
597 /* fetch the existing record, if any */
598 existing = tdb_fetch(recdb->tdb, key);
600 if (existing.dptr != NULL) {
601 struct ctdb_ltdb_header header;
602 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
603 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
604 (unsigned)existing.dsize, srcnode));
606 talloc_free(tmp_ctx);
609 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing record unless the pulled one wins the RSN merge */
611 if (!(header.rsn < hdr->rsn ||
612 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
617 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
618 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
619 talloc_free(tmp_ctx);
624 talloc_free(tmp_ctx);
630 pull all the remote database contents into the recdb
/*
  Pull database 'dbid' from every active node into the temporary
  recovery tdb (merging by RSN via pull_one_remote_database).  A node
  that fails the pull is blamed for the whole node count.  Returns 0 on
  success (return statements are not visible in this extract).
 */
632 static int pull_remote_database(struct ctdb_context *ctdb,
633 struct ctdb_recoverd *rec,
634 struct ctdb_node_map *nodemap,
635 struct tdb_wrap *recdb, uint32_t dbid)
639 /* pull all records from all other nodes across onto this node
640 (this merges based on rsn)
642 for (j=0; j<nodemap->num; j++) {
643 /* dont merge from nodes that are unavailable */
644 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
647 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
648 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
649 nodemap->nodes[j].pnn));
650 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
660 update flags on all active nodes
/*
  Push node 'pnn's flags out so all nodes agree on them
  (set 'flags', clear everything in ~flags).
 */
662 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
666 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
668 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
676 ensure all nodes have the same vnnmap we do
/*
  Push our vnnmap out to every active node so the whole cluster shares
  the same record-ownership map.  Returns 0 on success (return
  statements are not visible in this extract).
 */
678 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
679 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
683 /* push the new vnn map out to all the nodes */
684 for (j=0; j<nodemap->num; j++) {
685 /* dont push to nodes that are unavailable */
686 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
690 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
/* NOTE(review): the log below reports 'pnn', not the failing node's pnn */
692 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
702 handler for when the admin bans a node
/*
  Message handler for CTDB_SRVID_BAN_NODE: validate the payload
  (a struct ctdb_ban_info) and that the request targets this node,
  then apply the ban locally.
 */
704 static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
705 TDB_DATA data, void *private_data)
707 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
708 struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
709 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
711 if (data.dsize != sizeof(*b)) {
712 DEBUG(DEBUG_ERR,("Bad data in ban_handler\n"));
713 talloc_free(mem_ctx);
/* a ban request for a different node should never reach us */
717 if (b->pnn != ctdb->pnn) {
718 DEBUG(DEBUG_ERR,("Got a ban request for pnn:%u but our pnn is %u. Ignoring ban request\n", b->pnn, ctdb->pnn));
722 DEBUG(DEBUG_NOTICE,("Node %u has been banned for %u seconds\n",
723 b->pnn, b->ban_time));
725 ctdb_ban_node(rec, b->pnn, b->ban_time);
726 talloc_free(mem_ctx);
730 handler for when the admin unbans a node
/*
  Message handler for CTDB_SRVID_UNBAN_NODE: validate the payload
  (a uint32_t pnn) and that the request targets this node, then lift
  the ban locally.
 */
732 static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
733 TDB_DATA data, void *private_data)
735 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
736 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
739 if (data.dsize != sizeof(uint32_t)) {
740 DEBUG(DEBUG_ERR,("Bad data in unban_handler\n"));
741 talloc_free(mem_ctx);
744 pnn = *(uint32_t *)data.dptr;
/* an unban request for a different node should never reach us */
746 if (pnn != ctdb->pnn) {
747 DEBUG(DEBUG_ERR,("Got an unban request for pnn:%u but our pnn is %u. Ignoring unban request\n", pnn, ctdb->pnn));
751 DEBUG(DEBUG_NOTICE,("Node %u has been unbanned.\n", pnn));
752 ctdb_unban_node(rec, pnn);
753 talloc_free(mem_ctx);
/* NOTE(review): the 'struct vacuum_info {' opening line is missing from
   this extract — these are the members of the per-source-node vacuum
   fetch state, kept in a doubly linked list on rec->vacuum_info. */
758 struct vacuum_info *next, *prev;
759 struct ctdb_recoverd *rec;
761 struct ctdb_db_context *ctdb_db;
762 struct ctdb_marshall_buffer *recs;   /* records still to be fetched */
763 struct ctdb_rec_data *r;             /* cursor into recs */
766 static void vacuum_fetch_next(struct vacuum_info *v);
769 called when a vacuum fetch has completed - just free it and do the next one
/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
771 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
773 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
775 vacuum_fetch_next(v);
780 process the next element from the vacuum list
/*
  Process the next record on the vacuum list: skip records we cannot
  lock, cannot read, or that are already local, and issue a non-blocking
  CTDB_NULL_FUNC migration call for the first record that needs
  fetching; vacuum_fetch_callback then re-enters this function for the
  next record.  NOTE(review): lines are missing from this extract
  (e.g. the loop's continue/return paths), so the exact flow cannot be
  fully confirmed here.
 */
782 static void vacuum_fetch_next(struct vacuum_info *v)
784 struct ctdb_call call;
785 struct ctdb_rec_data *r;
787 while (v->recs->count) {
788 struct ctdb_client_call_state *state;
790 struct ctdb_ltdb_header *hdr;
793 call.call_id = CTDB_NULL_FUNC;
794 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance the cursor past the current marshalled record */
797 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
800 call.key.dptr = &r->data[0];
801 call.key.dsize = r->keylen;
803 /* ensure we don't block this daemon - just skip a record if we can't get
805 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
809 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
810 if (data.dptr == NULL) {
811 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
815 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
817 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
821 hdr = (struct ctdb_ltdb_header *)data.dptr;
822 if (hdr->dmaster == v->rec->ctdb->pnn) {
823 /* its already local */
825 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
831 state = ctdb_call_send(v->ctdb_db, &call);
832 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
834 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
/* continue with the next record once this async call completes */
838 state->async.fn = vacuum_fetch_callback;
839 state->async.private_data = v;
848 destroy a vacuum info structure
/*
  talloc destructor for a vacuum_info: unlink it from the recovery
  daemon's vacuum list when it is freed.
 */
850 static int vacuum_info_destructor(struct vacuum_info *v)
852 DLIST_REMOVE(v->rec->vacuum_info, v);
858 handler for vacuum fetch
/*
  Message handler for vacuum fetch requests: a remote node sends a
  marshalled buffer of records it wants us (the dmaster/lmaster) to
  fetch.  We ignore duplicate requests already in flight for the same
  (srcnode, db) pair, look up the database (working out whether it is
  persistent), attach to it, copy the record buffer into a new
  vacuum_info and start processing it via vacuum_fetch_next().
  NOTE(review): some lines (e.g. where 'srcnode' is read from the
  buffer) are missing from this extract.
 */
860 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
861 TDB_DATA data, void *private_data)
863 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
864 struct ctdb_marshall_buffer *recs;
866 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
868 struct ctdb_dbid_map *dbmap=NULL;
869 bool persistent = false;
870 struct ctdb_db_context *ctdb_db;
871 struct ctdb_rec_data *r;
873 struct vacuum_info *v;
875 recs = (struct ctdb_marshall_buffer *)data.dptr;
876 r = (struct ctdb_rec_data *)&recs->data[0];
878 if (recs->count == 0) {
879 talloc_free(tmp_ctx);
/* drop the request if we are already vacuuming this db for this node */
885 for (v=rec->vacuum_info;v;v=v->next) {
886 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
887 /* we're already working on records from this node */
888 talloc_free(tmp_ctx);
893 /* work out if the database is persistent */
894 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
896 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
897 talloc_free(tmp_ctx);
901 for (i=0;i<dbmap->num;i++) {
902 if (dbmap->dbs[i].dbid == recs->db_id) {
903 persistent = dbmap->dbs[i].persistent;
907 if (i == dbmap->num) {
908 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
909 talloc_free(tmp_ctx);
913 /* find the name of this database */
914 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
915 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
916 talloc_free(tmp_ctx);
/* attach to the database (no-op if we are already attached) */
921 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
922 if (ctdb_db == NULL) {
923 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
924 talloc_free(tmp_ctx);
928 v = talloc_zero(rec, struct vacuum_info);
930 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
931 talloc_free(tmp_ctx);
936 v->srcnode = srcnode;
937 v->ctdb_db = ctdb_db;
/* take a private copy of the record buffer; the message data is
   owned by the caller */
938 v->recs = talloc_memdup(v, recs, data.dsize);
939 if (v->recs == NULL) {
940 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
942 talloc_free(tmp_ctx);
945 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
947 DLIST_ADD(rec->vacuum_info, v);
949 talloc_set_destructor(v, vacuum_info_destructor);
951 vacuum_fetch_next(v);
952 talloc_free(tmp_ctx);
957 called when ctdb_wait_timeout should finish
/*
  Timed-event callback for ctdb_wait_timeout(): flags completion via
  the timed_out pointer (the assignment line is not visible in this
  extract).
 */
959 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
960 struct timeval yt, void *p)
962 uint32_t *timed_out = (uint32_t *)p;
967 wait for a given number of seconds
/*
  Sleep for 'secs' seconds while still running the event loop, so
  timers and messages keep being processed during the wait.
 */
969 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
971 uint32_t timed_out = 0;
972 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
974 event_loop_once(ctdb->ev);
979 called when an election times out (ends)
/*
  Timed-event callback fired when an election ends: clearing
  election_timeout lets ctdb_wait_election() return.
 */
981 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
982 struct timeval t, void *p)
984 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
985 rec->election_timeout = NULL;
990 wait for an election to finish. It finished election_timeout seconds after
991 the last election packet is received
/*
  Block (while still pumping the event loop) until the current election
  finishes, i.e. until ctdb_election_timeout() clears
  rec->election_timeout.
 */
993 static void ctdb_wait_election(struct ctdb_recoverd *rec)
995 struct ctdb_context *ctdb = rec->ctdb;
996 while (rec->election_timeout) {
997 event_loop_once(ctdb->ev);
1002 Update our local flags from all remote connected nodes.
1003 This is only run when we are, or we believe we are, the recovery master.
/*
  Compare our view of each connected node's flags with that node's own
  view.  Mismatches are pushed out cluster-wide and copied into our
  local nodemap; if the BANNED bit differs, a new election is requested
  (MONITOR_ELECTION_NEEDED).  Returns a monitor_result value.
 */
1005 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1008 struct ctdb_context *ctdb = rec->ctdb;
1009 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1011 /* get the nodemap for all active remote nodes and verify
1012 they are the same as for this node
1014 for (j=0; j<nodemap->num; j++) {
1015 struct ctdb_node_map *remote_nodemap=NULL;
/* skip disconnected nodes and ourselves */
1018 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1021 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1025 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1026 mem_ctx, &remote_nodemap);
1028 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1029 nodemap->nodes[j].pnn));
1030 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1031 talloc_free(mem_ctx);
1032 return MONITOR_FAILED;
1034 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1035 int ban_changed = (nodemap->nodes[j].flags ^ remote_nodemap->nodes[j].flags) & NODE_FLAGS_BANNED;
1038 DEBUG(DEBUG_NOTICE,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n",
1039 nodemap->nodes[j].pnn,
1040 remote_nodemap->nodes[j].flags,
1041 nodemap->nodes[j].flags));
1044 /* We should tell our daemon about this so it
1045 updates its flags or else we will log the same
1046 message again in the next iteration of recovery.
1047 Since we are the recovery master we can just as
1048 well update the flags on all nodes.
1050 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
1052 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1056 /* Update our local copy of the flags in the recovery
1059 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1060 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1061 nodemap->nodes[j].flags));
1062 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1064 /* If the BANNED flag has changed for the node
1065 this is a good reason to do a new election.
1068 talloc_free(mem_ctx);
1069 return MONITOR_ELECTION_NEEDED;
1073 talloc_free(remote_nodemap);
1075 talloc_free(mem_ctx);
/* Create a new random generation id.
   The generation id can not be the INVALID_GENERATION id
   (loops until random() produces a valid value; the loop construct is
   not fully visible in this extract). */
1083 static uint32_t new_generation(void)
1085 uint32_t generation;
1088 generation = random();
1090 if (generation != INVALID_GENERATION) {
1100 create a temporary working database
/*
  Create the temporary recovery database (recdb.tdb) used to merge the
  cluster's records during recovery.  Opened with O_EXCL so a stale
  file is an error, and TDB_NOLOCK since only this process uses it.
  Returns the tdb wrapper, or NULL on failure.
 */
1102 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1105 struct tdb_wrap *recdb;
1108 /* open up the temporary recovery database */
1109 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
1115 tdb_flags = TDB_NOLOCK;
/* NOTE(review): mmap is disabled when do_setsched is off — presumably
   a valgrind/debugging aid; confirm against the full source */
1116 if (!ctdb->do_setsched) {
1117 tdb_flags |= TDB_NOMMAP;
1120 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1121 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1122 if (recdb == NULL) {
1123 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1133 a traverse function for pulling all relevant records from recdb
/* NOTE(review): the 'struct recdb_data {' opening line and some members
   (e.g. 'len', 'failed') are missing from this extract — this is the
   traverse state shared between push_recdb_database and traverse_recdb */
1136 struct ctdb_context *ctdb;
1137 struct ctdb_marshall_buffer *recdata;
/*
  tdb_traverse_read callback: marshal one recdb record into the
  growing push buffer.  Empty records (header only) are skipped and
  every record's dmaster is rewritten to point at this node.
  Sets params->failed on allocation failure.
 */
1142 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1144 struct recdb_data *params = (struct recdb_data *)p;
1145 struct ctdb_rec_data *rec;
1146 struct ctdb_ltdb_header *hdr;
1148 /* skip empty records */
1149 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1153 /* update the dmaster field to point to us */
1154 hdr = (struct ctdb_ltdb_header *)data.dptr;
1155 hdr->dmaster = params->ctdb->pnn;
1157 /* add the record to the blob ready to send to the nodes */
1158 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1160 params->failed = true;
/* grow the marshall buffer and append the new record at the end */
1163 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1164 if (params->recdata == NULL) {
1165 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1166 rec->length + params->len, params->recdata->count));
1167 params->failed = true;
1170 params->recdata->count++;
1171 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1172 params->len += rec->length;
1179 push the recdb database out to all nodes
/*
  Marshal the whole temporary recovery database into one buffer (via
  traverse_recdb) and broadcast it to all active nodes with the
  PUSH_DB control.  Returns 0 on success (return statements are not
  visible in this extract).
 */
1181 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1182 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1184 struct recdb_data params;
1185 struct ctdb_marshall_buffer *recdata;
1187 TALLOC_CTX *tmp_ctx;
1190 tmp_ctx = talloc_new(ctdb);
1191 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1193 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1194 CTDB_NO_MEMORY(ctdb, recdata);
1196 recdata->db_id = dbid;
1199 params.recdata = recdata;
/* start past the fixed header; traverse_recdb appends records here */
1200 params.len = offsetof(struct ctdb_marshall_buffer, data);
1201 params.failed = false;
1203 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1204 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1205 talloc_free(params.recdata);
1206 talloc_free(tmp_ctx);
1210 if (params.failed) {
1211 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1212 talloc_free(params.recdata);
1213 talloc_free(tmp_ctx);
/* the traverse may have reallocated the buffer - pick up the new pointer */
1217 recdata = params.recdata;
1219 outdata.dptr = (void *)recdata;
1220 outdata.dsize = params.len;
1222 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1223 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1225 CONTROL_TIMEOUT(), false, outdata,
1228 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1229 talloc_free(recdata);
1230 talloc_free(tmp_ctx);
1234 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1235 dbid, recdata->count));
1237 talloc_free(recdata);
1238 talloc_free(tmp_ctx);
1245 go through a full recovery on one database
/*
  Full recovery of one database: pull every node's copy into the
  temporary recdb (RSN merge), wipe the database on all active nodes
  inside the given transaction, then push the merged contents back out
  (which also rewrites dmaster to this node).  Returns 0 on success
  (return statements are not visible in this extract).
 */
1247 static int recover_database(struct ctdb_recoverd *rec,
1248 TALLOC_CTX *mem_ctx,
1251 struct ctdb_node_map *nodemap,
1252 uint32_t transaction_id)
1254 struct tdb_wrap *recdb;
1256 struct ctdb_context *ctdb = rec->ctdb;
1258 struct ctdb_control_wipe_database w;
1261 recdb = create_recdb(ctdb, mem_ctx);
1262 if (recdb == NULL) {
1266 /* pull all remote databases onto the recdb */
1267 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
1269 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1273 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1275 /* wipe all the remote databases. This is safe as we are in a transaction */
1277 w.transaction_id = transaction_id;
1279 data.dptr = (void *)&w;
1280 data.dsize = sizeof(w);
1282 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1283 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1285 CONTROL_TIMEOUT(), false, data,
1288 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1293 /* push out the correct database. This sets the dmaster and skips
1294 the empty records */
1295 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1301 /* all done with this database */
1308 reload the nodes file
/*
  Re-read the cluster's nodes file from disk into the ctdb context.
 */
1310 static void reload_nodes_file(struct ctdb_context *ctdb)
1313 ctdb_load_nodes_file(ctdb);
/* Main recovery-run driver, executed only on the elected recmaster.
   Sequence (as visible in this truncated listing): ban a repeat
   culprit, take the recovery lock, gather the db map, create missing
   local/remote databases, freeze the cluster (recovery mode ACTIVE),
   run "startrecovery", pull+wipe+push every database inside a
   cluster-wide transaction under a fresh generation id, rebuild the
   vnnmap from lmaster-capable active nodes, push vnnmap/recmaster/
   flags back out, re-enable normal mode, re-run IP takeover, fire the
   "recovered" event and broadcast CTDB_SRVID_RECONFIGURE.
   NOTE(review): most error-check branches ("if (ret != 0) {"),
   returns and closing braces are elided from this listing — confirm
   against the full file before changing anything here. */
1318 we are the recmaster, and recovery is needed - start a recovery run
1320 static int do_recovery(struct ctdb_recoverd *rec,
1321 TALLOC_CTX *mem_ctx, uint32_t pnn,
1322 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
1325 struct ctdb_context *ctdb = rec->ctdb;
1327 uint32_t generation;
1328 struct ctdb_dbid_map *dbmap;
1331 struct timeval start_time;
1333 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1335 /* if recovery fails, force it again */
1336 rec->need_recovery = true;
/* Phase 0: culprit accounting — a node blamed for more than
   2*num_nodes recoveries within the tracking window gets banned. */
1338 if (culprit != -1) {
1339 ctdb_set_culprit(rec, culprit);
1342 if (rec->culprit_counter > 2*nodemap->num) {
1343 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
1344 rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
1345 ctdb->tunable.recovery_ban_period));
1346 ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
/* Phase 1: take the (cluster-filesystem) recovery lock; failure marks
   ourselves as culprit and aborts this run. Lock latency is reported
   back to the main daemon. */
1349 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1350 start_time = timeval_current();
1351 if (!ctdb_recovery_lock(ctdb, true)) {
1352 ctdb_set_culprit(rec, pnn);
1353 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1356 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1357 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1359 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
1361 /* get a list of all databases */
1362 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1364 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1368 /* we do the db creation before we set the recovery mode, so the freeze happens
1369 on all databases we will be dealing with. */
1371 /* verify that we have all the databases any other node has */
1372 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1374 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1378 /* verify that all other nodes have all our databases */
1379 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1381 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1385 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1388 /* set recovery mode to active on all nodes */
1389 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1391 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1395 /* execute the "startrecovery" event script on all nodes */
1396 ret = run_startrecovery_eventscript(rec, nodemap);
1398 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1402 /* pick a new generation number */
1403 generation = new_generation();
1405 /* change the vnnmap on this node to use the new generation
1406 number but not on any other nodes.
1407 this guarantees that if we abort the recovery prematurely
1408 for some reason (a node stops responding?)
1409 that we can just return immediately and we will reenter
1410 recovery shortly again.
1411 I.e. we deliberately leave the cluster with an inconsistent
1412 generation id to allow us to abort recovery at any stage and
1413 just restart it from scratch.
1415 vnnmap->generation = generation;
1416 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1418 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* Phase 2: cluster-wide transaction keyed by the new generation id;
   the generation is the payload of TRANSACTION_START/COMMIT. */
1422 data.dptr = (void *)&generation;
1423 data.dsize = sizeof(uint32_t);
1425 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1426 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1428 CONTROL_TIMEOUT(), false, data,
1431 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1435 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* Phase 3: recover each database (pull all remote copies, wipe,
   push merged copy) — see recover_database() earlier in the file. */
1437 for (i=0;i<dbmap->num;i++) {
1438 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1439 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1444 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1446 /* commit all the changes */
1447 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1449 CONTROL_TIMEOUT(), false, data,
1452 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1456 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1459 /* update the capabilities for all nodes */
1460 ret = update_capabilities(ctdb, nodemap);
1462 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
/* Phase 4: rebuild the vnnmap with a fresh generation, including only
   active nodes that advertise CTDB_CAP_LMASTER; if none qualify, fall
   back to the local node (the recmaster) alone. */
1466 /* build a new vnn map with all the currently active and
1468 generation = new_generation();
1469 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1470 CTDB_NO_MEMORY(ctdb, vnnmap);
1471 vnnmap->generation = generation;
1473 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1474 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1475 for (i=j=0;i<nodemap->num;i++) {
1476 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1479 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1480 /* this node can not be an lmaster */
1481 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1486 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1487 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1488 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1491 if (vnnmap->size == 0) {
1492 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1494 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1495 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1496 vnnmap->map[0] = pnn;
1499 /* update to the new vnnmap on all nodes */
1500 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1502 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1506 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1508 /* update recmaster to point to us for all nodes */
1509 ret = set_recovery_master(ctdb, nodemap, pnn);
1511 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1515 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
/* Phase 5: propagate our view of node flags (disconnected nodes are
   skipped — the loop body's skip branch is elided in this listing). */
1518 update all nodes to have the same flags that we have
1520 for (i=0;i<nodemap->num;i++) {
1521 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1525 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1527 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1532 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1534 /* disable recovery mode */
1535 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
1537 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1541 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
/* Phase 6: redistribute public IPs and run the "recovered" event,
   then tell all connected clients the cluster was reconfigured. */
1544 tell nodes to takeover their public IPs
1546 rec->need_takeover_run = false;
1547 ret = ctdb_takeover_run(ctdb, nodemap);
1549 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1552 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1554 /* execute the "recovered" event script on all nodes */
1555 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1557 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1561 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1563 /* send a message to all clients telling them that the cluster
1564 has been reconfigured */
1565 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1567 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1569 rec->need_recovery = false;
1571 /* We just finished a recovery successfully.
1572 We now wait for rerecovery_timeout before we allow
1573 another recovery to take place.
1575 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1576 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1577 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
/* Wire format broadcast on CTDB_SRVID_RECOVERY during recmaster
   elections; compared field-by-field in ctdb_election_win().
   NOTE(review): the pnn field and closing brace are elided in this
   listing (em->pnn is referenced elsewhere). */
1584 elections are won by first checking the number of connected nodes, then
1585 the priority time, then the pnn
1587 struct election_message {
1588 uint32_t num_connected;
/* start time of this recovery daemon — older (smaller) wins ties */
1589 struct timeval priority_time;
/* sender's NODE_FLAGS_* at send time (banned nodes cannot win) */
1591 uint32_t node_flags;
/* Fill *em with this node's election credentials: pnn, daemon start
   time, node flags, and the count of nodes we currently see as
   connected (from a fresh local nodemap). If we lack the RECMASTER
   capability the entry is deliberately made unattractive (zero
   connections, current time as priority). */
1595 form this nodes election data
1597 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1600 struct ctdb_node_map *nodemap;
1601 struct ctdb_context *ctdb = rec->ctdb;
1605 em->pnn = rec->ctdb->pnn;
1606 em->priority_time = rec->priority_time;
1607 em->node_flags = rec->node_flags;
1609 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1611 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1615 for (i=0;i<nodemap->num;i++) {
1616 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1617 em->num_connected++;
1621 /* we shouldn't try to win this election if we can't be a recmaster */
1622 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1623 em->num_connected = 0;
1624 em->priority_time = timeval_current();
1627 talloc_free(nodemap);
/* Decide whether WE beat the election message *em sent by a peer.
   Losing conditions first (no RECMASTER capability, we are banned),
   automatic win if the peer is banned, then tie-break in order:
   most connected nodes > longest-running daemon > lowest pnn.
   NOTE(review): the return statements for each comparison are elided
   in this listing. */
1631 see if the given election data wins
1633 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1635 struct election_message myem;
1638 ctdb_election_data(rec, &myem);
1640 /* we cant win if we dont have the recmaster capability */
1641 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1645 /* we cant win if we are banned */
1646 if (rec->node_flags & NODE_FLAGS_BANNED) {
1650 /* we will automatically win if the other node is banned */
1651 if (em->node_flags & NODE_FLAGS_BANNED) {
1655 /* try to use the most connected node */
1657 cmp = (int)myem.num_connected - (int)em->num_connected;
1660 /* then the longest running node */
1662 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break on pnn */
1666 cmp = (int)myem.pnn - (int)em->pnn;
/* Broadcast our election credentials on CTDB_SRVID_RECOVERY to all
   nodes. When update_recmaster is true we also optimistically set the
   local recmaster to ourselves (assuming we will win), so that a newly
   joined frozen node sees a consistent recmaster. */
1673 send out an election request
1675 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1678 TDB_DATA election_data;
1679 struct election_message emsg;
1681 struct ctdb_context *ctdb = rec->ctdb;
1683 srvid = CTDB_SRVID_RECOVERY;
1685 ctdb_election_data(rec, &emsg);
1687 election_data.dsize = sizeof(struct election_message);
1688 election_data.dptr = (unsigned char *)&emsg;
1691 /* send an election message to all active nodes */
1692 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1695 /* A new node that is already frozen has entered the cluster.
1696 The existing nodes are not frozen and dont need to be frozen
1697 until the election has ended and we start the actual recovery
1699 if (update_recmaster == true) {
1700 /* first we assume we will win the election and set
1701 recoverymaster to be ourself on the current node
1703 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1705 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
/* Clear the BANNED flag on every connected node via modflags.
   Called when a different node takes over as recmaster (see
   election_handler); uses a throw-away talloc context for the
   nodemap fetch. */
1715 this function will unban all nodes in the cluster
1717 static void unban_all_nodes(struct ctdb_context *ctdb)
1720 struct ctdb_node_map *nodemap;
1721 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1723 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1725 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1729 for (i=0;i<nodemap->num;i++) {
1730 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1731 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
/* last arg clears NODE_FLAGS_BANNED; second-to-last sets nothing */
1732 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1736 talloc_free(tmp_ctx);
/* Timed-event callback: we believe we are winning the election, so
   rebroadcast our election request (without touching the recmaster
   setting) and clear the one-shot timer handle so election_handler
   may schedule it again. */
1741 we think we are winning the election - send a broadcast election request
1743 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1745 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1748 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1750 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1753 talloc_free(rec->send_election_te);
1754 rec->send_election_te = NULL;
/* Message handler for CTDB_SRVID_MEM_DUMP: validate the reply-address
   payload (struct rd_memdump_reply), collect a talloc memory report
   into *dump, and send it back to the requester's pnn/srvid. All
   allocations hang off tmp_ctx, freed on every exit path. */
1758 handler for memory dumps
1760 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1761 TDB_DATA data, void *private_data)
1763 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1766 struct rd_memdump_reply *rd;
1768 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1769 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1770 talloc_free(tmp_ctx);
1773 rd = (struct rd_memdump_reply *)data.dptr;
1775 dump = talloc_zero(tmp_ctx, TDB_DATA);
1777 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1778 talloc_free(tmp_ctx);
1781 ret = ctdb_dump_memory(ctdb, dump);
1783 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1784 talloc_free(tmp_ctx);
1788 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1790 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1792 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1793 talloc_free(tmp_ctx);
1797 talloc_free(tmp_ctx);
/* Message handler for CTDB_SRVID_RELOAD_NODES: delegate to
   reload_nodes_file() on this daemon's ctdb context. The srvid/data
   arguments are unused here. */
1801 handler for reload_nodes
1803 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1804 TDB_DATA data, void *private_data)
1806 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1808 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1810 reload_nodes_file(rec->ctdb);
/* Message handler for CTDB_SRVID_RECOVERY (election packets).
   Resets the election timeout, then either (a) we win: schedule a
   delayed rebroadcast of our own request, or (b) we concede: cancel
   any pending rebroadcast, drop the recovery-lock fd if a different
   node won, unban everyone, point our recmaster at the winner, and
   reset local ban bookkeeping.
   NOTE(review): payload size is not validated before the cast at
   line 1823 (unlike monitor_handler) — confirm against the full file
   whether a dsize check was elided from this listing. */
1816 handler for recovery master elections
1818 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1819 TDB_DATA data, void *private_data)
1821 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1823 struct election_message *em = (struct election_message *)data.dptr;
1824 TALLOC_CTX *mem_ctx;
1826 /* we got an election packet - update the timeout for the election */
1827 talloc_free(rec->election_timeout);
1828 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1829 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1830 ctdb_election_timeout, rec);
1832 mem_ctx = talloc_new(ctdb);
1834 /* someone called an election. check their election data
1835 and if we disagree and we would rather be the elected node,
1836 send a new election message to all other nodes
1838 if (ctdb_election_win(rec, em)) {
1839 if (!rec->send_election_te) {
/* rebroadcast after 0.5s rather than immediately, to damp storms */
1840 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1841 timeval_current_ofs(0, 500000),
1842 election_send_request, rec);
1844 talloc_free(mem_ctx);
1845 /*unban_all_nodes(ctdb);*/
/* we lost: stop advertising ourselves */
1850 talloc_free(rec->send_election_te);
1851 rec->send_election_te = NULL;
1853 /* release the recmaster lock */
1854 if (em->pnn != ctdb->pnn &&
1855 ctdb->recovery_lock_fd != -1) {
1856 close(ctdb->recovery_lock_fd);
1857 ctdb->recovery_lock_fd = -1;
1858 unban_all_nodes(ctdb);
1861 /* ok, let that guy become recmaster then */
1862 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1864 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1865 talloc_free(mem_ctx);
1869 /* release any bans */
1870 rec->last_culprit = (uint32_t)-1;
1871 talloc_free(rec->banned_nodes);
1872 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1873 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1875 talloc_free(mem_ctx);
/* Kick off a recmaster election: freeze the cluster (recovery mode
   ACTIVE stops internode traffic), arm the election timeout, send our
   election request (optimistically claiming recmaster locally), then
   block in ctdb_wait_election() to collect responses. */
1881 force the start of the election process
1883 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1884 struct ctdb_node_map *nodemap)
1887 struct ctdb_context *ctdb = rec->ctdb;
1889 /* set all nodes to recovery mode to stop all internode traffic */
1890 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1892 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1896 talloc_free(rec->election_timeout);
1897 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1898 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1899 ctdb_election_timeout, rec);
1901 ret = send_election_request(rec, pnn, true);
1903 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1907 /* wait for a few seconds to collect all responses */
1908 ctdb_wait_election(rec);
/* Message handler for CTDB_SRVID_SET_NODE_FLAGS: a node's flags
   changed. Validates the payload size, locates the node in a fresh
   nodemap, records the new flags, refreshes our cached recmaster and
   recmode, and — when we are the recmaster in NORMAL mode — marks a
   takeover run as needed if the DISABLED bits changed (disconnect/ban
   failovers are handled elsewhere, per the elided comment). */
1914 handler for when a node changes its flags
1916 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1917 TDB_DATA data, void *private_data)
1920 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1921 struct ctdb_node_map *nodemap=NULL;
1922 TALLOC_CTX *tmp_ctx;
1923 uint32_t changed_flags;
1925 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1927 if (data.dsize != sizeof(*c)) {
1928 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1932 tmp_ctx = talloc_new(ctdb);
1933 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1935 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1937 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1938 talloc_free(tmp_ctx);
/* find the nodemap entry for the pnn named in the message */
1943 for (i=0;i<nodemap->num;i++) {
1944 if (nodemap->nodes[i].pnn == c->pnn) break;
1947 if (i == nodemap->num) {
1948 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1949 talloc_free(tmp_ctx);
1953 changed_flags = c->old_flags ^ c->new_flags;
1955 if (nodemap->nodes[i].flags != c->new_flags) {
1956 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1959 nodemap->nodes[i].flags = c->new_flags;
1961 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1962 CTDB_CURRENT_NODE, &ctdb->recovery_master);
1965 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1966 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
1970 ctdb->recovery_master == ctdb->pnn &&
1971 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
1972 /* Only do the takeover run if the perm disabled or unhealthy
1973 flags changed since these will cause an ip failover but not
1975 If the node became disconnected or banned this will also
1976 lead to an ip address failover but that is handled
1979 if (changed_flags & NODE_FLAGS_DISABLED) {
1980 rec->need_takeover_run = true;
1984 talloc_free(tmp_ctx);
/* Message handler for CTDB_SRVID_PUSH_NODE_FLAGS: replicate the
   flag change in the payload to all nodes — modflags is called with
   new_flags to set and ~new_flags to clear, i.e. the flags word is
   replaced wholesale.
   NOTE(review): no dsize validation is visible before the cast —
   confirm against the full file whether a check was elided. */
1988 handler for when we need to push out flag changes ot all other nodes
1990 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
1991 TDB_DATA data, void *private_data)
1994 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1996 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
1998 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Shared state for the async getrecmode fan-out in verify_recmode():
   holds the aggregate result (worst status seen so far).
   NOTE(review): the outstanding-reply counter field ("count",
   referenced as rmdata->count below) is elided in this listing. */
2003 struct verify_recmode_normal_data {
2005 enum monitor_result status;
/* Completion callback for one async getrecmode control. Downgrades
   the aggregate status to FAILED on transport error (only if still
   OK, so RECOVERY_NEEDED is never overwritten), or to
   RECOVERY_NEEDED if the replying node is not in NORMAL mode. */
2008 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2010 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2013 /* one more node has responded with recmode data*/
2016 /* if we failed to get the recmode, then return an error and let
2017 the main loop try again.
2019 if (state->state != CTDB_CONTROL_DONE) {
2020 if (rmdata->status == MONITOR_OK) {
2021 rmdata->status = MONITOR_FAILED;
2026 /* if we got a response, then the recmode will be stored in the
2029 if (state->status != CTDB_RECOVERY_NORMAL) {
2030 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2031 rmdata->status = MONITOR_RECOVERY_NEEDED;
/* Poll every active node's recovery mode asynchronously and pump the
   event loop until all replies arrive; returns the aggregate
   monitor_result computed by verify_recmode_normal_callback().
   Inactive nodes are skipped; a failed send aborts immediately with
   MONITOR_FAILED. */
2038 /* verify that all nodes are in normal recovery mode */
2039 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2041 struct verify_recmode_normal_data *rmdata;
2042 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2043 struct ctdb_client_control_state *state;
2044 enum monitor_result status;
2047 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2048 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2050 rmdata->status = MONITOR_OK;
2052 /* loop over all active nodes and send an async getrecmode call to
2054 for (j=0; j<nodemap->num; j++) {
2055 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2058 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2060 nodemap->nodes[j].pnn);
2061 if (state == NULL) {
2062 /* we failed to send the control, treat this as
2063 an error and try again next iteration
2065 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2066 talloc_free(mem_ctx);
2067 return MONITOR_FAILED;
2070 /* set up the callback functions */
2071 state->async.fn = verify_recmode_normal_callback;
2072 state->async.private_data = rmdata;
2074 /* one more control to wait for to complete */
2079 /* now wait for up to the maximum number of seconds allowed
2080 or until all nodes we expect a response from has replied
2082 while (rmdata->count > 0) {
2083 event_loop_once(ctdb->ev);
/* copy out before freeing mem_ctx, which owns rmdata */
2086 status = rmdata->status;
2087 talloc_free(mem_ctx);
/* Shared state for the async getrecmaster fan-out in
   verify_recmaster(): the recoverd (for culprit attribution), the
   expected recmaster pnn, and the aggregate status.
   NOTE(review): the "count" and "pnn" fields referenced below are
   elided in this listing. */
2092 struct verify_recmaster_data {
2093 struct ctdb_recoverd *rec;
2096 enum monitor_result status;
/* Completion callback for one async getrecmaster control. Transport
   failure downgrades OK to FAILED; a node that names a different
   recmaster than rmdata->pnn is marked culprit and forces
   MONITOR_ELECTION_NEEDED. */
2099 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2101 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2104 /* one more node has responded with recmaster data*/
2107 /* if we failed to get the recmaster, then return an error and let
2108 the main loop try again.
2110 if (state->state != CTDB_CONTROL_DONE) {
2111 if (rmdata->status == MONITOR_OK) {
2112 rmdata->status = MONITOR_FAILED;
2117 /* if we got a response, then the recmaster will be stored in the
2120 if (state->status != rmdata->pnn) {
2121 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2122 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2123 rmdata->status = MONITOR_ELECTION_NEEDED;
/* Ask every active node who it thinks the recmaster is (async
   fan-out, mirror of verify_recmode) and pump the event loop until
   all replies land; returns the aggregate status from
   verify_recmaster_callback() — ELECTION_NEEDED if anyone disagrees
   that `pnn` (us) is the recmaster. */
2130 /* verify that all nodes agree that we are the recmaster */
2131 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2133 struct ctdb_context *ctdb = rec->ctdb;
2134 struct verify_recmaster_data *rmdata;
2135 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2136 struct ctdb_client_control_state *state;
2137 enum monitor_result status;
2140 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2141 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2145 rmdata->status = MONITOR_OK;
2147 /* loop over all active nodes and send an async getrecmaster call to
2149 for (j=0; j<nodemap->num; j++) {
2150 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2153 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2155 nodemap->nodes[j].pnn);
2156 if (state == NULL) {
2157 /* we failed to send the control, treat this as
2158 an error and try again next iteration
2160 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2161 talloc_free(mem_ctx);
2162 return MONITOR_FAILED;
2165 /* set up the callback functions */
2166 state->async.fn = verify_recmaster_callback;
2167 state->async.private_data = rmdata;
2169 /* one more control to wait for to complete */
2174 /* now wait for up to the maximum number of seconds allowed
2175 or until all nodes we expect a response from has replied
2177 while (rmdata->count > 0) {
2178 event_loop_once(ctdb->ev);
/* copy out before freeing mem_ctx, which owns rmdata */
2181 status = rmdata->status;
2182 talloc_free(mem_ctx);
/* Sanity-check the local public IP assignment. The uptime is read
   both before and after fetching the IP list; if a recovery started
   or finished in between (or one is in flight), the check is skipped
   to avoid acting on a stale snapshot. Any mismatch — an IP we should
   hold but don't, or one we hold but shouldn't — freezes the local
   node and forces recovery mode ACTIVE so the recmaster performs a
   full recovery. */
2187 /* called to check that the allocation of public ip addresses is ok.
2189 static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
2191 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2192 struct ctdb_all_public_ips *ips = NULL;
2193 struct ctdb_uptime *uptime1 = NULL;
2194 struct ctdb_uptime *uptime2 = NULL;
2197 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2198 CTDB_CURRENT_NODE, &uptime1);
2200 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2201 talloc_free(mem_ctx);
2205 /* read the ip allocation from the local node */
2206 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2208 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2209 talloc_free(mem_ctx);
2213 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2214 CTDB_CURRENT_NODE, &uptime2);
2216 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2217 talloc_free(mem_ctx);
2221 /* skip the check if the startrecovery time has changed */
2222 if (timeval_compare(&uptime1->last_recovery_started,
2223 &uptime2->last_recovery_started) != 0) {
2224 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2225 talloc_free(mem_ctx);
2229 /* skip the check if the endrecovery time has changed */
2230 if (timeval_compare(&uptime1->last_recovery_finished,
2231 &uptime2->last_recovery_finished) != 0) {
2232 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2233 talloc_free(mem_ctx);
2237 /* skip the check if we have started but not finished recovery */
2238 if (timeval_compare(&uptime1->last_recovery_finished,
2239 &uptime1->last_recovery_started) != 1) {
2240 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
2241 talloc_free(mem_ctx);
2246 /* verify that we have the ip addresses we should have
2247 and we dont have ones we shouldnt have.
2248 if we find an inconsistency we set recmode to
2249 active on the local node and wait for the recmaster
2250 to do a full blown recovery
2252 for (j=0; j<ips->num; j++) {
2253 if (ips->ips[j].pnn == pnn) {
/* assigned to us — we must actually hold it on an interface */
2254 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2255 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2256 ctdb_addr_to_str(&ips->ips[j].addr)));
2257 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2259 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2261 talloc_free(mem_ctx);
2264 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2266 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2268 talloc_free(mem_ctx);
/* assigned to another node — we must NOT hold it */
2273 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2274 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2275 ctdb_addr_to_str(&ips->ips[j].addr)));
2277 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2279 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2281 talloc_free(mem_ctx);
2284 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2286 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2288 talloc_free(mem_ctx);
2295 talloc_free(mem_ctx);
/* Per-node callback for the GET_NODEMAP fan-out: stash the replying
   node's nodemap into remote_nodemaps[pnn], stealing the talloc
   ownership of the reply buffer onto the array. Out-of-range pnn is
   rejected (the index would otherwise overrun the array). */
2300 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2302 struct ctdb_node_map **remote_nodemaps = callback_data;
2304 if (node_pnn >= ctdb->num_nodes) {
2305 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2309 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Fetch the nodemap from every active node in one async broadcast of
   CTDB_CONTROL_GET_NODEMAP; replies are collected into
   remote_nodemaps[] by async_getnodemap_callback(). Returns non-zero
   if any node failed to answer. */
2313 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2314 struct ctdb_node_map *nodemap,
2315 struct ctdb_node_map **remote_nodemaps)
2319 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2320 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2322 CONTROL_TIMEOUT(), false, tdb_null,
2323 async_getnodemap_callback,
2325 remote_nodemaps) != 0) {
2326 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* State machine for the forked reclock-file checker used by
   check_recovery_lock(): result codes, plus the child pid, pipe fds,
   timeout timer and fd event that drive it.
   NOTE(review): the "fd[2]" and "child" members referenced elsewhere
   are elided in this listing. */
2334 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
2335 struct ctdb_check_reclock_state {
2336 struct ctdb_context *ctdb;
2337 struct timeval start_time;
2340 struct timed_event *te;
2341 struct fd_event *fde;
2342 enum reclock_child_status status;
/* talloc destructor for ctdb_check_reclock_state: report how long the
   reclock check took, close both pipe ends (if still open) and
   SIGKILL the child so it can never outlive the state object. */
2345 /* when we free the reclock state we must kill any child process.
2347 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2349 struct ctdb_context *ctdb = state->ctdb;
2351 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2353 if (state->fd[0] != -1) {
2354 close(state->fd[0]);
2357 if (state->fd[1] != -1) {
2358 close(state->fd[1]);
2361 kill(state->child, SIGKILL);
/* Timeout handler for the reclock child: if the child has not
   reported within the deadline, assume the cluster filesystem is
   blocking I/O on the reclock file and flag RECLOCK_TIMEOUT so
   check_recovery_lock()'s wait loop exits. */
2366 called if our check_reclock child times out. this would happen if
2367 i/o to the reclock file blocks.
2369 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2370 struct timeval t, void *private_data)
2372 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2373 struct ctdb_check_reclock_state);
2375 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2376 state->status = RECLOCK_TIMEOUT;
/* fd event handler: the reclock child wrote its one-byte verdict to
   the pipe. Cancel the timeout timer, read the byte, and translate it
   into RECLOCK_OK / RECLOCK_FAILED (a short read also counts as
   failure). */
2379 /* this is called when the child process has completed checking the reclock
2380 file and has written data back to us through the pipe.
2382 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2383 uint16_t flags, void *private_data)
2385 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2386 struct ctdb_check_reclock_state);
2390 /* we got a response from our child process so we can abort the
2393 talloc_free(state->te);
2396 ret = read(state->fd[0], &c, 1);
2397 if (ret != 1 || c != RECLOCK_OK) {
2398 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2399 state->status = RECLOCK_FAILED;
2404 state->status = RECLOCK_OK;
/* Verify that our held recovery-lock fd is still readable, using a
   forked child so a hung cluster filesystem cannot block the recovery
   daemon itself. The child pread()s one byte from the lock fd, writes
   the verdict down a pipe, then lingers until its parent dies. The
   parent waits (with a 15s timeout event) for either the verdict or
   the timeout; on RECLOCK_FAILED the lock fd is closed so the next
   cycle reacquires it.
   NOTE(review): the child writes the verdict byte once before its
   "die with parent" loop and again inside it — the elided lines
   likely include a sleep; confirm against the full file. */
2408 static int check_recovery_lock(struct ctdb_context *ctdb)
2411 struct ctdb_check_reclock_state *state;
2412 pid_t parent = getpid();
2414 if (ctdb->recovery_lock_fd == -1) {
2415 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2419 state = talloc(ctdb, struct ctdb_check_reclock_state);
2420 CTDB_NO_MEMORY(ctdb, state);
2423 state->start_time = timeval_current();
2424 state->status = RECLOCK_CHECKING;
2428 ret = pipe(state->fd);
2431 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2435 state->child = fork();
2436 if (state->child == (pid_t)-1) {
2437 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2438 close(state->fd[0]);
2440 close(state->fd[1]);
/* child: probe the lock fd and report via the write end of the pipe */
2446 if (state->child == 0) {
2447 char cc = RECLOCK_OK;
2448 close(state->fd[0]);
2451 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2452 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2453 cc = RECLOCK_FAILED;
2456 write(state->fd[1], &cc, 1);
2457 /* make sure we die when our parent dies */
2458 while (kill(parent, 0) == 0 || errno != ESRCH) {
2460 write(state->fd[1], &cc, 1);
/* parent: keep only the read end; destructor cleans up child+fds */
2464 close(state->fd[1]);
2467 talloc_set_destructor(state, check_reclock_destructor);
2469 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2470 ctdb_check_reclock_timeout, state);
2471 if (state->te == NULL) {
2472 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2477 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2478 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2479 reclock_child_handler,
2482 if (state->fde == NULL) {
2483 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
2488 while (state->status == RECLOCK_CHECKING) {
2489 event_loop_once(ctdb->ev);
2492 if (state->status == RECLOCK_FAILED) {
2493 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2494 close(ctdb->recovery_lock_fd);
2495 ctdb->recovery_lock_fd = -1;
2505 the main monitoring loop
/*
 * The recovery daemon's main loop.
 *
 * After registering its message handlers it iterates (roughly once per
 * recover_interval): ping the local ctdbd, refresh tunables / pnn / vnnmap /
 * nodemap, make sure a recovery master exists and is reachable, and - only
 * when this node IS the recmaster - verify cluster-wide consistency of node
 * maps, node flags and vnnmaps, calling force_election() or do_recovery()
 * whenever a discrepancy is found.
 *
 * NOTE(review): the extraction of this file has dropped interior lines
 * (closing braces, continue/return statements, some error-handling paths),
 * so the text below is an incomplete rendering of the original function.
 * The code is deliberately left byte-identical; only comments were added.
 */
2507 static void monitor_cluster(struct ctdb_context *ctdb)
/* per-iteration scratch context; freed and re-created each pass of the loop */
2510 TALLOC_CTX *mem_ctx=NULL;
2511 struct ctdb_node_map *nodemap=NULL;
2512 struct ctdb_node_map *recmaster_nodemap=NULL;
2513 struct ctdb_node_map **remote_nodemaps=NULL;
2514 struct ctdb_vnn_map *vnnmap=NULL;
2515 struct ctdb_vnn_map *remote_vnnmap=NULL;
2516 int32_t debug_level;
2518 struct ctdb_recoverd *rec;
2520 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
/* rec carries the recovery daemon's private state (struct ctdb_recoverd,
   see top of file); allocated on ctdb so it outlives every iteration */
2522 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2523 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2526 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
2527 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
2529 rec->priority_time = timeval_current();
2531 /* register a message port for sending memory dumps */
2532 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2534 /* register a message port for recovery elections */
2535 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2537 /* when nodes are disabled/enabled */
2538 ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2540 /* when we are asked to puch out a flag change */
2541 ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2543 /* when nodes are banned */
2544 ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
2546 /* and one for when nodes are unbanned */
2547 ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);
2549 /* register a message port for vacuum fetch */
2550 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2552 /* register a message port for reloadnodes */
2553 ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
/* ---- top of the (loop construct elided by extraction) monitoring loop:
   throw away last iteration's allocations and start a fresh context ---- */
2557 talloc_free(mem_ctx);
2560 mem_ctx = talloc_new(ctdb);
2562 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2566 /* we only check for recovery once every second */
2567 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2569 /* verify that the main daemon is still running */
2570 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2571 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2575 /* ping the local daemon to tell it we are alive */
2576 ctdb_ctrl_recd_ping(ctdb);
2578 if (rec->election_timeout) {
2579 /* an election is in progress */
/* (elided: skip the rest of this iteration while the election runs) */
2583 /* read the debug level from the parent and update locally */
2584 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2586 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2589 LogLevel = debug_level;
2592 /* We must check if we need to ban a node here but we want to do this
2593 as early as possible so we dont wait until we have pulled the node
2594 map from the local node. thats why we have the hardcoded value 20
2596 if (rec->culprit_counter > 20) {
2597 DEBUG(DEBUG_NOTICE,("Node %u has caused %u failures in %.0f seconds - banning it for %u seconds\n",
2598 rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
2599 ctdb->tunable.recovery_ban_period));
2600 ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
2603 /* get relevant tunables */
2604 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2606 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
/* refresh our own pnn every iteration; -1 signals the control failed */
2610 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2611 if (pnn == (uint32_t)-1) {
2612 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2616 /* get the vnnmap */
2617 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2619 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2624 /* get number of nodes */
/* rec->nodemap is allocated on rec (not mem_ctx) so it survives the
   per-iteration talloc_free; drop the stale copy before refetching */
2626 talloc_free(rec->nodemap);
2627 rec->nodemap = NULL;
2630 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2632 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2635 nodemap = rec->nodemap;
2637 /* check which node is the recovery master */
2638 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2640 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
/* (uint32_t)-1 means no recovery master has ever been set */
2644 if (rec->recmaster == (uint32_t)-1) {
2645 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2646 force_election(rec, pnn, nodemap);
2650 /* check that we (recovery daemon) and the local ctdb daemon
2651 agrees on whether we are banned or not
2653 if (nodemap->nodes[pnn].flags & NODE_FLAGS_BANNED) {
/* daemon says banned, but our own ban bookkeeping has no entry */
2654 if (rec->banned_nodes[pnn] == NULL) {
2655 if (rec->recmaster == pnn) {
2656 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on recmaster thinks this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
2658 ctdb_unban_node(rec, pnn);
2660 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on non-recmaster thinks this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
2661 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2662 ctdb_set_culprit(rec, pnn);
/* converse case: daemon says not banned while we hold a ban record */
2667 if (rec->banned_nodes[pnn] != NULL) {
2668 if (rec->recmaster == pnn) {
2669 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on recmaster does not think this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
2671 ctdb_unban_node(rec, pnn);
2673 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on non-recmaster does not think this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
2675 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2676 ctdb_set_culprit(rec, pnn);
2682 /* remember our own node flags */
2683 rec->node_flags = nodemap->nodes[pnn].flags;
2685 /* count how many active nodes there are */
2686 rec->num_active = 0;
2687 rec->num_connected = 0;
2688 for (i=0; i<nodemap->num; i++) {
2689 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2692 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2693 rec->num_connected++;
2698 /* verify that the recmaster node is still active */
2699 for (j=0; j<nodemap->num; j++) {
2700 if (nodemap->nodes[j].pnn==rec->recmaster) {
/* j now indexes the recmaster's nodemap slot (or equals nodemap->num) */
2705 if (j == nodemap->num) {
2706 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2707 force_election(rec, pnn, nodemap);
2711 /* if recovery master is disconnected we must elect a new recmaster */
2712 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2713 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2714 force_election(rec, pnn, nodemap);
2718 /* grap the nodemap from the recovery master to check if it is banned */
2719 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2720 mem_ctx, &recmaster_nodemap);
2722 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2723 nodemap->nodes[j].pnn));
2728 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2729 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2730 force_election(rec, pnn, nodemap);
2735 /* verify that we have all ip addresses we should have and we dont
2736 * have addresses we shouldnt have.
2738 if (ctdb->do_checkpublicip) {
2739 if (verify_ip_allocation(ctdb, pnn) != 0) {
2740 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2746 /* if we are not the recmaster then we do not need to check
2747 if recovery is needed
2749 if (pnn != rec->recmaster) {
/* ---- everything below this point runs only on the recovery master ---- */
2754 /* ensure our local copies of flags are right */
2755 ret = update_local_flags(rec, nodemap);
2756 if (ret == MONITOR_ELECTION_NEEDED) {
2757 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2758 force_election(rec, pnn, nodemap);
2761 if (ret != MONITOR_OK) {
2762 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2766 /* update the list of public ips that a node can handle for
2769 if (ctdb->num_nodes != nodemap->num) {
2770 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2771 reload_nodes_file(ctdb);
2774 for (j=0; j<nodemap->num; j++) {
2775 /* release any existing data */
2776 if (ctdb->nodes[j]->public_ips) {
2777 talloc_free(ctdb->nodes[j]->public_ips);
2778 ctdb->nodes[j]->public_ips = NULL;
/* inactive nodes are skipped; their public_ips stay NULL */
2781 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2785 /* grab a new shiny list of public ips from the node */
2786 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2787 ctdb->nodes[j]->pnn,
2789 &ctdb->nodes[j]->public_ips)) {
2790 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
2791 ctdb->nodes[j]->pnn));
2797 /* verify that all active nodes agree that we are the recmaster */
2798 switch (verify_recmaster(rec, nodemap, pnn)) {
2799 case MONITOR_RECOVERY_NEEDED:
2800 /* can not happen */
2802 case MONITOR_ELECTION_NEEDED:
2803 force_election(rec, pnn, nodemap);
2807 case MONITOR_FAILED:
2812 if (rec->need_recovery) {
2813 /* a previous recovery didn't finish */
/* -1 culprit: no single node to blame for the unfinished recovery */
2814 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, -1);
2818 /* verify that all active nodes are in normal mode
2819 and not in recovery mode
2821 switch (verify_recmode(ctdb, nodemap)) {
2822 case MONITOR_RECOVERY_NEEDED:
2823 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
2825 case MONITOR_FAILED:
2827 case MONITOR_ELECTION_NEEDED:
2828 /* can not happen */
2834 /* we should have the reclock - check its not stale */
2835 ret = check_recovery_lock(ctdb);
2837 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
2838 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
2842 /* get the nodemap for all active remote nodes
2844 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
2845 if (remote_nodemaps == NULL) {
2846 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2849 for(i=0; i<nodemap->num; i++) {
2850 remote_nodemaps[i] = NULL;
2852 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2853 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2857 /* verify that all other nodes have the same nodemap as we have
2859 for (j=0; j<nodemap->num; j++) {
2860 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2864 if (remote_nodemaps[j] == NULL) {
2865 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2866 ctdb_set_culprit(rec, j);
2871 /* if the nodes disagree on how many nodes there are
2872 then this is a good reason to try recovery
2874 if (remote_nodemaps[j]->num != nodemap->num) {
2875 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2876 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2877 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
2881 /* if the nodes disagree on which nodes exist and are
2882 active, then that is also a good reason to do recovery
2884 for (i=0;i<nodemap->num;i++) {
2885 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2886 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2887 nodemap->nodes[j].pnn, i,
2888 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2889 do_recovery(rec, mem_ctx, pnn, nodemap,
2890 vnnmap, nodemap->nodes[j].pnn);
2895 /* verify the flags are consistent
2897 for (i=0; i<nodemap->num; i++) {
2898 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2902 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2903 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2904 nodemap->nodes[j].pnn,
2905 nodemap->nodes[i].pnn,
2906 remote_nodemaps[j]->nodes[i].flags,
2907 nodemap->nodes[j].flags));
/* NOTE(review): the "our" value printed above uses nodes[j].flags while
   the comparison two lines up tests nodes[i].flags - looks like a wrong
   index in the log message; confirm against upstream before changing */
2909 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2910 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2911 do_recovery(rec, mem_ctx, pnn, nodemap,
2912 vnnmap, nodemap->nodes[j].pnn);
2915 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2916 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2917 do_recovery(rec, mem_ctx, pnn, nodemap,
2918 vnnmap, nodemap->nodes[j].pnn);
2926 /* there better be the same number of lmasters in the vnn map
2927 as there are active nodes or we will have to do a recovery
2929 if (vnnmap->size != rec->num_active) {
2930 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
2931 vnnmap->size, rec->num_active));
2932 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
2936 /* verify that all active nodes in the nodemap also exist in
2939 for (j=0; j<nodemap->num; j++) {
2940 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* our own node is trivially consistent; skip it */
2943 if (nodemap->nodes[j].pnn == pnn) {
2947 for (i=0; i<vnnmap->size; i++) {
2948 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2952 if (i == vnnmap->size) {
2953 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2954 nodemap->nodes[j].pnn));
2955 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
2961 /* verify that all other nodes have the same vnnmap
2962 and are from the same generation
2964 for (j=0; j<nodemap->num; j++) {
2965 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2968 if (nodemap->nodes[j].pnn == pnn) {
2972 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2973 mem_ctx, &remote_vnnmap);
2975 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2976 nodemap->nodes[j].pnn));
2980 /* verify the vnnmap generation is the same */
2981 if (vnnmap->generation != remote_vnnmap->generation) {
2982 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2983 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2984 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
2988 /* verify the vnnmap size is the same */
2989 if (vnnmap->size != remote_vnnmap->size) {
2990 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2991 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2992 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
2996 /* verify the vnnmap is the same */
2997 for (i=0;i<vnnmap->size;i++) {
2998 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2999 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3000 nodemap->nodes[j].pnn));
3001 do_recovery(rec, mem_ctx, pnn, nodemap,
3002 vnnmap, nodemap->nodes[j].pnn);
3008 /* we might need to change who has what IP assigned */
3009 if (rec->need_takeover_run) {
/* clear the flag first so a failure below forces a full recovery rather
   than retrying the takeover run forever */
3010 rec->need_takeover_run = false;
3012 /* execute the "startrecovery" event script on all nodes */
3013 ret = run_startrecovery_eventscript(rec, nodemap);
3015 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3016 do_recovery(rec, mem_ctx, pnn, nodemap,
3020 ret = ctdb_takeover_run(ctdb, nodemap);
3022 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3023 do_recovery(rec, mem_ctx, pnn, nodemap,
3027 /* execute the "recovered" event script on all nodes */
3028 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3030 // we cant check whether the event completed successfully
3031 // since this script WILL fail if the node is in recovery mode
3032 // and if that race happens, the code here would just cause a second
3033 // cascading recovery.
3035 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3036 do_recovery(rec, mem_ctx, pnn, nodemap,
3048 event handler for when the main ctdbd dies
/*
 * Fd event handler fired in the recovery daemon when the pipe to the main
 * ctdbd becomes readable/closed, i.e. the parent daemon has died.  The
 * daemon logs and exits.  (The exit call and closing brace are elided by
 * extraction; code left byte-identical.)
 */
3050 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3051 uint16_t flags, void *private_data)
3053 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3058 called regularly to verify that the recovery daemon is still running
/*
 * Periodic watchdog running in the MAIN daemon: if the recovery daemon
 * process has died, shut the whole daemon down in an orderly fashion;
 * otherwise re-arm itself for another 30 seconds.
 * (Some braces and the final exit path are elided by extraction.)
 */
3060 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3061 struct timeval yt, void *p)
3063 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* kill(pid, 0) sends no signal; non-zero return means the recovery
   daemon process no longer exists (or is not signalable) */
3065 if (kill(ctdb->recoverd_pid, 0) != 0) {
3066 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
/* orderly shutdown: stop subsystems, release all public IPs, shut down
   the transport and run the "shutdown" event script */
3068 ctdb_stop_recoverd(ctdb);
3069 ctdb_stop_keepalive(ctdb);
3070 ctdb_stop_monitoring(ctdb);
3071 ctdb_release_all_ips(ctdb);
3072 if (ctdb->methods != NULL) {
3073 ctdb->methods->shutdown(ctdb);
3075 ctdb_event_script(ctdb, "shutdown");
/* re-arm the watchdog for another 30 seconds */
3080 event_add_timed(ctdb->ev, ctdb,
3081 timeval_current_ofs(30, 0),
3082 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap exited children with a
 * non-blocking waitpid() so no zombies accumulate.  (The reaping loop
 * structure and local declarations are partly elided by extraction.)
 */
3085 static void recd_sig_child_handler(struct event_context *ev,
3086 struct signal_event *se, int signum, int count,
3090 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
/* WNOHANG: return immediately if no child has exited yet */
3095 pid = waitpid(-1, &status, WNOHANG);
3097 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%d\n", errno));
3101 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3107 startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon as a child of the main ctdb daemon.
 *
 * Parent path: records the child's pid, arms the 30s ctdb_check_recd
 * watchdog and returns.  Child path: switches into ctdb client mode,
 * watches the pipe to the parent (so it exits when the parent dies),
 * installs a SIGCHLD handler and enters monitor_cluster(), which is not
 * expected to return.
 * (Several return statements and braces are elided by extraction.)
 */
3109 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3112 struct signal_event *se;
/* fd[] links parent and child; the child exits when the parent's end
   closes (see ctdb_recoverd_parent) */
3114 if (pipe(fd) != 0) {
3118 ctdb->ctdbd_pid = getpid();
3120 ctdb->recoverd_pid = fork();
3121 if (ctdb->recoverd_pid == -1) {
3125 if (ctdb->recoverd_pid != 0) {
/* parent: watch over the recovery daemon child */
3127 event_add_timed(ctdb->ev, ctdb,
3128 timeval_current_ofs(30, 0),
3129 ctdb_check_recd, ctdb);
/* ---- child from here on ---- */
/* reseed the PRNG so the child doesn't replay the parent's sequence */
3135 srandom(getpid() ^ time(NULL));
3137 if (switch_from_server_to_client(ctdb) != 0) {
3138 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
/* exit when the parent's end of the pipe is closed */
3142 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3143 ctdb_recoverd_parent, &fd[0]);
3145 /* set up a handler to pick up sigchld */
3146 se = event_add_signal(ctdb->ev, ctdb,
3148 recd_sig_child_handler,
3151 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* enter the main monitoring loop; reaching the line below is an error */
3155 monitor_cluster(ctdb);
3157 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3162 shutdown the recovery daemon
/*
 * Ask the recovery daemon child to terminate via SIGTERM.  A recoverd_pid
 * of 0 means it was never started, so there is nothing to signal.
 * (The early return and closing brace are elided by extraction.)
 */
3164 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3166 if (ctdb->recoverd_pid == 0) {
3170 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3171 kill(ctdb->recoverd_pid, SIGTERM);