4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
/* singly-linked list: one entry per "ctdb ipreallocate" client waiting
   to be notified when the current takeover run completes. */
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
/* presumably the requesting client's reply address/srvid — confirm
   against the rd_memdump_reply declaration in ctdb_private.h */
39 struct rd_memdump_reply *rd;
/* per-node misbehaviour bookkeeping used to accumulate ban credits.
   NOTE(review): a "count" member is referenced elsewhere in this file
   (ban_state->count) but its declaration is not visible in this extract. */
42 struct ctdb_banning_state {
44 struct timeval last_reported_time;
48 private state of recovery daemon
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
/* cluster view: connected-node count, last node blamed for a
   recovery, and our copy of the node map */
54 uint32_t num_connected;
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
58 bool need_takeover_run;
/* timers for recmaster election and their timeout handling */
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout;
63 struct vacuum_info *vacuum_info;
/* pending "ctdb ipreallocate" callers and their talloc context */
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
/* control timeouts derived from tunables; both macros expect a local
   variable named "ctdb" to be in scope at the point of use */
69 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
70 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
74 ban a node for a period of time
76 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
/* ban node "pnn" for "ban_time" seconds by sending it a set-ban
   control; an invalid pnn is rejected with an error log. */
79 struct ctdb_context *ctdb = rec->ctdb;
80 struct ctdb_ban_time bantime;
82 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
84 if (!ctdb_validate_pnn(ctdb, pnn)) {
85 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* NOTE(review): the bantime.pnn assignment is not visible in this
   extract — confirm against the original file */
90 bantime.time = ban_time;
92 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
94 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* result codes returned by the cluster monitoring functions */
100 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
104 run the "recovered" eventscript on all nodes
106 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
/* broadcast CTDB_CONTROL_END_RECOVERY to all active nodes so each runs
   its "recovered" event scripts; "caller" is used only for logging. */
111 tmp_ctx = talloc_new(ctdb);
112 CTDB_NO_MEMORY(ctdb, tmp_ctx);
114 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
115 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
117 CONTROL_TIMEOUT(), false, tdb_null,
120 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
122 talloc_free(tmp_ctx);
126 talloc_free(tmp_ctx);
131 remember the trouble maker
133 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
135 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
136 struct ctdb_banning_state *ban_state;
138 if (culprit > ctdb->num_nodes) {
139 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
143 if (ctdb->nodes[culprit]->ban_state == NULL) {
144 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
145 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
149 ban_state = ctdb->nodes[culprit]->ban_state;
150 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
151 /* this was the first time in a long while this node
152 misbehaved so we will forgive any old transgressions.
154 ban_state->count = 0;
157 ban_state->count += count;
158 ban_state->last_reported_time = timeval_current();
159 rec->last_culprit_node = culprit;
163 remember the trouble maker
165 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
/* convenience wrapper: charge the node a single ban credit */
167 ctdb_set_culprit_count(rec, culprit, 1);
171 /* this callback is called for every node that failed to execute the
   "startrecovery" event: the failing node is charged as a recovery
   culprit so repeat offenders accumulate ban credits. */
174 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
176 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
178 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
180 ctdb_set_culprit(rec, node_pnn);
184 run the "startrecovery" eventscript on all nodes
186 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
/* broadcast CTDB_CONTROL_START_RECOVERY to all active nodes; nodes
   that fail are charged via startrecovery_fail_callback. */
190 struct ctdb_context *ctdb = rec->ctdb;
192 tmp_ctx = talloc_new(ctdb);
193 CTDB_NO_MEMORY(ctdb, tmp_ctx);
195 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
198 CONTROL_TIMEOUT(), false, tdb_null,
200 startrecovery_fail_callback,
202 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
203 talloc_free(tmp_ctx);
207 talloc_free(tmp_ctx);
211 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
213 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
214 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
217 if (node_pnn < ctdb->num_nodes) {
218 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
223 update the node capabilities for all connected nodes
225 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
/* query CTDB_CONTROL_GET_CAPABILITIES from all active nodes; replies
   are recorded per node by async_getcap_callback. */
230 tmp_ctx = talloc_new(ctdb);
231 CTDB_NO_MEMORY(ctdb, tmp_ctx);
233 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
234 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
235 nodes, CONTROL_TIMEOUT(),
237 async_getcap_callback, NULL,
239 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
240 talloc_free(tmp_ctx);
244 talloc_free(tmp_ctx);
248 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
/* a node failed to freeze during recovery: charge it nodemap->num ban
   credits so a consistently failing node is banned quickly. */
250 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
252 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
253 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
257 change recovery mode on all nodes
259 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
/* set the recovery mode on all active nodes; when entering
   CTDB_RECOVERY_ACTIVE the nodes are frozen first. */
265 tmp_ctx = talloc_new(ctdb);
266 CTDB_NO_MEMORY(ctdb, tmp_ctx);
268 /* freeze all nodes */
269 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
270 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
271 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
272 nodes, CONTROL_TIMEOUT(),
275 set_recmode_fail_callback,
277 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
278 talloc_free(tmp_ctx);
/* broadcast the new recovery mode to every active node */
284 data.dsize = sizeof(uint32_t);
285 data.dptr = (unsigned char *)&rec_mode;
287 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
288 nodes, CONTROL_TIMEOUT(),
292 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
293 talloc_free(tmp_ctx);
297 talloc_free(tmp_ctx);
302 change recovery master on all node
304 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
/* tell every active node that "pnn" is the recovery master via
   CTDB_CONTROL_SET_RECMASTER. */
310 tmp_ctx = talloc_new(ctdb);
311 CTDB_NO_MEMORY(ctdb, tmp_ctx);
313 data.dsize = sizeof(uint32_t);
314 data.dptr = (unsigned char *)&pnn;
316 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
317 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
319 CONTROL_TIMEOUT(), false, data,
322 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
323 talloc_free(tmp_ctx);
327 talloc_free(tmp_ctx);
331 /* update all remote nodes to use the same db priority that we have
332 this can fail if the remote node has not yet been upgraded to
333 support this function, so we always return success and never fail
334 a recovery if this call fails.
336 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
337 struct ctdb_node_map *nodemap,
338 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
/* for each local database, read its priority from the local node and
   push that priority to all active nodes.  As the header comment says,
   failures here never fail a recovery. */
343 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
345 /* step through all local databases */
346 for (db=0; db<dbmap->num;db++) {
348 struct ctdb_db_priority db_prio;
351 db_prio.db_id = dbmap->dbs[db].dbid;
352 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
354 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
358 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
360 data.dptr = (uint8_t *)&db_prio;
361 data.dsize = sizeof(db_prio);
363 if (ctdb_client_async_control(ctdb,
364 CTDB_CONTROL_SET_DB_PRIORITY,
366 CONTROL_TIMEOUT(), false, data,
/* best-effort: log and continue so old nodes don't break recovery */
369 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
377 ensure all other nodes have attached to any databases that we have
379 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
380 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
/* make sure every other active node is attached to every database we
   have locally: fetch each remote node's dbmap and create any of our
   databases that are missing there. */
383 struct ctdb_dbid_map *remote_dbmap;
385 /* verify that all other nodes have all our databases */
386 for (j=0; j<nodemap->num; j++) {
387 /* we don't need to check ourselves */
388 if (nodemap->nodes[j].pnn == pnn) {
391 /* dont check nodes that are unavailable */
392 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
396 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
397 mem_ctx, &remote_dbmap);
399 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
403 /* step through all local databases */
404 for (db=0; db<dbmap->num;db++) {
408 for (i=0;i<remote_dbmap->num;i++) {
409 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
413 /* the remote node already have this database */
414 if (i!=remote_dbmap->num) {
417 /* ok so we need to create this database */
418 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
421 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
424 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
425 mem_ctx, name, dbmap->dbs[db].persistent);
427 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
438 ensure we are attached to any databases that anyone else is attached to
440 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
441 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
/* attach locally to every database any other active node has, then
   re-read our own dbmap into *dbmap so the caller sees the new set. */
444 struct ctdb_dbid_map *remote_dbmap;
446 /* verify that we have all database any other node has */
447 for (j=0; j<nodemap->num; j++) {
448 /* we don't need to check ourselves */
449 if (nodemap->nodes[j].pnn == pnn) {
452 /* dont check nodes that are unavailable */
453 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
457 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
458 mem_ctx, &remote_dbmap);
460 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
464 /* step through all databases on the remote node */
465 for (db=0; db<remote_dbmap->num;db++) {
468 for (i=0;i<(*dbmap)->num;i++) {
469 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
473 /* we already have this db locally */
474 if (i!=(*dbmap)->num) {
477 /* ok so we need to create this database and
480 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
481 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
483 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
484 nodemap->nodes[j].pnn));
487 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
488 remote_dbmap->dbs[db].persistent);
490 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh the caller's dbmap now that we attached to a new database */
493 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
495 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
506 pull the remote database contents from one node into the recdb
508 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
509 struct tdb_wrap *recdb, uint32_t dbid)
/* pull database "dbid" from "srcnode" and merge each record into the
   temporary recovery tdb, keeping the record with the highest rsn. */
513 struct ctdb_marshall_buffer *reply;
514 struct ctdb_rec_data *rec;
516 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
518 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
519 CONTROL_TIMEOUT(), &outdata);
521 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
522 talloc_free(tmp_ctx);
526 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
528 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
529 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
530 talloc_free(tmp_ctx);
/* walk the marshalled record blob; each record is key bytes followed
   by an ltdb header + data, advanced by rec->length */
534 rec = (struct ctdb_rec_data *)&reply->data[0];
538 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
540 struct ctdb_ltdb_header *hdr;
543 key.dptr = &rec->data[0];
544 key.dsize = rec->keylen;
545 data.dptr = &rec->data[key.dsize];
546 data.dsize = rec->datalen;
548 hdr = (struct ctdb_ltdb_header *)data.dptr;
550 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
551 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
552 talloc_free(tmp_ctx);
556 /* fetch the existing record, if any */
557 existing = tdb_fetch(recdb->tdb, key);
559 if (existing.dptr != NULL) {
560 struct ctdb_ltdb_header header;
561 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
562 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
563 (unsigned)existing.dsize, srcnode));
565 talloc_free(tmp_ctx);
568 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing record unless the incoming one has a higher rsn,
   or an equal rsn while the existing dmaster is not the recovery
   master (i.e. the existing copy is not authoritative) */
570 if (!(header.rsn < hdr->rsn ||
571 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
576 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
577 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
578 talloc_free(tmp_ctx);
583 talloc_free(tmp_ctx);
589 pull all the remote database contents into the recdb
591 static int pull_remote_database(struct ctdb_context *ctdb,
592 struct ctdb_recoverd *rec,
593 struct ctdb_node_map *nodemap,
594 struct tdb_wrap *recdb, uint32_t dbid)
/* merge database "dbid" from every active node into the recovery tdb;
   a node that fails the pull is charged nodemap->num ban credits. */
598 /* pull all records from all other nodes across onto this node
599 (this merges based on rsn)
601 for (j=0; j<nodemap->num; j++) {
602 /* dont merge from nodes that are unavailable */
603 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
606 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
607 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
608 nodemap->nodes[j].pnn));
609 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
619 update flags on all active nodes
621 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
/* propagate node "pnn"'s flags cluster-wide: set "flags" and clear
   everything not in "flags" via the modflags control. */
625 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
627 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
635 ensure all nodes have the same vnnmap we do
637 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
638 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
/* push our vnn map to every active node so the whole cluster agrees
   on generation and hash->node mapping. */
642 /* push the new vnn map out to all the nodes */
643 for (j=0; j<nodemap->num; j++) {
644 /* dont push to nodes that are unavailable */
645 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
649 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
651 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* NOTE(review): the opening "struct vacuum_info {" line is not visible
   in this extract.  One entry per in-progress vacuum-fetch run, kept on
   the doubly-linked rec->vacuum_info list. */
661 struct vacuum_info *next, *prev;
662 struct ctdb_recoverd *rec;
664 struct ctdb_db_context *ctdb_db;
665 struct ctdb_marshall_buffer *recs;
/* cursor into recs->data: the next record to process */
666 struct ctdb_rec_data *r;
/* forward declaration: vacuum_fetch_next and vacuum_fetch_callback
   call each other. */
669 static void vacuum_fetch_next(struct vacuum_info *v);
672 called when a vacuum fetch has completed - just free it and do the next one
674 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
/* one vacuum fetch call completed: continue with the next record */
676 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
678 vacuum_fetch_next(v);
683 process the next element from the vacuum list
685 static void vacuum_fetch_next(struct vacuum_info *v)
/* walk v->recs, migrating each record to this node with a
   CTDB_NULL_FUNC call.  Records that cannot be chain-locked without
   blocking, cannot be read, are too short, or are already local are
   skipped.  Processing stops once a call is actually sent;
   vacuum_fetch_callback resumes the walk when it completes. */
687 struct ctdb_call call;
688 struct ctdb_rec_data *r;
690 while (v->recs->count) {
691 struct ctdb_client_call_state *state;
693 struct ctdb_ltdb_header *hdr;
696 call.call_id = CTDB_NULL_FUNC;
697 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance the cursor past the current record before issuing the call */
700 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
703 call.key.dptr = &r->data[0];
704 call.key.dsize = r->keylen;
706 /* ensure we don't block this daemon - just skip a record if we can't get
708 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
712 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
713 if (data.dptr == NULL) {
714 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
718 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
720 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
724 hdr = (struct ctdb_ltdb_header *)data.dptr;
725 if (hdr->dmaster == v->rec->ctdb->pnn) {
726 /* its already local */
728 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
734 state = ctdb_call_send(v->ctdb_db, &call);
735 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
737 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
/* resume from vacuum_fetch_callback when the call completes */
741 state->async.fn = vacuum_fetch_callback;
742 state->async.private_data = v;
751 destroy a vacuum info structure
753 static int vacuum_info_destructor(struct vacuum_info *v)
/* talloc destructor: unlink this entry from rec->vacuum_info */
755 DLIST_REMOVE(v->rec->vacuum_info, v);
761 handler for vacuum fetch
763 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
764 TDB_DATA data, void *private_data)
/* message handler for vacuum-fetch requests: the payload is a
   marshalled record buffer from another node.  Attach to the database,
   queue a vacuum_info entry and start migrating the records here.
   Requests for a (srcnode, db) pair already being processed are
   ignored. */
766 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
767 struct ctdb_marshall_buffer *recs;
769 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
771 struct ctdb_dbid_map *dbmap=NULL;
772 bool persistent = false;
773 struct ctdb_db_context *ctdb_db;
774 struct ctdb_rec_data *r;
776 struct vacuum_info *v;
778 recs = (struct ctdb_marshall_buffer *)data.dptr;
779 r = (struct ctdb_rec_data *)&recs->data[0];
781 if (recs->count == 0) {
782 talloc_free(tmp_ctx);
/* NOTE(review): srcnode is assigned on a line not visible in this
   extract — presumably from the first record; confirm upstream */
788 for (v=rec->vacuum_info;v;v=v->next) {
789 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
790 /* we're already working on records from this node */
791 talloc_free(tmp_ctx);
796 /* work out if the database is persistent */
797 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
799 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
800 talloc_free(tmp_ctx);
804 for (i=0;i<dbmap->num;i++) {
805 if (dbmap->dbs[i].dbid == recs->db_id) {
806 persistent = dbmap->dbs[i].persistent;
810 if (i == dbmap->num) {
811 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
812 talloc_free(tmp_ctx);
816 /* find the name of this database */
817 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
818 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
819 talloc_free(tmp_ctx);
824 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
825 if (ctdb_db == NULL) {
826 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
827 talloc_free(tmp_ctx);
/* queue the work: copy the record blob and process it asynchronously */
831 v = talloc_zero(rec, struct vacuum_info);
833 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
834 talloc_free(tmp_ctx);
839 v->srcnode = srcnode;
840 v->ctdb_db = ctdb_db;
841 v->recs = talloc_memdup(v, recs, data.dsize);
842 if (v->recs == NULL) {
843 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
845 talloc_free(tmp_ctx);
848 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
850 DLIST_ADD(rec->vacuum_info, v);
852 talloc_set_destructor(v, vacuum_info_destructor);
854 vacuum_fetch_next(v);
855 talloc_free(tmp_ctx);
860 called when ctdb_wait_timeout should finish
862 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
863 struct timeval yt, void *p)
/* timer fired: signal completion to ctdb_wait_timeout via the flag */
865 uint32_t *timed_out = (uint32_t *)p;
870 wait for a given number of seconds
872 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
/* block for "secs" seconds while still servicing the event loop */
874 uint32_t timed_out = 0;
875 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
877 event_loop_once(ctdb->ev);
882 called when an election times out (ends)
884 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
885 struct timeval t, void *p)
/* election window expired: clearing election_timeout lets
   ctdb_wait_election's loop terminate */
887 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
888 rec->election_timeout = NULL;
890 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
895 wait for an election to finish. It finished election_timeout seconds after
896 the last election packet is received
898 static void ctdb_wait_election(struct ctdb_recoverd *rec)
/* pump the event loop until the election timeout timer fires and
   ctdb_election_timeout clears rec->election_timeout */
900 struct ctdb_context *ctdb = rec->ctdb;
901 while (rec->election_timeout) {
902 event_loop_once(ctdb->ev);
907 Update our local flags from all remote connected nodes.
908 This is only run when we are or we believe we are the recovery master
910 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
/* compare our nodemap flags with each connected node's own view and
   adopt the remote values, pushing them cluster-wide via modflags.
   Returns a monitor_result code (MONITOR_FAILED on fetch errors). */
913 struct ctdb_context *ctdb = rec->ctdb;
914 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
916 /* get the nodemap for all active remote nodes and verify
917 they are the same as for this node
919 for (j=0; j<nodemap->num; j++) {
920 struct ctdb_node_map *remote_nodemap=NULL;
923 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
926 if (nodemap->nodes[j].pnn == ctdb->pnn) {
930 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
931 mem_ctx, &remote_nodemap);
933 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
934 nodemap->nodes[j].pnn));
935 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
936 talloc_free(mem_ctx);
937 return MONITOR_FAILED;
939 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
940 /* We should tell our daemon about this so it
941 updates its flags or else we will log the same
942 message again in the next iteration of recovery.
943 Since we are the recovery master we can just as
944 well update the flags on all nodes.
946 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
948 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
952 /* Update our local copy of the flags in the recovery
955 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
956 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
957 nodemap->nodes[j].flags));
958 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
960 talloc_free(remote_nodemap);
962 talloc_free(mem_ctx);
967 /* Create a new random generation ip.
968 The generation id can not be the INVALID_GENERATION id
970 static uint32_t new_generation(void)
/* draw random generation ids until one differs from
   INVALID_GENERATION; the loop/return lines are not all visible in
   this extract */
975 generation = random();
977 if (generation != INVALID_GENERATION) {
987 create a temporary working database
989 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
/* create the temporary recovery tdb (recdb.tdb) used to merge all
   remote database contents; returns NULL on failure. */
992 struct tdb_wrap *recdb;
995 /* open up the temporary recovery database */
996 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
/* no locking needed: only this daemon touches the file.  Without
   setsched we also avoid mmap. */
1002 tdb_flags = TDB_NOLOCK;
1003 if (!ctdb->do_setsched) {
1004 tdb_flags |= TDB_NOMMAP;
1007 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1008 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1009 if (recdb == NULL) {
1010 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1020 a traverse function for pulling all relevant records from recdb
/* NOTE(review): the "struct recdb_data {" opener and remaining members
   (len, failed) are not visible in this extract */
1023 struct ctdb_context *ctdb;
1024 struct ctdb_marshall_buffer *recdata;
1029 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
/* tdb_traverse_read callback: marshal each non-empty record into
   params->recdata with dmaster rewritten to this node.  Errors are
   reported through params->failed. */
1031 struct recdb_data *params = (struct recdb_data *)p;
1032 struct ctdb_rec_data *rec;
1033 struct ctdb_ltdb_header *hdr;
1035 /* skip empty records */
1036 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1040 /* update the dmaster field to point to us */
1041 hdr = (struct ctdb_ltdb_header *)data.dptr;
1042 hdr->dmaster = params->ctdb->pnn;
1044 /* add the record to the blob ready to send to the nodes */
1045 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1047 params->failed = true;
1050 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1051 if (params->recdata == NULL) {
1052 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1053 rec->length + params->len, params->recdata->count));
1054 params->failed = true;
/* append the marshalled record at the current end of the blob */
1057 params->recdata->count++;
1058 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1059 params->len += rec->length;
1066 push the recdb database out to all nodes
1068 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1069 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
/* marshal the whole recovery tdb into one buffer (via traverse_recdb)
   and push it to all active nodes with CTDB_CONTROL_PUSH_DB. */
1071 struct recdb_data params;
1072 struct ctdb_marshall_buffer *recdata;
1074 TALLOC_CTX *tmp_ctx;
1077 tmp_ctx = talloc_new(ctdb);
1078 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1080 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1081 CTDB_NO_MEMORY(ctdb, recdata);
1083 recdata->db_id = dbid;
1086 params.recdata = recdata;
/* the marshall buffer header comes before the record data */
1087 params.len = offsetof(struct ctdb_marshall_buffer, data);
1088 params.failed = false;
1090 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1091 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1092 talloc_free(params.recdata);
1093 talloc_free(tmp_ctx);
1097 if (params.failed) {
1098 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1099 talloc_free(params.recdata);
1100 talloc_free(tmp_ctx);
/* traverse_recdb may have reallocated the buffer — re-fetch it */
1104 recdata = params.recdata;
1106 outdata.dptr = (void *)recdata;
1107 outdata.dsize = params.len;
1109 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1110 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1112 CONTROL_TIMEOUT(), false, outdata,
1115 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1116 talloc_free(recdata);
1117 talloc_free(tmp_ctx);
1121 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1122 dbid, recdata->count));
1124 talloc_free(recdata);
1125 talloc_free(tmp_ctx);
1132 go through a full recovery on one database
1134 static int recover_database(struct ctdb_recoverd *rec,
1135 TALLOC_CTX *mem_ctx,
1138 struct ctdb_node_map *nodemap,
1139 uint32_t transaction_id)
/* full recovery of one database: pull every node's copy into a
   temporary recdb (merged by rsn), wipe the database cluster-wide
   inside the transaction, then push the merged contents back out. */
1141 struct tdb_wrap *recdb;
1143 struct ctdb_context *ctdb = rec->ctdb;
1145 struct ctdb_control_wipe_database w;
1148 recdb = create_recdb(ctdb, mem_ctx);
1149 if (recdb == NULL) {
1153 /* pull all remote databases onto the recdb */
1154 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
1156 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1160 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1162 /* wipe all the remote databases. This is safe as we are in a transaction */
1164 w.transaction_id = transaction_id;
1166 data.dptr = (void *)&w;
1167 data.dsize = sizeof(w);
1169 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1170 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1172 CONTROL_TIMEOUT(), false, data,
1175 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1180 /* push out the correct database. This sets the dmaster and skips
1181 the empty records */
1182 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1188 /* all done with this database */
1195 reload the nodes file
1197 static void reload_nodes_file(struct ctdb_context *ctdb)
/* re-read the nodes file into ctdb; presumably the old node list is
   cleared on a line not visible in this extract — confirm upstream */
1200 ctdb_load_nodes_file(ctdb);
1205 we are the recmaster, and recovery is needed - start a recovery run
1207 static int do_recovery(struct ctdb_recoverd *rec,
1208 TALLOC_CTX *mem_ctx, uint32_t pnn,
1209 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1211 struct ctdb_context *ctdb = rec->ctdb;
1213 uint32_t generation;
1214 struct ctdb_dbid_map *dbmap;
1217 struct timeval start_time;
1219 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1221 /* if recovery fails, force it again */
1222 rec->need_recovery = true;
1224 for (i=0; i<ctdb->num_nodes; i++) {
1225 struct ctdb_banning_state *ban_state;
1227 if (ctdb->nodes[i]->ban_state == NULL) {
1230 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1231 if (ban_state->count < 2*ctdb->num_nodes) {
1234 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1235 ctdb->nodes[i]->pnn, ban_state->count,
1236 ctdb->tunable.recovery_ban_period));
1237 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1238 ban_state->count = 0;
1242 if (ctdb->tunable.verify_recovery_lock != 0) {
1243 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1244 start_time = timeval_current();
1245 if (!ctdb_recovery_lock(ctdb, true)) {
1246 ctdb_set_culprit(rec, pnn);
1247 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1250 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1251 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1254 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1256 /* get a list of all databases */
1257 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1259 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1263 /* we do the db creation before we set the recovery mode, so the freeze happens
1264 on all databases we will be dealing with. */
1266 /* verify that we have all the databases any other node has */
1267 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1269 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1273 /* verify that all other nodes have all our databases */
1274 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1276 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1279 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1281 /* update the database priority for all remote databases */
1282 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1284 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1286 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1289 /* set recovery mode to active on all nodes */
1290 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1292 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1296 /* execute the "startrecovery" event script on all nodes */
1297 ret = run_startrecovery_eventscript(rec, nodemap);
1299 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1303 /* pick a new generation number */
1304 generation = new_generation();
1306 /* change the vnnmap on this node to use the new generation
1307 number but not on any other nodes.
1308 this guarantees that if we abort the recovery prematurely
1309 for some reason (a node stops responding?)
1310 that we can just return immediately and we will reenter
1311 recovery shortly again.
1312 I.e. we deliberately leave the cluster with an inconsistent
1313 generation id to allow us to abort recovery at any stage and
1314 just restart it from scratch.
1316 vnnmap->generation = generation;
1317 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1319 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1323 data.dptr = (void *)&generation;
1324 data.dsize = sizeof(uint32_t);
1326 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1327 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1329 CONTROL_TIMEOUT(), false, data,
1332 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1336 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1338 for (i=0;i<dbmap->num;i++) {
1339 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1340 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1345 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1347 /* commit all the changes */
1348 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1350 CONTROL_TIMEOUT(), false, data,
1353 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1357 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1360 /* update the capabilities for all nodes */
1361 ret = update_capabilities(ctdb, nodemap);
1363 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1367 /* build a new vnn map with all the currently active and
1369 generation = new_generation();
1370 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1371 CTDB_NO_MEMORY(ctdb, vnnmap);
1372 vnnmap->generation = generation;
1374 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1375 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1376 for (i=j=0;i<nodemap->num;i++) {
1377 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1380 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1381 /* this node can not be an lmaster */
1382 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1387 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1388 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1389 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1392 if (vnnmap->size == 0) {
1393 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1395 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1396 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1397 vnnmap->map[0] = pnn;
1400 /* update to the new vnnmap on all nodes */
1401 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1403 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1407 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1409 /* update recmaster to point to us for all nodes */
1410 ret = set_recovery_master(ctdb, nodemap, pnn);
1412 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1416 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1419 update all nodes to have the same flags that we have
1421 for (i=0;i<nodemap->num;i++) {
1422 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1426 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1428 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1433 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1435 /* disable recovery mode */
1436 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1438 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1442 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1445 tell nodes to takeover their public IPs
1447 rec->need_takeover_run = false;
1448 ret = ctdb_takeover_run(ctdb, nodemap);
1450 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1453 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1455 /* execute the "recovered" event script on all nodes */
1456 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1458 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1462 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1464 /* send a message to all clients telling them that the cluster
1465 has been reconfigured */
1466 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1468 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1470 rec->need_recovery = false;
1472 /* we managed to complete a full recovery, make sure to forgive
1473 any past sins by the nodes that could now participate in the
1476 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1477 for (i=0;i<nodemap->num;i++) {
1478 struct ctdb_banning_state *ban_state;
1480 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1484 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1485 if (ban_state == NULL) {
1489 ban_state->count = 0;
1493 /* We just finished a recovery successfully.
1494 We now wait for rerecovery_timeout before we allow
1495 another recovery to take place.
1497 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1498 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1499 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
1506 elections are won by first checking the number of connected nodes, then
1507 the priority time, then the pnn
/* Payload broadcast during a recmaster election.  Elections are decided by
 * comparing, in order: number of connected nodes, priority (start) time,
 * then pnn.  node_flags lets peers see our banned/stopped state. */
1509 struct election_message {
1510 	uint32_t num_connected;
1511 	struct timeval priority_time;
1513 	uint32_t node_flags;
1517   form this node's election data
/* Fill in *em with this node's election data: our pnn, our recoverd start
 * time, our node flags, and the number of nodes we can see as connected. */
1519 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1522 	struct ctdb_node_map *nodemap;
1523 	struct ctdb_context *ctdb = rec->ctdb;
1527 	em->pnn = rec->ctdb->pnn;
1528 	em->priority_time = rec->priority_time;
1530 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1532 		DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
	/* cache our own flags so ctdb_election_win() can test banned/stopped */
1536 	rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1537 	em->node_flags = rec->node_flags;
	/* count every node that is not disconnected, ourselves included */
1539 	for (i=0;i<nodemap->num;i++) {
1540 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1541 			em->num_connected++;
1545 	/* we shouldnt try to win this election if we cant be a recmaster */
1546 	if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
	/* make ourselves the weakest possible candidate */
1547 		em->num_connected = 0;
1548 		em->priority_time = timeval_current();
1551 	talloc_free(nodemap);
1555 see if the given election data wins
/* Decide whether WE beat the election data in *em.  Capability, banned and
 * stopped states are absolute disqualifiers/qualifiers; otherwise compare
 * connectedness, then longest uptime, then lowest pnn as tie-breakers. */
1557 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1559 	struct election_message myem;
1562 	ctdb_election_data(rec, &myem);
1564 	/* we cant win if we dont have the recmaster capability */
1565 	if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1569 	/* we cant win if we are banned */
1570 	if (rec->node_flags & NODE_FLAGS_BANNED) {
1574 	/* we cant win if we are stopped */
1575 	if (rec->node_flags & NODE_FLAGS_STOPPED) {
1579 	/* we will automatically win if the other node is banned */
1580 	if (em->node_flags & NODE_FLAGS_BANNED) {
1584 	/* we will automatically win if the other node is stopped */
1585 	if (em->node_flags & NODE_FLAGS_STOPPED) {
1589 	/* try to use the most connected node */
1591 	cmp = (int)myem.num_connected - (int)em->num_connected;
1594 	/* then the longest running node */
1596 		cmp = timeval_compare(&em->priority_time, &myem.priority_time);
	/* final tie-break: prefer the lower pnn */
1600 		cmp = (int)myem.pnn - (int)em->pnn;
1607 send out an election request
/* Broadcast our election data to all nodes on CTDB_SRVID_RECOVERY.
 * If update_recmaster is true we also optimistically set the recmaster on
 * the local daemon to ourselves, assuming we will win.
 * Returns 0 on success, negative on failure to set the recmaster. */
1609 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1612 	TDB_DATA election_data;
1613 	struct election_message emsg;
1615 	struct ctdb_context *ctdb = rec->ctdb;
1617 	srvid = CTDB_SRVID_RECOVERY;
1619 	ctdb_election_data(rec, &emsg);
1621 	election_data.dsize = sizeof(struct election_message);
1622 	election_data.dptr  = (unsigned char *)&emsg;
1625 	/* send an election message to all active nodes */
1626 	DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1627 	ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1630 	/* A new node that is already frozen has entered the cluster.
1631 	   The existing nodes are not frozen and dont need to be frozen
1632 	   until the election has ended and we start the actual recovery
1634 	if (update_recmaster == true) {
1635 		/* first we assume we will win the election and set
1636 		   recoverymaster to be ourself on the current node
1638 		ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1640 			DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1650 this function will unban all nodes in the cluster
/* Clear the BANNED flag on every node that is connected and banned.
 * Best-effort: errors from the per-node modflags control are not checked. */
1652 static void unban_all_nodes(struct ctdb_context *ctdb)
1655 	struct ctdb_node_map *nodemap;
1656 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1658 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1660 		DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1664 	for (i=0;i<nodemap->num;i++) {
1665 		if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1666 		&& (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
	/* clear (flags_off) only NODE_FLAGS_BANNED; set nothing */
1667 			ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1671 	talloc_free(tmp_ctx);
1676 we think we are winning the election - send a broadcast election request
/* Timed-event callback armed from election_handler(): re-broadcast our
 * election request (without touching the recmaster setting), then disarm
 * the one-shot timer. */
1678 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1680 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1683 	ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1685 		DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1688 	talloc_free(rec->send_election_te);
1689 	rec->send_election_te = NULL;
1693 handler for memory dumps
/* Message handler for memory-dump requests ("ctdb dumpmemory" against the
 * recovery daemon): collect a talloc memory report and send it back to the
 * pnn/srvid return address carried in the request. */
1695 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1696 			     TDB_DATA data, void *private_data)
1698 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1701 	struct rd_memdump_reply *rd;
	/* the payload must be exactly the caller's return address */
1703 	if (data.dsize != sizeof(struct rd_memdump_reply)) {
1704 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1705 		talloc_free(tmp_ctx);
1708 	rd = (struct rd_memdump_reply *)data.dptr;
1710 	dump = talloc_zero(tmp_ctx, TDB_DATA);
1712 		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1713 		talloc_free(tmp_ctx);
1716 	ret = ctdb_dump_memory(ctdb, dump);
1718 		DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1719 		talloc_free(tmp_ctx);
1723 	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
	/* reply to the requester at the address it supplied */
1725 	ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1727 		DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1728 		talloc_free(tmp_ctx);
1732 	talloc_free(tmp_ctx);
1736 handler for reload_nodes
/* Message handler for "ctdb reloadnodes": re-read the nodes file on the
 * local daemon.  srvid and the message payload are unused. */
1738 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1739 			     TDB_DATA data, void *private_data)
1741 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1743 	DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1745 	reload_nodes_file(rec->ctdb);
/* Timed-event callback: the "disable ip check" window has expired, so drop
 * the context that marks ip verification as disabled (freeing it also
 * cancels this very timer, which is parented on that context). */
1749 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1750 		struct timeval yt, void *p)
1752 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1754 	talloc_free(rec->ip_check_disable_ctx);
1755 	rec->ip_check_disable_ctx = NULL;
1758 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1759 TDB_DATA data, void *private_data)
1761 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1764 if (rec->ip_check_disable_ctx != NULL) {
1765 talloc_free(rec->ip_check_disable_ctx);
1766 rec->ip_check_disable_ctx = NULL;
1769 if (data.dsize != sizeof(uint32_t)) {
1770 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu expexting %lu\n", data.dsize, sizeof(uint32_t)));
1773 if (data.dptr == NULL) {
1774 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1778 timeout = *((uint32_t *)data.dptr);
1779 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
1781 rec->ip_check_disable_ctx = talloc_new(rec);
1782 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
1784 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1789 handler for ip reallocate, just add it to the list of callers and
1790 handle this later in the monitor_cluster loop so we do not recurse
1791 with other callers to takeover_run()
1793 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1794 TDB_DATA data, void *private_data)
1796 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1797 struct ip_reallocate_list *caller;
1799 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1800 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1804 if (rec->ip_reallocate_ctx == NULL) {
1805 rec->ip_reallocate_ctx = talloc_new(rec);
1806 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1809 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1810 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1812 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1813 caller->next = rec->reallocate_callers;
1814 rec->reallocate_callers = caller;
/* Run one ip takeover pass and reply to every queued "ctdb ipreallocate"
 * caller with the (int32_t) result, then drop the whole queue by freeing
 * its talloc context. */
1819 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
	/* NOTE(review): tmp_ctx appears unused between creation and free in
	   the visible code — possibly left over; confirm before removing */
1821 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1824 	struct ip_reallocate_list *callers;
1826 	DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1827 	ret = ctdb_takeover_run(ctdb, rec->nodemap);
1828 	result.dsize = sizeof(int32_t);
1829 	result.dptr  = (uint8_t *)&ret;
	/* send the takeover result back to each queued caller */
1831 	for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1832 		DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1833 		ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1835 			DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1839 	talloc_free(tmp_ctx);
	/* freeing the context releases every queued ip_reallocate_list entry */
1840 	talloc_free(rec->ip_reallocate_ctx);
1841 	rec->ip_reallocate_ctx = NULL;
1842 	rec->reallocate_callers = NULL;
1848 handler for recovery master elections
/* Message handler for recmaster election packets.  Restarts the election
 * timeout, and either counters the sender (if we would win) or concedes:
 * releasing the recovery lock if we held it and recording the sender as the
 * new recmaster on the local daemon. */
1850 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1851 			     TDB_DATA data, void *private_data)
1853 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1855 	struct election_message *em = (struct election_message *)data.dptr;
1856 	TALLOC_CTX *mem_ctx;
1858 	/* we got an election packet - update the timeout for the election */
1859 	talloc_free(rec->election_timeout);
1860 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1861 						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1862 						ctdb_election_timeout, rec);
1864 	mem_ctx = talloc_new(ctdb);
1866 	/* someone called an election. check their election data
1867 	   and if we disagree and we would rather be the elected node,
1868 	   send a new election message to all other nodes
1870 	if (ctdb_election_win(rec, em)) {
	/* schedule a (slightly delayed) counter-broadcast unless one is
	   already pending */
1871 		if (!rec->send_election_te) {
1872 			rec->send_election_te = event_add_timed(ctdb->ev, rec,
1873 								timeval_current_ofs(0, 500000),
1874 								election_send_request, rec);
1876 		talloc_free(mem_ctx);
1877 		/*unban_all_nodes(ctdb);*/
	/* we lost: cancel any pending counter-broadcast */
1882 	talloc_free(rec->send_election_te);
1883 	rec->send_election_te = NULL;
1885 	if (ctdb->tunable.verify_recovery_lock != 0) {
1886 		/* release the recmaster lock */
1887 		if (em->pnn != ctdb->pnn &&
1888 		    ctdb->recovery_lock_fd != -1) {
1889 			close(ctdb->recovery_lock_fd);
1890 			ctdb->recovery_lock_fd = -1;
1891 			unban_all_nodes(ctdb);
1895 	/* ok, let that guy become recmaster then */
1896 	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1898 		DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1899 		talloc_free(mem_ctx);
1903 	talloc_free(mem_ctx);
1909 force the start of the election process
/* Force a new recmaster election: put the whole cluster into recovery mode
 * to stop internode traffic, arm the election timeout, broadcast our
 * election request (optimistically claiming recmaster locally), then block
 * until the election window has passed. */
1911 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1912 			  struct ctdb_node_map *nodemap)
1915 	struct ctdb_context *ctdb = rec->ctdb;
1917 	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1919 	/* set all nodes to recovery mode to stop all internode traffic */
1920 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1922 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1926 	talloc_free(rec->election_timeout);
1927 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1928 						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1929 						ctdb_election_timeout, rec);
1931 	ret = send_election_request(rec, pnn, true);
1933 		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1937 	/* wait for a few seconds to collect all responses */
1938 	ctdb_wait_election(rec);
1944 handler for when a node changes its flags
/* Message handler for node flag-change notifications.  Reconciles the
 * notified flags against a freshly fetched local nodemap and, when we are
 * the recmaster in normal mode, schedules a takeover run if the DISABLED
 * state of the node changed (disconnect/ban failovers are handled by the
 * recovery path instead). */
1946 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1947 			    TDB_DATA data, void *private_data)
1950 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1951 	struct ctdb_node_map *nodemap=NULL;
1952 	TALLOC_CTX *tmp_ctx;
1953 	uint32_t changed_flags;
1955 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1956 	int disabled_flag_changed;
1958 	if (data.dsize != sizeof(*c)) {
1959 		DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1963 	tmp_ctx = talloc_new(ctdb);
1964 	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1966 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1968 		DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1969 		talloc_free(tmp_ctx);
	/* locate the nodemap entry for the node whose flags changed */
1974 	for (i=0;i<nodemap->num;i++) {
1975 		if (nodemap->nodes[i].pnn == c->pnn) break;
1978 	if (i == nodemap->num) {
1979 		DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1980 		talloc_free(tmp_ctx);
1984 	changed_flags = c->old_flags ^ c->new_flags;
1986 	if (nodemap->nodes[i].flags != c->new_flags) {
1987 		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
	/* did the (permanently) DISABLED bit change relative to our view? */
1990 	disabled_flag_changed =  (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
1992 	nodemap->nodes[i].flags = c->new_flags;
1994 	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1995 				     CTDB_CURRENT_NODE, &ctdb->recovery_master);
1998 		ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1999 					   CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2003 	    ctdb->recovery_master == ctdb->pnn &&
2004 	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2005 		/* Only do the takeover run if the perm disabled or unhealthy
2006 		   flags changed since these will cause an ip failover but not
2008 		   If the node became disconnected or banned this will also
2009 		   lead to an ip address failover but that is handled
2012 		if (disabled_flag_changed) {
2013 			rec->need_takeover_run = true;
2017 	talloc_free(tmp_ctx);
2021   handler for when we need to push out flag changes to all other nodes
/* Message handler: propagate one node's flag change to all remote nodes by
 * setting its new flags (and clearing every other flag bit) cluster-wide. */
2023 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2024 			     TDB_DATA data, void *private_data)
2027 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	/* flags_on = new_flags, flags_off = everything not in new_flags */
2029 	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
2031 		DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Shared state for the async "is every node in NORMAL recmode" poll:
 * status accumulates the worst result seen across all callbacks. */
2036 struct verify_recmode_normal_data {
2038 	enum monitor_result status;
/* Completion callback for one async getrecmode control sent by
 * verify_recmode(): downgrade the aggregate status on transport failure or
 * if the node reports it is not in NORMAL recovery mode. */
2041 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2043 	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2046 	/* one more node has responded with recmode data*/
2049 	/* if we failed to get the recmode, then return an error and let
2050 	   the main loop try again.
2052 	if (state->state != CTDB_CONTROL_DONE) {
	/* only record the first failure; never upgrade a worse status */
2053 		if (rmdata->status == MONITOR_OK) {
2054 			rmdata->status = MONITOR_FAILED;
2059 	/* if we got a response, then the recmode will be stored in the
2062 	if (state->status != CTDB_RECOVERY_NORMAL) {
2063 		DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2064 		rmdata->status = MONITOR_RECOVERY_NEEDED;
2071 /* verify that all nodes are in normal recovery mode */
/* verify that all nodes are in normal recovery mode */
/* Sends an async getrecmode control to every active node, pumps the event
 * loop until all replies (or failures) are in, and returns the aggregated
 * monitor_result (OK / FAILED / RECOVERY_NEEDED). */
2072 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2074 	struct verify_recmode_normal_data *rmdata;
2075 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2076 	struct ctdb_client_control_state *state;
2077 	enum monitor_result status;
2080 	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2081 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2083 	rmdata->status  = MONITOR_OK;
2085 	/* loop over all active nodes and send an async getrecmode call to
2087 	for (j=0; j<nodemap->num; j++) {
	/* skip nodes that are disconnected/banned/stopped */
2088 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2091 		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2093 					nodemap->nodes[j].pnn);
2094 		if (state == NULL) {
2095 			/* we failed to send the control, treat this as
2096 			   an error and try again next iteration
2098 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2099 			talloc_free(mem_ctx);
2100 			return MONITOR_FAILED;
2103 		/* set up the callback functions */
2104 		state->async.fn = verify_recmode_normal_callback;
2105 		state->async.private_data = rmdata;
2107 		/* one more control to wait for to complete */
2112 	/* now wait for up to the maximum number of seconds allowed
2113 	   or until all nodes we expect a response from has replied
2115 	while (rmdata->count > 0) {
2116 		event_loop_once(ctdb->ev);
	/* copy the result out before freeing the context that owns rmdata */
2119 	status = rmdata->status;
2120 	talloc_free(mem_ctx);
/* Shared state for the async "does everyone agree we are recmaster" poll;
 * pnn is the recmaster value each node is expected to report. */
2125 struct verify_recmaster_data {
2126 	struct ctdb_recoverd *rec;
2129 	enum monitor_result status;
/* Completion callback for one async getrecmaster control sent by
 * verify_recmaster(): flag a transport failure, or demand a new election
 * (and mark the disagreeing node as culprit) if the node reports a
 * different recmaster than expected. */
2132 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2134 	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2137 	/* one more node has responded with recmaster data*/
2140 	/* if we failed to get the recmaster, then return an error and let
2141 	   the main loop try again.
2143 	if (state->state != CTDB_CONTROL_DONE) {
	/* only record the first failure; never upgrade a worse status */
2144 		if (rmdata->status == MONITOR_OK) {
2145 			rmdata->status = MONITOR_FAILED;
2150 	/* if we got a response, then the recmaster will be stored in the
2153 	if (state->status != rmdata->pnn) {
2154 		DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2155 		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2156 		rmdata->status = MONITOR_ELECTION_NEEDED;
2163 /* verify that all nodes agree that we are the recmaster */
/* verify that all nodes agree that we are the recmaster */
/* Sends an async getrecmaster control to every active node, pumps the
 * event loop until all replies are in, and returns the aggregated
 * monitor_result (OK / FAILED / ELECTION_NEEDED). */
2164 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2166 	struct ctdb_context *ctdb = rec->ctdb;
2167 	struct verify_recmaster_data *rmdata;
2168 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2169 	struct ctdb_client_control_state *state;
2170 	enum monitor_result status;
2173 	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2174 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2178 	rmdata->status  = MONITOR_OK;
2180 	/* loop over all active nodes and send an async getrecmaster call to
2182 	for (j=0; j<nodemap->num; j++) {
	/* skip nodes that are disconnected/banned/stopped */
2183 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2186 		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2188 					nodemap->nodes[j].pnn);
2189 		if (state == NULL) {
2190 			/* we failed to send the control, treat this as
2191 			   an error and try again next iteration
2193 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2194 			talloc_free(mem_ctx);
2195 			return MONITOR_FAILED;
2198 		/* set up the callback functions */
2199 		state->async.fn = verify_recmaster_callback;
2200 		state->async.private_data = rmdata;
2202 		/* one more control to wait for to complete */
2207 	/* now wait for up to the maximum number of seconds allowed
2208 	   or until all nodes we expect a response from has replied
2210 	while (rmdata->count > 0) {
2211 		event_loop_once(ctdb->ev);
	/* copy the result out before freeing the context that owns rmdata */
2214 	status = rmdata->status;
2215 	talloc_free(mem_ctx);
2220 /* called to check that the allocation of public ip addresses is ok.
/* called to check that the allocation of public ip addresses is ok.
 *
 * Compares the local daemon's view of which public ips this node should
 * hold against what the kernel actually serves.  Uptime is sampled before
 * and after reading the ip list; if a recovery started/finished in between
 * (or is in progress) the check is skipped, since ips are in flux then.
 * On a mismatch the node freezes itself and sets recmode ACTIVE so the
 * recmaster performs a full recovery.
 */
2222 static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
2224 	TALLOC_CTX *mem_ctx = talloc_new(NULL);
2225 	struct ctdb_all_public_ips *ips = NULL;
2226 	struct ctdb_uptime *uptime1 = NULL;
2227 	struct ctdb_uptime *uptime2 = NULL;
2230 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2231 				CTDB_CURRENT_NODE, &uptime1);
2233 		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2234 		talloc_free(mem_ctx);
2238 	/* read the ip allocation from the local node */
2239 	ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2241 		DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2242 		talloc_free(mem_ctx);
	/* second uptime sample: detect a recovery racing with the read above */
2246 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2247 				CTDB_CURRENT_NODE, &uptime2);
2249 		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2250 		talloc_free(mem_ctx);
2254 	/* skip the check if the startrecovery time has changed */
2255 	if (timeval_compare(&uptime1->last_recovery_started,
2256 			    &uptime2->last_recovery_started) != 0) {
2257 		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2258 		talloc_free(mem_ctx);
2262 	/* skip the check if the endrecovery time has changed */
2263 	if (timeval_compare(&uptime1->last_recovery_finished,
2264 			    &uptime2->last_recovery_finished) != 0) {
2265 		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2266 		talloc_free(mem_ctx);
2270 	/* skip the check if we have started but not finished recovery */
2271 	if (timeval_compare(&uptime1->last_recovery_finished,
2272 			    &uptime1->last_recovery_started) != 1) {
2273 		DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
2274 		talloc_free(mem_ctx);
2279 	/* verify that we have the ip addresses we should have
2280 	   and we dont have ones we shouldnt have.
2281 	   if we find an inconsistency we set recmode to
2282 	   active on the local node and wait for the recmaster
2283 	   to do a full blown recovery
2285 	for (j=0; j<ips->num; j++) {
	/* case 1: this ip is assigned to us but the interface lacks it */
2286 		if (ips->ips[j].pnn == pnn) {
2287 			if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2288 				DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2289 					ctdb_addr_to_str(&ips->ips[j].addr)));
2290 				ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2292 					DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2294 					talloc_free(mem_ctx);
2297 				ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2299 					DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2301 					talloc_free(mem_ctx);
	/* case 2: the ip belongs to another node yet we still serve it */
2306 			if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2307 				DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2308 					ctdb_addr_to_str(&ips->ips[j].addr)));
2310 				ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2312 					DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2314 					talloc_free(mem_ctx);
2317 				ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2319 					DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2321 					talloc_free(mem_ctx);
2328 	talloc_free(mem_ctx);
/* Per-node callback for the async GET_NODEMAP broadcast: validate the
 * responding pnn and steal the returned nodemap into the results array
 * (indexed by pnn, owned by the array's talloc parent). */
2333 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2335 	struct ctdb_node_map **remote_nodemaps = callback_data;
2337 	if (node_pnn >= ctdb->num_nodes) {
2338 		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2342 	remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Fetch every active node's view of the nodemap in parallel via an async
 * GET_NODEMAP control; results land in remote_nodemaps[] through
 * async_getnodemap_callback().  Returns 0 on success, -1 on any failure. */
2346 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2347 	struct ctdb_node_map *nodemap,
2348 	struct ctdb_node_map **remote_nodemaps)
2352 	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2353 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2355 					CONTROL_TIMEOUT(), false, tdb_null,
2356 					async_getnodemap_callback,
2358 					remote_nodemaps) != 0) {
2359 		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* State for the forked recovery-lock checker: the child tests the reclock
 * file and reports one status byte back over a pipe; te guards against the
 * child hanging on a stuck cluster filesystem. */
2367 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
2368 struct ctdb_check_reclock_state {
2369 	struct ctdb_context *ctdb;
2370 	struct timeval start_time;
2373 	struct timed_event *te;
2374 	struct fd_event *fde;
2375 	enum reclock_child_status status;
2378 /* when we free the reclock state we must kill any child process.
/* when we free the reclock state we must kill any child process.
 * Also reports the elapsed reclock-check latency to the daemon and closes
 * both ends of the status pipe. */
2380 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2382 	struct ctdb_context *ctdb = state->ctdb;
2384 	ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2386 	if (state->fd[0] != -1) {
2387 		close(state->fd[0]);
2390 	if (state->fd[1] != -1) {
2391 		close(state->fd[1]);
2394 	kill(state->child, SIGKILL);
2399 called if our check_reclock child times out. this would happen if
2400 i/o to the reclock file blocks.
/* Timed-event callback: the reclock child did not answer in time (i/o to
 * the reclock file is blocking); record RECLOCK_TIMEOUT so the waiting
 * loop in check_recovery_lock() can give up. */
2402 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2403 					   struct timeval t, void *private_data)
2405 	struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2406 					   struct ctdb_check_reclock_state);
2408 	DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2409 	state->status = RECLOCK_TIMEOUT;
2412 /* this is called when the child process has completed checking the reclock
2413 file and has written data back to us through the pipe.
/* this is called when the child process has completed checking the reclock
   file and has written data back to us through the pipe.
   Reads the single status byte and records RECLOCK_OK or RECLOCK_FAILED;
   the pending timeout event is cancelled first. */
2415 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2416 			     uint16_t flags, void *private_data)
2418 	struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2419 					     struct ctdb_check_reclock_state);
2423 	/* we got a response from our child process so we can abort the
2426 	talloc_free(state->te);
2429 	ret = read(state->fd[0], &c, 1);
2430 	if (ret != 1 || c != RECLOCK_OK) {
2431 		DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2432 		state->status = RECLOCK_FAILED;
2437 	state->status = RECLOCK_OK;
/* Verify that we still hold the recovery lock: fork a child that pread()s
 * the locked fd and reports one status byte over a pipe, then pump the
 * event loop until the child answers or a 15s timeout fires.  On failure
 * the local recovery_lock_fd is closed so a new lock attempt can be made.
 * Returns 0/-1 style status (exact values depend on elided lines). */
2441 static int check_recovery_lock(struct ctdb_context *ctdb)
2444 	struct ctdb_check_reclock_state *state;
2445 	pid_t parent = getpid();
2447 	if (ctdb->recovery_lock_fd == -1) {
2448 		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2452 	state = talloc(ctdb, struct ctdb_check_reclock_state);
2453 	CTDB_NO_MEMORY(ctdb, state);
2456 	state->start_time = timeval_current();
2457 	state->status = RECLOCK_CHECKING;
2461 	ret = pipe(state->fd);
2464 		DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2468 	state->child = fork();
2469 	if (state->child == (pid_t)-1) {
2470 		DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2471 		close(state->fd[0]);
2473 		close(state->fd[1]);
	/* ---- child process: test the lock and report over the pipe ---- */
2479 	if (state->child == 0) {
2480 		char cc = RECLOCK_OK;
2481 		close(state->fd[0]);
	/* reading through the locked fd proves the CFS still grants us i/o */
2484 		if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2485 			DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2486 			cc = RECLOCK_FAILED;
	/* NOTE(review): write() return values are ignored here; the parent
	   detects a dead/blocked child via the timeout instead */
2489 		write(state->fd[1], &cc, 1);
2490 		/* make sure we die when our parent dies */
2491 		while (kill(parent, 0) == 0 || errno != ESRCH) {
2493 			write(state->fd[1], &cc, 1);
	/* ---- parent process continues here ---- */
2497 	close(state->fd[1]);
	/* destructor kills the child and closes the pipe on any exit path */
2500 	talloc_set_destructor(state, check_reclock_destructor);
2502 	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2503 				    ctdb_check_reclock_timeout, state);
2504 	if (state->te == NULL) {
2505 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2510 	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2511 				EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2512 				reclock_child_handler,
2515 	if (state->fde == NULL) {
2516 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
	/* block until the child replies or the timeout fires */
2521 	while (state->status == RECLOCK_CHECKING) {
2522 		event_loop_once(ctdb->ev);
2525 	if (state->status == RECLOCK_FAILED) {
2526 		DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2527 		close(ctdb->recovery_lock_fd);
2528 		ctdb->recovery_lock_fd = -1;
/* Fetch the current reclock file setting from the main ctdbd and bring the
 * recovery daemon's cached copy (ctdb->recovery_lock_file / _fd) in sync.
 * Handles three cases: reclock disabled by the daemon, reclock newly
 * enabled, and reclock path changed. Whenever the cached fd no longer
 * matches the configured file it is closed so the lock is re-acquired. */
2537 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2539 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2540 const char *reclockfile;
2542 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2543 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2544 talloc_free(tmp_ctx);
/* daemon reports no reclock file: disable verification locally */
2548 if (reclockfile == NULL) {
2549 if (ctdb->recovery_lock_file != NULL) {
2550 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2551 talloc_free(ctdb->recovery_lock_file);
2552 ctdb->recovery_lock_file = NULL;
2553 if (ctdb->recovery_lock_fd != -1) {
2554 close(ctdb->recovery_lock_fd);
2555 ctdb->recovery_lock_fd = -1;
2558 ctdb->tunable.verify_recovery_lock = 0;
2559 talloc_free(tmp_ctx);
/* we had no cached reclock file: adopt the daemon's setting */
2563 if (ctdb->recovery_lock_file == NULL) {
2564 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2565 if (ctdb->recovery_lock_fd != -1) {
2566 close(ctdb->recovery_lock_fd);
2567 ctdb->recovery_lock_fd = -1;
2569 talloc_free(tmp_ctx);
/* unchanged path: nothing to do */
2574 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2575 talloc_free(tmp_ctx);
/* the reclock file has changed: replace the cached name and close the
 * fd for the old file.
 * NOTE(review): verify_recovery_lock is set to 0 here as well — presumably
 * it is re-enabled elsewhere once the new lock is taken; confirm. */
2579 talloc_free(ctdb->recovery_lock_file);
2580 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2581 ctdb->tunable.verify_recovery_lock = 0;
2582 if (ctdb->recovery_lock_fd != -1) {
2583 close(ctdb->recovery_lock_fd);
2584 ctdb->recovery_lock_fd = -1;
2587 talloc_free(tmp_ctx);
2592 the main monitoring loop
/* Runs forever in the recovery daemon: once per recover_interval it
 * re-reads cluster state (pnn, vnnmap, nodemap, recmaster), decides whether
 * an election or a database recovery is needed, and — when this node is the
 * recovery master — verifies that all remote nodes agree on nodemap, flags
 * and vnnmap, triggering do_recovery()/force_election() as required. */
2594 static void monitor_cluster(struct ctdb_context *ctdb)
2597 TALLOC_CTX *mem_ctx=NULL;
2598 struct ctdb_node_map *nodemap=NULL;
2599 struct ctdb_node_map *recmaster_nodemap=NULL;
2600 struct ctdb_node_map **remote_nodemaps=NULL;
2601 struct ctdb_vnn_map *vnnmap=NULL;
2602 struct ctdb_vnn_map *remote_vnnmap=NULL;
2603 int32_t debug_level;
2605 struct ctdb_recoverd *rec;
2607 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2609 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2610 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2614 rec->priority_time = timeval_current();
2616 /* register a message port for sending memory dumps */
2617 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2619 /* register a message port for recovery elections */
2620 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2622 /* when nodes are disabled/enabled */
2623 ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2625 /* when we are asked to push out a flag change */
2626 ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2628 /* register a message port for vacuum fetch */
2629 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2631 /* register a message port for reloadnodes */
2632 ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2634 /* register a message port for performing a takeover run */
2635 ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2637 /* register a message port for disabling the ip check for a short while */
2638 ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
/* top of the monitoring loop: throw away last iteration's scratch
 * context and allocate a fresh one */
2642 talloc_free(mem_ctx);
2645 mem_ctx = talloc_new(ctdb);
2647 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2651 /* we only check for recovery once every second */
2652 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2654 /* verify that the main daemon is still running */
2655 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2656 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2660 /* ping the local daemon to tell it we are alive */
2661 ctdb_ctrl_recd_ping(ctdb);
2663 if (rec->election_timeout) {
2664 /* an election is in progress */
2668 /* read the debug level from the parent and update locally */
2669 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2671 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2674 LogLevel = debug_level;
2677 /* We must check if we need to ban a node here but we want to do this
2678 as early as possible so we dont wait until we have pulled the node
2679 map from the local node. thats why we have the hardcoded value 20
2681 for (i=0; i<ctdb->num_nodes; i++) {
2682 struct ctdb_banning_state *ban_state;
2684 if (ctdb->nodes[i]->ban_state == NULL) {
2687 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* only ban a node once it has caused 20 or more recent recoveries */
2688 if (ban_state->count < 20) {
2691 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2692 ctdb->nodes[i]->pnn, ban_state->count,
2693 ctdb->tunable.recovery_ban_period));
2694 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2695 ban_state->count = 0;
2698 /* get relevant tunables */
2699 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2701 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2705 /* get the current recovery lock file from the server */
2706 if (update_recovery_lock_file(ctdb) != 0) {
2707 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2711 /* Make sure that if recovery lock verification becomes disabled when
2714 if (ctdb->tunable.verify_recovery_lock == 0) {
2715 if (ctdb->recovery_lock_fd != -1) {
2716 close(ctdb->recovery_lock_fd);
2717 ctdb->recovery_lock_fd = -1;
2721 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2722 if (pnn == (uint32_t)-1) {
2723 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2727 /* get the vnnmap */
2728 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2730 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2735 /* get number of nodes */
2737 talloc_free(rec->nodemap);
2738 rec->nodemap = NULL;
2741 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2743 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2746 nodemap = rec->nodemap;
2748 /* check which node is the recovery master */
2749 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2751 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2755 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2756 if (rec->recmaster != pnn) {
2757 if (rec->ip_reallocate_ctx != NULL) {
2758 talloc_free(rec->ip_reallocate_ctx);
2759 rec->ip_reallocate_ctx = NULL;
2760 rec->reallocate_callers = NULL;
2763 /* if there are takeovers requested, perform it and notify the waiters */
2764 if (rec->reallocate_callers) {
2765 process_ipreallocate_requests(ctdb, rec);
/* no recmaster known yet — this is the initial state, so force an
 * election to establish one */
2768 if (rec->recmaster == (uint32_t)-1) {
2769 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2770 force_election(rec, pnn, nodemap);
2775 /* if the local daemon is STOPPED, we verify that the databases are
2776 also frozen and that the recmode is set to active
2778 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2779 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2781 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2783 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2784 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
2786 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2788 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
2791 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2793 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
2800 /* If the local node is stopped, verify we are not the recmaster
2801 and yield this role if so
2803 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
2804 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
2805 force_election(rec, pnn, nodemap);
2809 /* check that we (recovery daemon) and the local ctdb daemon
2810 agrees on whether we are banned or not
2814 /* remember our own node flags */
2815 rec->node_flags = nodemap->nodes[pnn].flags;
2817 /* count how many active nodes there are */
2818 rec->num_active = 0;
2819 rec->num_connected = 0;
2820 for (i=0; i<nodemap->num; i++) {
2821 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2824 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2825 rec->num_connected++;
2830 /* verify that the recmaster node is still active */
2831 for (j=0; j<nodemap->num; j++) {
2832 if (nodemap->nodes[j].pnn==rec->recmaster) {
/* recmaster pnn not present in our nodemap at all */
2837 if (j == nodemap->num) {
2838 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2839 force_election(rec, pnn, nodemap);
2843 /* if recovery master is disconnected we must elect a new recmaster */
2844 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2845 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2846 force_election(rec, pnn, nodemap);
2850 /* grab the nodemap from the recovery master to check if it is banned */
2851 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2852 mem_ctx, &recmaster_nodemap);
2854 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2855 nodemap->nodes[j].pnn));
2860 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2861 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2862 force_election(rec, pnn, nodemap);
2867 /* verify that we have all ip addresses we should have and we dont
2868 * have addresses we shouldnt have.
2870 if (ctdb->do_checkpublicip) {
/* ip_check_disable_ctx is non-NULL while the IP check is temporarily
 * disabled via CTDB_SRVID_DISABLE_IP_CHECK */
2871 if (rec->ip_check_disable_ctx == NULL) {
2872 if (verify_ip_allocation(ctdb, pnn) != 0) {
2873 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2880 /* if we are not the recmaster then we do not need to check
2881 if recovery is needed
2883 if (pnn != rec->recmaster) {
/* ------ everything below runs only on the recovery master ------ */
2888 /* ensure our local copies of flags are right */
2889 ret = update_local_flags(rec, nodemap);
2890 if (ret == MONITOR_ELECTION_NEEDED) {
2891 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2892 force_election(rec, pnn, nodemap);
2895 if (ret != MONITOR_OK) {
2896 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2900 /* update the list of public ips that a node can handle for
2903 if (ctdb->num_nodes != nodemap->num) {
2904 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2905 reload_nodes_file(ctdb);
2908 for (j=0; j<nodemap->num; j++) {
2909 /* release any existing data */
2910 if (ctdb->nodes[j]->public_ips) {
2911 talloc_free(ctdb->nodes[j]->public_ips);
2912 ctdb->nodes[j]->public_ips = NULL;
/* skip inactive nodes — they cannot host public IPs */
2915 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2919 /* grab a new shiny list of public ips from the node */
2920 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2921 ctdb->nodes[j]->pnn,
2923 &ctdb->nodes[j]->public_ips)) {
2924 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
2925 ctdb->nodes[j]->pnn));
2931 /* verify that all active nodes agree that we are the recmaster */
2932 switch (verify_recmaster(rec, nodemap, pnn)) {
2933 case MONITOR_RECOVERY_NEEDED:
2934 /* can not happen */
2936 case MONITOR_ELECTION_NEEDED:
2937 force_election(rec, pnn, nodemap);
2941 case MONITOR_FAILED:
2946 if (rec->need_recovery) {
2947 /* a previous recovery didn't finish */
2948 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2952 /* verify that all active nodes are in normal mode
2953 and not in recovery mode
2955 switch (verify_recmode(ctdb, nodemap)) {
2956 case MONITOR_RECOVERY_NEEDED:
2957 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2959 case MONITOR_FAILED:
2961 case MONITOR_ELECTION_NEEDED:
2962 /* can not happen */
2968 if (ctdb->tunable.verify_recovery_lock != 0) {
2969 /* we should have the reclock - check its not stale */
2970 ret = check_recovery_lock(ctdb);
2972 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
2973 ctdb_set_culprit(rec, ctdb->pnn);
2974 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2979 /* get the nodemap for all active remote nodes
2981 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
2982 if (remote_nodemaps == NULL) {
2983 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2986 for(i=0; i<nodemap->num; i++) {
2987 remote_nodemaps[i] = NULL;
2989 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2990 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2994 /* verify that all other nodes have the same nodemap as we have
2996 for (j=0; j<nodemap->num; j++) {
2997 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3001 if (remote_nodemaps[j] == NULL) {
3002 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3003 ctdb_set_culprit(rec, j);
3008 /* if the nodes disagree on how many nodes there are
3009 then this is a good reason to try recovery
3011 if (remote_nodemaps[j]->num != nodemap->num) {
3012 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3013 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3014 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3015 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3019 /* if the nodes disagree on which nodes exist and are
3020 active, then that is also a good reason to do recovery
3022 for (i=0;i<nodemap->num;i++) {
3023 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3024 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3025 nodemap->nodes[j].pnn, i,
3026 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3027 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3028 do_recovery(rec, mem_ctx, pnn, nodemap,
3034 /* verify the flags are consistent
3036 for (i=0; i<nodemap->num; i++) {
3037 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3041 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3042 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3043 nodemap->nodes[j].pnn,
3044 nodemap->nodes[i].pnn,
3045 remote_nodemaps[j]->nodes[i].flags,
3046 nodemap->nodes[j].flags));
/* NOTE(review): when the disagreement is about node j's OWN flags, the
 * remote node is authoritative for itself; otherwise the local
 * recmaster view wins — confirm against the elided condition */
3048 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3049 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3050 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3051 do_recovery(rec, mem_ctx, pnn, nodemap,
3055 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3056 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3057 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3058 do_recovery(rec, mem_ctx, pnn, nodemap,
3067 /* there better be the same number of lmasters in the vnn map
3068 as there are active nodes or we will have to do a recovery
3070 if (vnnmap->size != rec->num_active) {
3071 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3072 vnnmap->size, rec->num_active));
3073 ctdb_set_culprit(rec, ctdb->pnn);
3074 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3078 /* verify that all active nodes in the nodemap also exist in
3081 for (j=0; j<nodemap->num; j++) {
3082 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3085 if (nodemap->nodes[j].pnn == pnn) {
3089 for (i=0; i<vnnmap->size; i++) {
3090 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3094 if (i == vnnmap->size) {
3095 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3096 nodemap->nodes[j].pnn));
3097 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3098 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3104 /* verify that all other nodes have the same vnnmap
3105 and are from the same generation
3107 for (j=0; j<nodemap->num; j++) {
3108 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3111 if (nodemap->nodes[j].pnn == pnn) {
3115 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3116 mem_ctx, &remote_vnnmap);
3118 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3119 nodemap->nodes[j].pnn));
3123 /* verify the vnnmap generation is the same */
3124 if (vnnmap->generation != remote_vnnmap->generation) {
3125 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3126 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3127 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3128 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3132 /* verify the vnnmap size is the same */
3133 if (vnnmap->size != remote_vnnmap->size) {
3134 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3135 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3136 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3137 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3141 /* verify the vnnmap is the same */
3142 for (i=0;i<vnnmap->size;i++) {
3143 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3144 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3145 nodemap->nodes[j].pnn));
3146 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3147 do_recovery(rec, mem_ctx, pnn, nodemap,
3154 /* we might need to change who has what IP assigned */
3155 if (rec->need_takeover_run) {
3156 rec->need_takeover_run = false;
3158 /* execute the "startrecovery" event script on all nodes */
3159 ret = run_startrecovery_eventscript(rec, nodemap);
3161 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3162 ctdb_set_culprit(rec, ctdb->pnn);
3163 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3166 ret = ctdb_takeover_run(ctdb, nodemap);
3168 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3169 ctdb_set_culprit(rec, ctdb->pnn);
3170 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3173 /* execute the "recovered" event script on all nodes */
3174 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3176 // we cant check whether the event completed successfully
3177 // since this script WILL fail if the node is in recovery mode
3178 // and if that race happens, the code here would just cause a second
3179 // cascading recovery.
3181 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3182 ctdb_set_culprit(rec, ctdb->pnn);
3183 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3194 event handler for when the main ctdbd dies
/* NOTE(review): fd-event on the pipe shared with the parent ctdbd
 * (registered in ctdb_start_recoverd); the pipe becoming readable/EOF
 * means the parent process exited, so the recovery daemon exits too. */
3196 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3197 uint16_t flags, void *private_data)
3199 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3204 called regularly to verify that the recovery daemon is still running
/* Timed event in the MAIN daemon: probes the recovery daemon with
 * kill(pid, 0); if it has died, the whole node is shut down cleanly,
 * since a node without a recovery daemon must not keep serving. */
3206 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3207 struct timeval yt, void *p)
3209 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3211 if (kill(ctdb->recoverd_pid, 0) != 0) {
3212 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
/* orderly shutdown: stop subsystems, release public IPs, shut down
 * the transport and run the "shutdown" event script */
3214 ctdb_stop_recoverd(ctdb);
3215 ctdb_stop_keepalive(ctdb);
3216 ctdb_stop_monitoring(ctdb);
3217 ctdb_release_all_ips(ctdb);
3218 if (ctdb->methods != NULL) {
3219 ctdb->methods->shutdown(ctdb);
3221 ctdb_event_script(ctdb, "shutdown");
/* re-arm ourselves so the check repeats every 30 seconds */
3226 event_add_timed(ctdb->ev, ctdb,
3227 timeval_current_ofs(30, 0),
3228 ctdb_check_recd, ctdb);
/* SIGCHLD handler for the recovery daemon: reaps exited child processes
 * (e.g. the check_recovery_lock child) with a non-blocking waitpid so
 * they do not linger as zombies. */
3231 static void recd_sig_child_handler(struct event_context *ev,
3232 struct signal_event *se, int signum, int count,
3236 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
/* WNOHANG: never block here — presumably looped until no more children
 * are pending (loop body partially elided) */
3241 pid = waitpid(-1, &status, WNOHANG);
3243 if (errno != ECHILD) {
3244 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3249 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3255 startup the recovery daemon as a child of the main ctdb daemon
/* Forks the recovery daemon. The parent keeps one end of a pipe and
 * schedules ctdb_check_recd(); the child switches to client mode,
 * watches the pipe for parent death, installs a SIGCHLD handler and
 * enters monitor_cluster(), which never returns. */
3257 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3260 struct signal_event *se;
/* pipe used by the child to detect the death of the parent daemon */
3262 if (pipe(fd) != 0) {
3266 ctdb->ctdbd_pid = getpid();
3268 ctdb->recoverd_pid = fork();
3269 if (ctdb->recoverd_pid == -1) {
/* parent: start the periodic liveness check of the recovery daemon */
3273 if (ctdb->recoverd_pid != 0) {
3275 event_add_timed(ctdb->ev, ctdb,
3276 timeval_current_ofs(30, 0),
3277 ctdb_check_recd, ctdb);
/* child (recovery daemon) from here on */
3283 srandom(getpid() ^ time(NULL));
3285 if (switch_from_server_to_client(ctdb) != 0) {
3286 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
/* exit when the parent's end of the pipe is closed (parent died) */
3290 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3291 ctdb_recoverd_parent, &fd[0]);
3293 /* set up a handler to pick up sigchld */
3294 se = event_add_signal(ctdb->ev, ctdb,
3296 recd_sig_child_handler,
3299 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3303 monitor_cluster(ctdb);
/* monitor_cluster() loops forever; reaching this point is a bug */
3305 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3310 shutdown the recovery daemon
3312 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3314 if (ctdb->recoverd_pid == 0) {
3318 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3319 kill(ctdb->recoverd_pid, SIGTERM);