4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
/* NOTE(review): several closing-brace/terminator lines of these struct
 * definitions were lost in extraction; code kept byte-identical. */
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
/* reply header used to message the waiting "ctdb ipreallocate" client back */
39 struct rd_memdump_reply *rd;
/* per-node record of misbehaviour used to decide when to ban a node */
42 struct ctdb_banning_state {
/* timestamp of the most recent culprit report; used with
 * tunable.recovery_grace_period to forgive old transgressions */
44 struct timeval last_reported_time;
48 private state of recovery daemon
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
/* pnn of the node most recently blamed for needing a recovery */
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
58 bool need_takeover_run;
61 struct timed_event *send_election_te;
/* non-NULL while an election is in progress; cleared by the timeout handler */
62 struct timed_event *election_timeout;
63 struct vacuum_info *vacuum_info;
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
/* timeouts derived from cluster tunables; both expand in a scope where a
 * local "ctdb" pointer is visible */
69 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
70 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
74 ban a node for a period of time
/* Ban node "pnn" for "ban_time" seconds via the CTDB_CONTROL_SET_BAN
 * control. Validates the pnn first; on failure only logs an error. */
76 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
79 struct ctdb_context *ctdb = rec->ctdb;
80 struct ctdb_ban_time bantime;
82 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
84 if (!ctdb_validate_pnn(ctdb, pnn)) {
85 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* NOTE(review): bantime.pnn assignment line appears lost in extraction */
90 bantime.time = ban_time;
92 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
94 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* outcome of a monitoring pass over the cluster */
100 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
104 run the "recovered" eventscript on all nodes
/* Broadcast CTDB_CONTROL_END_RECOVERY to all active nodes so each runs its
 * "recovered" event script. "caller" is only used for log context.
 * Returns 0 on success, nonzero (value line lost in extraction) on failure. */
106 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
111 tmp_ctx = talloc_new(ctdb);
112 CTDB_NO_MEMORY(ctdb, tmp_ctx);
114 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
115 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
117 CONTROL_TIMEOUT(), false, tdb_null,
120 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
122 talloc_free(tmp_ctx);
126 talloc_free(tmp_ctx);
131 remember the trouble maker
/* Charge "count" ban credits to node "culprit". Credits are reset if the
 * node last misbehaved more than tunable.recovery_grace_period seconds ago.
 * Also records the culprit as rec->last_culprit_node. */
133 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
135 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
136 struct ctdb_banning_state *ban_state;
/* NOTE(review): bound check uses ">" not ">=" — off-by-one suspicion, but
 * kept as-is; confirm against ctdb->num_nodes semantics upstream */
138 if (culprit > ctdb->num_nodes) {
139 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
/* lazily allocate the per-node banning state on first offence */
143 if (ctdb->nodes[culprit]->ban_state == NULL) {
144 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
145 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
149 ban_state = ctdb->nodes[culprit]->ban_state;
150 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
151 /* this was the first time in a long while this node
152 misbehaved so we will forgive any old transgressions.
154 ban_state->count = 0;
157 ban_state->count += count;
158 ban_state->last_reported_time = timeval_current();
159 rec->last_culprit_node = culprit;
163 remember the trouble maker
/* Convenience wrapper: charge exactly one ban credit to "culprit". */
165 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
167 ctdb_set_culprit_count(rec, culprit, 1);
171 /* this callback is called for every node that failed to execute the
"startrecovery" event; the node is blamed with one ban credit. */
174 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
176 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
178 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
180 ctdb_set_culprit(rec, node_pnn);
184 run the "startrecovery" eventscript on all nodes
/* Broadcast CTDB_CONTROL_START_RECOVERY to all active nodes; any node that
 * fails is blamed via startrecovery_fail_callback. Returns 0 on success. */
186 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
190 struct ctdb_context *ctdb = rec->ctdb;
192 tmp_ctx = talloc_new(ctdb);
193 CTDB_NO_MEMORY(ctdb, tmp_ctx);
195 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
198 CONTROL_TIMEOUT(), false, tdb_null,
200 startrecovery_fail_callback,
202 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
203 talloc_free(tmp_ctx);
207 talloc_free(tmp_ctx);
/* Per-node reply handler for CTDB_CONTROL_GET_CAPABILITIES: validates the
 * payload is exactly one uint32_t and stores it in the node's capabilities.
 * NOTE(review): "lenght" typo lives in the log string (runtime text, left
 * untouched here). */
211 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
213 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
214 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
/* guard against replies from pnns outside our local node array */
217 if (node_pnn < ctdb->num_nodes) {
218 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
223 update the node capabilities for all connected nodes
/* Query CTDB_CONTROL_GET_CAPABILITIES from every active node; replies are
 * stored by async_getcap_callback. Returns 0 on success. */
225 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
230 tmp_ctx = talloc_new(ctdb);
231 CTDB_NO_MEMORY(ctdb, tmp_ctx);
233 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
234 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
238 async_getcap_callback, NULL,
240 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
241 talloc_free(tmp_ctx);
245 talloc_free(tmp_ctx);
/* Called for each node that failed the freeze during recovery; blames the
 * node with nodemap->num credits (enough weight to trigger banning fast). */
249 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
251 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
253 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
254 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Called for each node that failed to start the recovery transaction;
 * blames the node with nodemap->num credits, mirroring the freeze case. */
257 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
259 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
261 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
262 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
266 change recovery mode on all nodes
/* Set the recovery mode on all active nodes. When entering recovery
 * (CTDB_RECOVERY_ACTIVE) every database priority level is frozen first;
 * nodes failing the freeze are blamed via set_recmode_fail_callback.
 * Then CTDB_CONTROL_SET_RECMODE is broadcast. Returns 0 on success. */
268 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
274 tmp_ctx = talloc_new(ctdb);
275 CTDB_NO_MEMORY(ctdb, tmp_ctx);
277 /* freeze all nodes */
278 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
279 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
/* priorities are 1-based, hence i=1..NUM_DB_PRIORITIES inclusive */
282 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
283 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
288 set_recmode_fail_callback,
290 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
291 talloc_free(tmp_ctx);
298 data.dsize = sizeof(uint32_t);
299 data.dptr = (unsigned char *)&rec_mode;
301 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
308 talloc_free(tmp_ctx);
312 talloc_free(tmp_ctx);
317 change recovery master on all node
/* Broadcast CTDB_CONTROL_SET_RECMASTER so every active node records "pnn"
 * as the recovery master. Returns 0 on success. */
319 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
325 tmp_ctx = talloc_new(ctdb);
326 CTDB_NO_MEMORY(ctdb, tmp_ctx);
328 data.dsize = sizeof(uint32_t);
329 data.dptr = (unsigned char *)&pnn;
331 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
334 CONTROL_TIMEOUT(), false, data,
337 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
338 talloc_free(tmp_ctx);
342 talloc_free(tmp_ctx);
346 /* update all remote nodes to use the same db priority that we have
347 this can fail if the remote node has not yet been upgraded to
348 support this function, so we always return success and never fail
349 a recovery if this call fails.
351 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
352 struct ctdb_node_map *nodemap,
353 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
358 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
360 /* step through all local databases */
361 for (db=0; db<dbmap->num;db++) {
363 struct ctdb_db_priority db_prio;
/* read the locally-configured priority for this db, then push it out */
366 db_prio.db_id = dbmap->dbs[db].dbid;
367 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
369 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
373 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
375 data.dptr = (uint8_t *)&db_prio;
376 data.dsize = sizeof(db_prio);
378 if (ctdb_client_async_control(ctdb,
379 CTDB_CONTROL_SET_DB_PRIORITY,
381 CONTROL_TIMEOUT(), false, data,
/* per the header comment: failure here is logged but deliberately
 * not fatal to the recovery */
384 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
392 ensure all other nodes have attached to any databases that we have
/* For every other active node, fetch its dbmap and create (attach) any
 * database this node has that the remote node lacks. Returns 0 on success. */
394 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
395 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
398 struct ctdb_dbid_map *remote_dbmap;
400 /* verify that all other nodes have all our databases */
401 for (j=0; j<nodemap->num; j++) {
402 /* we don't need to check ourselves */
403 if (nodemap->nodes[j].pnn == pnn) {
406 /* dont check nodes that are unavailable */
407 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
411 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
412 mem_ctx, &remote_dbmap);
414 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
418 /* step through all local databases */
419 for (db=0; db<dbmap->num;db++) {
/* linear scan of the remote dbmap for our dbid */
423 for (i=0;i<remote_dbmap->num;i++) {
424 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
428 /* the remote node already have this database */
429 if (i!=remote_dbmap->num) {
432 /* ok so we need to create this database */
433 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
436 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
439 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
440 mem_ctx, name, dbmap->dbs[db].persistent);
442 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
453 ensure we are attached to any databases that anyone else is attached to
/* Mirror of create_missing_remote_databases: attach locally to every
 * database any other active node has that we lack, then re-read our own
 * dbmap into *dbmap so the caller sees the updated set. Returns 0 on
 * success. */
455 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
456 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
459 struct ctdb_dbid_map *remote_dbmap;
461 /* verify that we have all database any other node has */
462 for (j=0; j<nodemap->num; j++) {
463 /* we don't need to check ourselves */
464 if (nodemap->nodes[j].pnn == pnn) {
467 /* dont check nodes that are unavailable */
468 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
472 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
473 mem_ctx, &remote_dbmap);
475 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
479 /* step through all databases on the remote node */
480 for (db=0; db<remote_dbmap->num;db++) {
483 for (i=0;i<(*dbmap)->num;i++) {
484 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
488 /* we already have this db locally */
489 if (i!=(*dbmap)->num) {
492 /* ok so we need to create this database and
495 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
496 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
498 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
499 nodemap->nodes[j].pnn));
502 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
503 remote_dbmap->dbs[db].persistent);
505 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our dbmap so the new attachment is visible to the caller */
508 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
521 pull the remote database contents from one node into the recdb
/* Pull all records of database "dbid" from node "srcnode" via PULL_DB and
 * merge them into the temporary recovery tdb. A pulled record replaces an
 * existing one only if its rsn is newer (or ties with a non-recmaster
 * dmaster) — this implements the merge-by-rsn recovery rule.
 * Returns 0 on success. */
523 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
524 struct tdb_wrap *recdb, uint32_t dbid,
529 struct ctdb_marshall_buffer *reply;
530 struct ctdb_rec_data *rec;
532 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
534 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
535 CONTROL_TIMEOUT(), &outdata);
537 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
538 talloc_free(tmp_ctx);
542 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the marshalled reply before walking it */
544 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
545 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
546 talloc_free(tmp_ctx);
550 rec = (struct ctdb_rec_data *)&reply->data[0];
/* advance by each record's self-declared length through the blob */
554 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
556 struct ctdb_ltdb_header *hdr;
/* key bytes come first in rec->data, immediately followed by data */
559 key.dptr = &rec->data[0];
560 key.dsize = rec->keylen;
561 data.dptr = &rec->data[key.dsize];
562 data.dsize = rec->datalen;
564 hdr = (struct ctdb_ltdb_header *)data.dptr;
566 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
567 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
568 talloc_free(tmp_ctx);
572 /* fetch the existing record, if any */
573 existing = tdb_fetch(recdb->tdb, key);
575 if (existing.dptr != NULL) {
576 struct ctdb_ltdb_header header;
577 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
578 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
579 (unsigned)existing.dsize, srcnode));
581 talloc_free(tmp_ctx);
584 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing record unless the incoming one wins by rsn */
586 if (!(header.rsn < hdr->rsn ||
587 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
592 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
593 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
594 talloc_free(tmp_ctx);
599 talloc_free(tmp_ctx);
605 pull all the remote database contents into the recdb
/* Pull database "dbid" from every active node into the temporary recovery
 * tdb. Any node that fails the pull is blamed with nodemap->num credits.
 * Returns 0 on success. */
607 static int pull_remote_database(struct ctdb_context *ctdb,
608 struct ctdb_recoverd *rec,
609 struct ctdb_node_map *nodemap,
610 struct tdb_wrap *recdb, uint32_t dbid,
615 /* pull all records from all other nodes across onto this node
616 (this merges based on rsn)
618 for (j=0; j<nodemap->num; j++) {
619 /* dont merge from nodes that are unavailable */
620 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
623 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
624 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
625 nodemap->nodes[j].pnn));
626 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
636 update flags on all active nodes
/* Push node "pnn"'s flags to the cluster: set "flags" and clear ~flags via
 * CTDB_CONTROL_MODIFY_FLAGS. Returns the modflags result. */
638 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
642 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
644 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
652 ensure all nodes have the same vnnmap we do
/* Push our vnnmap to every active node with CTDB_CONTROL_SETVNNMAP.
 * Returns 0 on success. */
654 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
655 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
659 /* push the new vnn map out to all the nodes */
660 for (j=0; j<nodemap->num; j++) {
661 /* dont push to nodes that are unavailable */
662 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
666 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
668 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* One in-flight vacuum-fetch job; kept on rec->vacuum_info (doubly linked
 * list). NOTE(review): struct header/closing lines lost in extraction. */
678 struct vacuum_info *next, *prev;
679 struct ctdb_recoverd *rec;
681 struct ctdb_db_context *ctdb_db;
/* marshalled record list still to be processed; r walks through it */
682 struct ctdb_marshall_buffer *recs;
683 struct ctdb_rec_data *r;
686 static void vacuum_fetch_next(struct vacuum_info *v);
689 called when a vacuum fetch has completed - just free it and do the next one
691 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
693 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
695 vacuum_fetch_next(v);
700 process the next element from the vacuum list
/* Walk v->recs, issuing a CTDB_NULL_FUNC call with IMMEDIATE_MIGRATION for
 * the first record that is not already local, to migrate it to this node.
 * Uses nonblocking chain locks so the recovery daemon never stalls; locked
 * or already-local records are simply skipped. Completion re-enters here
 * via vacuum_fetch_callback. */
702 static void vacuum_fetch_next(struct vacuum_info *v)
704 struct ctdb_call call;
705 struct ctdb_rec_data *r;
707 while (v->recs->count) {
708 struct ctdb_client_call_state *state;
710 struct ctdb_ltdb_header *hdr;
713 call.call_id = CTDB_NULL_FUNC;
714 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance the cursor past the current record before processing it */
717 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
720 call.key.dptr = &r->data[0];
721 call.key.dsize = r->keylen;
723 /* ensure we don't block this daemon - just skip a record if we can't get
725 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
729 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
730 if (data.dptr == NULL) {
731 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
735 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
737 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
741 hdr = (struct ctdb_ltdb_header *)data.dptr;
742 if (hdr->dmaster == v->rec->ctdb->pnn) {
743 /* its already local */
745 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
751 state = ctdb_call_send(v->ctdb_db, &call);
752 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
754 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
/* chain the async completion back into this function */
758 state->async.fn = vacuum_fetch_callback;
759 state->async.private_data = v;
768 destroy a vacuum info structure
/* talloc destructor: unlink the job from rec->vacuum_info on free. */
770 static int vacuum_info_destructor(struct vacuum_info *v)
772 DLIST_REMOVE(v->rec->vacuum_info, v);
778 handler for vacuum fetch
/* Message handler for CTDB_SRVID vacuum-fetch requests: "data" carries a
 * marshalled record list for one database. Dedupes against jobs already in
 * flight from the same source node, resolves the database (name and
 * persistence) from the local dbmap, attaches to it, copies the record
 * list into a new vacuum_info and starts processing it. */
780 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
781 TDB_DATA data, void *private_data)
783 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
784 struct ctdb_marshall_buffer *recs;
786 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
788 struct ctdb_dbid_map *dbmap=NULL;
789 bool persistent = false;
790 struct ctdb_db_context *ctdb_db;
791 struct ctdb_rec_data *r;
793 struct vacuum_info *v;
795 recs = (struct ctdb_marshall_buffer *)data.dptr;
796 r = (struct ctdb_rec_data *)&recs->data[0];
/* nothing to do for an empty record list */
798 if (recs->count == 0) {
799 talloc_free(tmp_ctx);
805 for (v=rec->vacuum_info;v;v=v->next) {
806 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
807 /* we're already working on records from this node */
808 talloc_free(tmp_ctx);
813 /* work out if the database is persistent */
814 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
816 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
817 talloc_free(tmp_ctx);
821 for (i=0;i<dbmap->num;i++) {
822 if (dbmap->dbs[i].dbid == recs->db_id) {
823 persistent = dbmap->dbs[i].persistent;
827 if (i == dbmap->num) {
828 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
829 talloc_free(tmp_ctx);
833 /* find the name of this database */
834 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
835 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
836 talloc_free(tmp_ctx);
/* attach (or re-attach) to the database so we can migrate records in */
841 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
842 if (ctdb_db == NULL) {
843 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
844 talloc_free(tmp_ctx);
848 v = talloc_zero(rec, struct vacuum_info);
850 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
851 talloc_free(tmp_ctx);
856 v->srcnode = srcnode;
857 v->ctdb_db = ctdb_db;
/* keep a private copy: "data" belongs to the transport and goes away */
858 v->recs = talloc_memdup(v, recs, data.dsize);
859 if (v->recs == NULL) {
860 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
862 talloc_free(tmp_ctx);
865 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
867 DLIST_ADD(rec->vacuum_info, v);
869 talloc_set_destructor(v, vacuum_info_destructor);
871 vacuum_fetch_next(v);
872 talloc_free(tmp_ctx);
877 called when ctdb_wait_timeout should finish
/* Timer callback: flags the uint32_t pointed at by p so the waiting loop
 * in ctdb_wait_timeout can exit. */
879 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
880 struct timeval yt, void *p)
882 uint32_t *timed_out = (uint32_t *)p;
887 wait for a given number of seconds
/* Pump the event loop for "secs" seconds (events keep being serviced while
 * we wait; this is not a sleep). */
889 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
891 uint32_t timed_out = 0;
892 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
894 event_loop_once(ctdb->ev);
899 called when an election times out (ends)
/* Timer callback marking the end of an election window: clears
 * rec->election_timeout, which ctdb_wait_election polls. */
901 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
902 struct timeval t, void *p)
904 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
905 rec->election_timeout = NULL;
907 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
912 wait for an election to finish. It finished election_timeout seconds after
913 the last election packet is received
/* Spin the event loop until ctdb_election_timeout clears the timer. */
915 static void ctdb_wait_election(struct ctdb_recoverd *rec)
917 struct ctdb_context *ctdb = rec->ctdb;
918 while (rec->election_timeout) {
919 event_loop_once(ctdb->ev);
924 Update our local flags from all remote connected nodes.
925 This is only run when we are or we believe we are the recovery master
/* Compare our view of each node's flags against that node's own view.
 * On mismatch we push the authoritative flags cluster-wide (we are the
 * recmaster) and adopt the remote node's self-reported flags locally.
 * Returns MONITOR_FAILED if any remote nodemap fetch fails (blaming that
 * node); success return value line lost in extraction. */
927 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
930 struct ctdb_context *ctdb = rec->ctdb;
931 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
933 /* get the nodemap for all active remote nodes and verify
934 they are the same as for this node
936 for (j=0; j<nodemap->num; j++) {
937 struct ctdb_node_map *remote_nodemap=NULL;
940 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
943 if (nodemap->nodes[j].pnn == ctdb->pnn) {
947 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
948 mem_ctx, &remote_nodemap);
950 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
951 nodemap->nodes[j].pnn));
952 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
953 talloc_free(mem_ctx);
954 return MONITOR_FAILED;
956 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
957 /* We should tell our daemon about this so it
958 updates its flags or else we will log the same
959 message again in the next iteration of recovery.
960 Since we are the recovery master we can just as
961 well update the flags on all nodes.
963 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
965 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
969 /* Update our local copy of the flags in the recovery
972 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
973 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
974 nodemap->nodes[j].flags));
975 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
977 talloc_free(remote_nodemap);
979 talloc_free(mem_ctx);
984 /* Create a new random generation id.
985 The generation id can not be the INVALID_GENERATION id
/* Loop drawing random() values until one differs from INVALID_GENERATION. */
987 static uint32_t new_generation(void)
992 generation = random();
994 if (generation != INVALID_GENERATION) {
1004 create a temporary working database
/* Open a fresh scratch tdb (recdb.tdb.<pnn or similar suffix — name format
 * partially lost in extraction) in the state directory for merging pulled
 * records. TDB_NOLOCK because only this process touches it; O_EXCL ensures
 * a stale file is not reused. Returns NULL on failure. */
1006 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1009 struct tdb_wrap *recdb;
1012 /* open up the temporary recovery database */
1013 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1014 ctdb->db_directory_state,
1021 tdb_flags = TDB_NOLOCK;
1022 if (!ctdb->do_setsched) {
1023 tdb_flags |= TDB_NOMMAP;
1025 tdb_flags |= TDB_DISALLOW_NESTING;
1027 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1028 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1029 if (recdb == NULL) {
1030 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1040 a traverse function for pulling all relevant records from recdb
/* shared state for the traverse below; several field lines were lost in
 * extraction (at least: persistent flag, len, failed flag) */
1043 struct ctdb_context *ctdb;
1044 struct ctdb_marshall_buffer *recdata;
/* Traverse callback: append each non-empty record to params->recdata,
 * rewriting dmaster to ourselves for non-persistent databases so this node
 * becomes the data master after the push. Sets params->failed on error. */
1050 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1052 struct recdb_data *params = (struct recdb_data *)p;
1053 struct ctdb_rec_data *rec;
1054 struct ctdb_ltdb_header *hdr;
1056 /* skip empty records */
1057 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1061 /* update the dmaster field to point to us */
1062 hdr = (struct ctdb_ltdb_header *)data.dptr;
1063 if (!params->persistent) {
1064 hdr->dmaster = params->ctdb->pnn;
1067 /* add the record to the blob ready to send to the nodes */
1068 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1070 params->failed = true;
/* grow the marshall buffer in place to fit the new record */
1073 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1074 if (params->recdata == NULL) {
1075 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1076 rec->length + params->len, params->recdata->count));
1077 params->failed = true;
1080 params->recdata->count++;
1081 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1082 params->len += rec->length;
1089 push the recdb database out to all nodes
/* Marshall the whole scratch recdb into one buffer (via traverse_recdb)
 * and broadcast it to all active nodes with CTDB_CONTROL_PUSH_DB.
 * Returns 0 on success. */
1091 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1093 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1095 struct recdb_data params;
1096 struct ctdb_marshall_buffer *recdata;
1098 TALLOC_CTX *tmp_ctx;
1101 tmp_ctx = talloc_new(ctdb);
1102 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1104 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1105 CTDB_NO_MEMORY(ctdb, recdata);
1107 recdata->db_id = dbid;
1110 params.recdata = recdata;
/* start the running length at the header size; records follow "data" */
1111 params.len = offsetof(struct ctdb_marshall_buffer, data);
1112 params.failed = false;
1113 params.persistent = persistent;
1115 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1116 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1117 talloc_free(params.recdata);
1118 talloc_free(tmp_ctx);
1122 if (params.failed) {
1123 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1124 talloc_free(params.recdata);
1125 talloc_free(tmp_ctx);
/* traverse_recdb may have reallocated the buffer; pick up the new pointer */
1129 recdata = params.recdata;
1131 outdata.dptr = (void *)recdata;
1132 outdata.dsize = params.len;
1134 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1135 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1137 CONTROL_TIMEOUT(), false, outdata,
1140 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1141 talloc_free(recdata);
1142 talloc_free(tmp_ctx);
1146 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1147 dbid, recdata->count));
1149 talloc_free(recdata);
1150 talloc_free(tmp_ctx);
1157 go through a full recovery on one database
/* Full recovery cycle for one database: pull every node's copy into a
 * scratch recdb (merge by rsn), wipe the database cluster-wide inside the
 * recovery transaction identified by "transaction_id", then push the
 * merged contents back out. Returns 0 on success. */
1159 static int recover_database(struct ctdb_recoverd *rec,
1160 TALLOC_CTX *mem_ctx,
1164 struct ctdb_node_map *nodemap,
1165 uint32_t transaction_id)
1167 struct tdb_wrap *recdb;
1169 struct ctdb_context *ctdb = rec->ctdb;
1171 struct ctdb_control_wipe_database w;
1174 recdb = create_recdb(ctdb, mem_ctx);
1175 if (recdb == NULL) {
1179 /* pull all remote databases onto the recdb */
1180 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1182 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1186 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1188 /* wipe all the remote databases. This is safe as we are in a transaction */
1190 w.transaction_id = transaction_id;
1192 data.dptr = (void *)&w;
1193 data.dsize = sizeof(w);
1195 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1198 CONTROL_TIMEOUT(), false, data,
1201 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1206 /* push out the correct database. This sets the dmaster and skips
1207 the empty records */
1208 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1214 /* all done with this database */
1221 reload the nodes file
/* Re-read the on-disk nodes file into the ctdb context. */
1223 static void reload_nodes_file(struct ctdb_context *ctdb)
1226 ctdb_load_nodes_file(ctdb);
1231 we are the recmaster, and recovery is needed - start a recovery run
1233 static int do_recovery(struct ctdb_recoverd *rec,
1234 TALLOC_CTX *mem_ctx, uint32_t pnn,
1235 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1237 struct ctdb_context *ctdb = rec->ctdb;
1239 uint32_t generation;
1240 struct ctdb_dbid_map *dbmap;
1243 struct timeval start_time;
1245 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1247 /* if recovery fails, force it again */
1248 rec->need_recovery = true;
1250 for (i=0; i<ctdb->num_nodes; i++) {
1251 struct ctdb_banning_state *ban_state;
1253 if (ctdb->nodes[i]->ban_state == NULL) {
1256 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1257 if (ban_state->count < 2*ctdb->num_nodes) {
1260 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1261 ctdb->nodes[i]->pnn, ban_state->count,
1262 ctdb->tunable.recovery_ban_period));
1263 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1264 ban_state->count = 0;
1268 if (ctdb->tunable.verify_recovery_lock != 0) {
1269 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1270 start_time = timeval_current();
1271 if (!ctdb_recovery_lock(ctdb, true)) {
1272 ctdb_set_culprit(rec, pnn);
1273 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1276 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1277 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1280 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1282 /* get a list of all databases */
1283 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1285 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1289 /* we do the db creation before we set the recovery mode, so the freeze happens
1290 on all databases we will be dealing with. */
1292 /* verify that we have all the databases any other node has */
1293 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1295 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1299 /* verify that all other nodes have all our databases */
1300 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1302 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1305 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1307 /* update the database priority for all remote databases */
1308 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1310 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1312 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1315 /* set recovery mode to active on all nodes */
1316 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1318 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1322 /* execute the "startrecovery" event script on all nodes */
1323 ret = run_startrecovery_eventscript(rec, nodemap);
1325 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1329 /* pick a new generation number */
1330 generation = new_generation();
1332 /* change the vnnmap on this node to use the new generation
1333 number but not on any other nodes.
1334 this guarantees that if we abort the recovery prematurely
1335 for some reason (a node stops responding?)
1336 that we can just return immediately and we will reenter
1337 recovery shortly again.
1338 I.e. we deliberately leave the cluster with an inconsistent
1339 generation id to allow us to abort recovery at any stage and
1340 just restart it from scratch.
1342 vnnmap->generation = generation;
1343 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1345 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1349 data.dptr = (void *)&generation;
1350 data.dsize = sizeof(uint32_t);
1352 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1353 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1355 CONTROL_TIMEOUT(), false, data,
1357 transaction_start_fail_callback,
1359 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1360 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1362 CONTROL_TIMEOUT(), false, tdb_null,
1366 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1371 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1373 for (i=0;i<dbmap->num;i++) {
1374 ret = recover_database(rec, mem_ctx,
1376 dbmap->dbs[i].persistent,
1377 pnn, nodemap, generation);
1379 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1384 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1386 /* commit all the changes */
1387 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1389 CONTROL_TIMEOUT(), false, data,
1392 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1396 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1399 /* update the capabilities for all nodes */
1400 ret = update_capabilities(ctdb, nodemap);
1402 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1406 /* build a new vnn map with all the currently active and
1408 generation = new_generation();
1409 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1410 CTDB_NO_MEMORY(ctdb, vnnmap);
1411 vnnmap->generation = generation;
1413 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1414 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1415 for (i=j=0;i<nodemap->num;i++) {
1416 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1419 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1420 /* this node can not be an lmaster */
1421 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1426 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1427 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1428 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1431 if (vnnmap->size == 0) {
1432 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1434 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1435 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1436 vnnmap->map[0] = pnn;
1439 /* update to the new vnnmap on all nodes */
1440 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1442 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1446 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1448 /* update recmaster to point to us for all nodes */
1449 ret = set_recovery_master(ctdb, nodemap, pnn);
1451 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1455 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1458 update all nodes to have the same flags that we have
1460 for (i=0;i<nodemap->num;i++) {
1461 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1465 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1467 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1472 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1474 /* disable recovery mode */
1475 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1477 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1481 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1484 tell nodes to takeover their public IPs
1486 rec->need_takeover_run = false;
1487 ret = ctdb_takeover_run(ctdb, nodemap);
1489 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1492 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1494 /* execute the "recovered" event script on all nodes */
1495 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1497 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1501 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1503 /* send a message to all clients telling them that the cluster
1504 has been reconfigured */
1505 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1507 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1509 rec->need_recovery = false;
1511 /* we managed to complete a full recovery, make sure to forgive
1512 any past sins by the nodes that could now participate in the
1515 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1516 for (i=0;i<nodemap->num;i++) {
1517 struct ctdb_banning_state *ban_state;
1519 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1523 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1524 if (ban_state == NULL) {
1528 ban_state->count = 0;
1532 /* We just finished a recovery successfully.
1533 We now wait for rerecovery_timeout before we allow
1534 another recovery to take place.
1536 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1537 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1538 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
1545 elections are won by first checking the number of connected nodes, then
1546 the priority time, then the pnn
1548 struct election_message {
1549 uint32_t num_connected;
1550 struct timeval priority_time;
1552 uint32_t node_flags;
1556 form this nodes election data
/*
 * Fill in *em with this node's election credentials: pnn, priority
 * (start) time, number of connected nodes, and our node flags.
 * NOTE(review): this excerpt is missing interior lines (error-check
 * braces, closing braces); comments describe only the visible code.
 */
1558 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1561 struct ctdb_node_map *nodemap;
1562 struct ctdb_context *ctdb = rec->ctdb;
/* advertise our pnn and how long this recoverd has been running */
1566 em->pnn = rec->ctdb->pnn;
1567 em->priority_time = rec->priority_time;
/* fetch the current nodemap (allocated on rec) to count connected nodes */
1569 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1571 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
/* cache our own flags on rec; they are also carried in the message */
1575 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1576 em->node_flags = rec->node_flags;
/* count every node that is not disconnected (includes ourselves) */
1578 for (i=0;i<nodemap->num;i++) {
1579 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1580 em->num_connected++;
/* without the RECMASTER capability we must lose: zero the connected
   count and reset priority_time as if we had just started */
1584 /* we shouldnt try to win this election if we cant be a recmaster */
1585 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1586 em->num_connected = 0;
1587 em->priority_time = timeval_current();
1590 talloc_free(nodemap);
1594 see if the given election data wins
/*
 * Decide whether WE win against the election data in *em.
 * Precedence (per the header comment above struct election_message):
 * capability/banned/stopped knock-outs first, then most connected
 * nodes, then longest uptime (priority_time), then lowest pnn.
 * NOTE(review): the return statements inside each branch are missing
 * from this excerpt.
 */
1596 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1598 struct election_message myem;
/* build our own credentials to compare against em */
1601 ctdb_election_data(rec, &myem);
1603 /* we cant win if we dont have the recmaster capability */
1604 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1608 /* we cant win if we are banned */
1609 if (rec->node_flags & NODE_FLAGS_BANNED) {
1613 /* we cant win if we are stopped */
1614 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1618 /* we will automatically win if the other node is banned */
1619 if (em->node_flags & NODE_FLAGS_BANNED) {
1623 /* we will automatically win if the other node is stopped */
1624 if (em->node_flags & NODE_FLAGS_STOPPED) {
1628 /* try to use the most connected node */
1630 cmp = (int)myem.num_connected - (int)em->num_connected;
1633 /* then the longest running node */
1635 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break: lower pnn wins */
1639 cmp = (int)myem.pnn - (int)em->pnn;
1646 send out an election request
/*
 * Broadcast an election request carrying our election data to all
 * nodes on CTDB_SRVID_RECOVERY.  If update_recmaster is true we also
 * optimistically set ourselves as recmaster on the local node, on the
 * assumption that we will win.
 * NOTE(review): error-check lines after ctdb_ctrl_setrecmaster() and
 * the final return are missing from this excerpt.
 */
1648 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1651 TDB_DATA election_data;
1652 struct election_message emsg;
1654 struct ctdb_context *ctdb = rec->ctdb;
1656 srvid = CTDB_SRVID_RECOVERY;
/* build our credentials into emsg and wrap them in a TDB_DATA blob */
1658 ctdb_election_data(rec, &emsg);
1660 election_data.dsize = sizeof(struct election_message);
1661 election_data.dptr = (unsigned char *)&emsg;
1664 /* send an election message to all active nodes */
1665 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1666 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1669 /* A new node that is already frozen has entered the cluster.
1670 The existing nodes are not frozen and dont need to be frozen
1671 until the election has ended and we start the actual recovery
1673 if (update_recmaster == true) {
1674 /* first we assume we will win the election and set
1675 recoverymaster to be ourself on the current node
1677 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1679 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1689 this function will unban all nodes in the cluster
/*
 * Clear the BANNED flag on every connected node in the cluster.
 * Best-effort: the return value of ctdb_ctrl_modflags() is ignored.
 */
1691 static void unban_all_nodes(struct ctdb_context *ctdb)
1694 struct ctdb_node_map *nodemap;
1695 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1697 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1699 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
/* for each node that is reachable AND currently banned, drop the ban */
1703 for (i=0;i<nodemap->num;i++) {
1704 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1705 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1706 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1710 talloc_free(tmp_ctx);
1715 we think we are winning the election - send a broadcast election request
/*
 * Timed-event callback fired while we believe we are winning the
 * election: rebroadcast our election request (without touching the
 * local recmaster setting) and clear the one-shot timer handle.
 */
1717 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1719 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1722 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1724 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
/* the timer is one-shot; free the event and forget it */
1727 talloc_free(rec->send_election_te);
1728 rec->send_election_te = NULL;
1732 handler for memory dumps
/*
 * SRVID message handler: a client asked the recovery master for a
 * memory (talloc) dump.  The request payload is a struct
 * rd_memdump_reply telling us where (pnn/srvid) to send the dump.
 */
1734 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1735 TDB_DATA data, void *private_data)
1737 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1740 struct rd_memdump_reply *rd;
/* validate the payload size before casting it to the reply struct */
1742 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1743 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1744 talloc_free(tmp_ctx);
1747 rd = (struct rd_memdump_reply *)data.dptr;
1749 dump = talloc_zero(tmp_ctx, TDB_DATA);
1751 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1752 talloc_free(tmp_ctx);
/* fill *dump with the serialized talloc memory report */
1755 ret = ctdb_dump_memory(ctdb, dump);
1757 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1758 talloc_free(tmp_ctx);
1762 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* send the dump back to the requester at the address it gave us */
1764 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1766 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1767 talloc_free(tmp_ctx);
1771 talloc_free(tmp_ctx);
1775 handler for reload_nodes
/*
 * SRVID message handler: re-read the nodes file ("ctdb reloadnodes").
 * The message payload is unused; we simply trigger the reload.
 */
1777 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1778 TDB_DATA data, void *private_data)
1780 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1782 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1784 reload_nodes_file(rec->ctdb);
/*
 * Timed-event callback: the "disable ip check" window has expired.
 * Freeing ip_check_disable_ctx (which owns the timer itself) and
 * clearing the pointer re-enables the periodic ip verification.
 */
1788 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1789 struct timeval yt, void *p)
1791 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1793 talloc_free(rec->ip_check_disable_ctx);
1794 rec->ip_check_disable_ctx = NULL;
/*
 * SRVID message handler: disable the public-ip verification check for
 * a caller-supplied number of seconds (payload: one uint32_t timeout).
 * Any previously active disable window is cancelled first, then a new
 * context + timer is created; reenable_ip_check() fires when it ends.
 * NOTE(review): the log strings below contain typos ("expexting",
 * "recaived") - runtime strings, left untouched here.
 */
1797 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1798 TDB_DATA data, void *private_data)
1800 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* cancel any currently running disable window before starting anew */
1803 if (rec->ip_check_disable_ctx != NULL) {
1804 talloc_free(rec->ip_check_disable_ctx);
1805 rec->ip_check_disable_ctx = NULL;
/* payload must be exactly one uint32_t (seconds) */
1808 if (data.dsize != sizeof(uint32_t)) {
1809 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1810 "expexting %lu\n", (long unsigned)data.dsize,
1811 (long unsigned)sizeof(uint32_t)));
1814 if (data.dptr == NULL) {
1815 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1819 timeout = *((uint32_t *)data.dptr);
1820 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
/* the new context owns the re-enable timer; freeing it cancels both */
1822 rec->ip_check_disable_ctx = talloc_new(rec);
1823 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
1825 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1830 handler for ip reallocate, just add it to the list of callers and
1831 handle this later in the monitor_cluster loop so we do not recurse
1832 with other callers to takeover_run()
/*
 * SRVID message handler for "ctdb ipreallocate": do NOT run the
 * takeover here (to avoid recursing into takeover_run()); instead
 * queue the caller's reply address on rec->reallocate_callers so the
 * monitor_cluster loop can process all requests in one batch.
 * Payload reuses struct rd_memdump_reply as a generic reply address.
 */
1834 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1835 TDB_DATA data, void *private_data)
1837 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1838 struct ip_reallocate_list *caller;
1840 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1841 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
/* lazily create the context that owns all queued callers; freeing it
   in process_ipreallocate_requests() drops the whole list at once */
1845 if (rec->ip_reallocate_ctx == NULL) {
1846 rec->ip_reallocate_ctx = talloc_new(rec);
1847 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
1850 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1851 CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* steal the reply address into the caller entry and push it onto the
   singly-linked list (LIFO order) */
1853 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1854 caller->next = rec->reallocate_callers;
1855 rec->reallocate_callers = caller;
/*
 * Called from the monitor loop: run a single takeover run on behalf of
 * every queued "ctdb ipreallocate" caller, then send each caller the
 * int32 result and drop the whole queue by freeing ip_reallocate_ctx.
 */
1860 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1862 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1865 struct ip_reallocate_list *callers;
1867 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
/* one takeover run serves all queued callers; its int result is the
   reply payload for every one of them */
1868 ret = ctdb_takeover_run(ctdb, rec->nodemap);
1869 result.dsize = sizeof(int32_t);
1870 result.dptr = (uint8_t *)&ret;
1872 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1874 /* Someone that sent srvid==0 does not want a reply */
1875 if (callers->rd->srvid == 0) {
1878 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
1879 "%u:%llu\n", (unsigned)callers->rd->pnn,
1880 (unsigned long long)callers->rd->srvid));
/* NOTE(review): ret is reused here, so a later send failure
   overwrites the takeover-run result sent to remaining callers */
1881 ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1883 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
1884 "message to %u:%llu\n",
1885 (unsigned)callers->rd->pnn,
1886 (unsigned long long)callers->rd->srvid));
/* freeing ip_reallocate_ctx releases every queued caller entry */
1890 talloc_free(tmp_ctx);
1891 talloc_free(rec->ip_reallocate_ctx);
1892 rec->ip_reallocate_ctx = NULL;
1893 rec->reallocate_callers = NULL;
1899 handler for recovery master elections
/*
 * SRVID message handler for recovery-master election packets.
 * Restarts the election timeout, then either (a) schedules our own
 * counter-broadcast if ctdb_election_win() says we beat the sender,
 * or (b) concedes: cancel our pending broadcast, release the recovery
 * lock if we hold it, and record the sender as recmaster locally.
 */
1901 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1902 TDB_DATA data, void *private_data)
1904 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1906 struct election_message *em = (struct election_message *)data.dptr;
1907 TALLOC_CTX *mem_ctx;
1909 /* we got an election packet - update the timeout for the election */
1910 talloc_free(rec->election_timeout);
1911 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1912 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1913 ctdb_election_timeout, rec);
1915 mem_ctx = talloc_new(ctdb);
1917 /* someone called an election. check their election data
1918 and if we disagree and we would rather be the elected node,
1919 send a new election message to all other nodes
1921 if (ctdb_election_win(rec, em)) {
/* we win: schedule a (debounced, 0.5s) broadcast of our own
   election request unless one is already pending */
1922 if (!rec->send_election_te) {
1923 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1924 timeval_current_ofs(0, 500000),
1925 election_send_request, rec);
1927 talloc_free(mem_ctx);
1928 /*unban_all_nodes(ctdb);*/
/* we lose: cancel any pending broadcast of our own candidacy */
1933 talloc_free(rec->send_election_te);
1934 rec->send_election_te = NULL;
1936 if (ctdb->tunable.verify_recovery_lock != 0) {
1937 /* release the recmaster lock */
1938 if (em->pnn != ctdb->pnn &&
1939 ctdb->recovery_lock_fd != -1) {
1940 close(ctdb->recovery_lock_fd);
1941 ctdb->recovery_lock_fd = -1;
1942 unban_all_nodes(ctdb);
1946 /* ok, let that guy become recmaster then */
1947 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1949 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1950 talloc_free(mem_ctx);
1954 talloc_free(mem_ctx);
1960 force the start of the election process
/*
 * Force a new recovery-master election: put the whole cluster into
 * recovery mode (stopping internode traffic), arm the election
 * timeout, broadcast our election request (also setting ourselves as
 * recmaster locally), then block waiting for responses.
 */
1962 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1963 struct ctdb_node_map *nodemap)
1966 struct ctdb_context *ctdb = rec->ctdb;
1968 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1970 /* set all nodes to recovery mode to stop all internode traffic */
1971 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1973 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (re)arm the election timeout before broadcasting */
1977 talloc_free(rec->election_timeout);
1978 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1979 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1980 ctdb_election_timeout, rec);
/* true => also set ourselves as recmaster on the local node */
1982 ret = send_election_request(rec, pnn, true);
1984 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1988 /* wait for a few seconds to collect all responses */
1989 ctdb_wait_election(rec);
1995 handler for when a node changes its flags
/*
 * SRVID message handler: a node's flags changed.  Update our cached
 * nodemap entry and, if we are the recmaster and the cluster is in
 * normal mode, flag a takeover run when the DISABLED state changed
 * (the case that moves ips without triggering a full recovery).
 * Payload: one struct ctdb_node_flag_change (pnn, old/new flags).
 */
1997 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1998 TDB_DATA data, void *private_data)
2001 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2002 struct ctdb_node_map *nodemap=NULL;
2003 TALLOC_CTX *tmp_ctx;
2004 uint32_t changed_flags;
2006 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2007 int disabled_flag_changed;
2009 if (data.dsize != sizeof(*c)) {
2010 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2014 tmp_ctx = talloc_new(ctdb);
2015 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2017 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2019 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2020 talloc_free(tmp_ctx);
/* locate the nodemap entry for the node whose flags changed */
2025 for (i=0;i<nodemap->num;i++) {
2026 if (nodemap->nodes[i].pnn == c->pnn) break;
2029 if (i == nodemap->num) {
2030 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2031 talloc_free(tmp_ctx);
2035 changed_flags = c->old_flags ^ c->new_flags;
2037 if (nodemap->nodes[i].flags != c->new_flags) {
2038 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* compare against OUR cached view (not old_flags): did the
   DISABLED bit change relative to what we believed? */
2041 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2043 nodemap->nodes[i].flags = c->new_flags;
/* refresh our idea of who is recmaster and the current recmode */
2045 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2046 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2049 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2050 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
/* only act if we are the recmaster and no recovery is in flight */
2054 ctdb->recovery_master == ctdb->pnn &&
2055 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2056 /* Only do the takeover run if the perm disabled or unhealthy
2057 flags changed since these will cause an ip failover but not
2059 If the node became disconnected or banned this will also
2060 lead to an ip address failover but that is handled
2063 if (disabled_flag_changed) {
2064 rec->need_takeover_run = true;
2068 talloc_free(tmp_ctx);
2072 handler for when we need to push out flag changes ot all other nodes
/*
 * SRVID message handler: push a node's flags (as known by the
 * recmaster) out to every connected node via an async MODIFY_FLAGS
 * control.  Payload: struct ctdb_node_flag_change naming the node.
 */
2074 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2075 TDB_DATA data, void *private_data)
2078 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2079 struct ctdb_node_map *nodemap=NULL;
2080 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2084 /* find the recovery master */
2085 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2087 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2088 talloc_free(tmp_ctx);
2092 /* read the node flags from the recmaster */
2093 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2095 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2096 talloc_free(tmp_ctx);
/* the recmaster's nodemap must know about the node in question */
2099 if (c->pnn >= nodemap->num) {
2100 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2101 talloc_free(tmp_ctx);
2105 /* send the flags update to all connected nodes */
2106 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2108 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2109 nodes, 0, CONTROL_TIMEOUT(),
2113 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2115 talloc_free(tmp_ctx);
2119 talloc_free(tmp_ctx);
/* Shared state between verify_recmode() and its per-node async
 * callbacks: the aggregated monitor result (and, per the callbacks'
 * use of rmdata->count, an outstanding-reply counter not visible in
 * this excerpt). */
2123 struct verify_recmode_normal_data {
2125 enum monitor_result status;
/*
 * Async callback for one node's GET_RECMODE reply.  Downgrades the
 * aggregate status: control failure => MONITOR_FAILED (unless already
 * worse), node not in NORMAL mode => MONITOR_RECOVERY_NEEDED.
 */
2128 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2130 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2133 /* one more node has responded with recmode data*/
2136 /* if we failed to get the recmode, then return an error and let
2137 the main loop try again.
2139 if (state->state != CTDB_CONTROL_DONE) {
/* only overwrite OK - do not mask a stronger failure status */
2140 if (rmdata->status == MONITOR_OK) {
2141 rmdata->status = MONITOR_FAILED;
2146 /* if we got a response, then the recmode will be stored in the
2149 if (state->status != CTDB_RECOVERY_NORMAL) {
2150 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2151 rmdata->status = MONITOR_RECOVERY_NEEDED;
2158 /* verify that all nodes are in normal recovery mode */
/*
 * Verify that every active node is in NORMAL recovery mode.  Sends an
 * async GET_RECMODE to each active node, pumps the event loop until
 * all replies arrive, and returns the aggregated monitor result.
 */
2159 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2161 struct verify_recmode_normal_data *rmdata;
2162 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2163 struct ctdb_client_control_state *state;
2164 enum monitor_result status;
2167 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2168 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2170 rmdata->status = MONITOR_OK;
2172 /* loop over all active nodes and send an async getrecmode call to
2174 for (j=0; j<nodemap->num; j++) {
/* skip banned/stopped/disconnected nodes */
2175 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2178 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2180 nodemap->nodes[j].pnn);
2181 if (state == NULL) {
2182 /* we failed to send the control, treat this as
2183 an error and try again next iteration
2185 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2186 talloc_free(mem_ctx);
2187 return MONITOR_FAILED;
2190 /* set up the callback functions */
2191 state->async.fn = verify_recmode_normal_callback;
2192 state->async.private_data = rmdata;
2194 /* one more control to wait for to complete */
2199 /* now wait for up to the maximum number of seconds allowed
2200 or until all nodes we expect a response from has replied
2202 while (rmdata->count > 0) {
2203 event_loop_once(ctdb->ev);
/* copy the result out before freeing the context that owns rmdata */
2206 status = rmdata->status;
2207 talloc_free(mem_ctx);
/* Shared state between verify_recmaster() and its per-node async
 * callbacks: the recoverd (for culprit tracking), the pnn we expect
 * every node to report as recmaster, and the aggregate status.  An
 * outstanding-reply counter (rmdata->count) is used by the callbacks
 * but its field declaration is not visible in this excerpt. */
2212 struct verify_recmaster_data {
2213 struct ctdb_recoverd *rec;
2216 enum monitor_result status;
/*
 * Async callback for one node's GET_RECMASTER reply.  A control
 * failure downgrades the aggregate status to MONITOR_FAILED; a node
 * naming a different recmaster than us marks that node as culprit and
 * demands a new election (MONITOR_ELECTION_NEEDED).
 */
2219 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2221 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2224 /* one more node has responded with recmaster data*/
2227 /* if we failed to get the recmaster, then return an error and let
2228 the main loop try again.
2230 if (state->state != CTDB_CONTROL_DONE) {
/* only overwrite OK - do not mask a stronger failure status */
2231 if (rmdata->status == MONITOR_OK) {
2232 rmdata->status = MONITOR_FAILED;
2237 /* if we got a response, then the recmaster will be stored in the
2240 if (state->status != rmdata->pnn) {
2241 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2242 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2243 rmdata->status = MONITOR_ELECTION_NEEDED;
2250 /* verify that all nodes agree that we are the recmaster */
/*
 * Verify that every active node agrees that pnn (us) is the recovery
 * master.  Mirrors verify_recmode(): async GET_RECMASTER to each
 * active node, pump the event loop until all replies arrive, return
 * the aggregated monitor result.
 */
2251 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2253 struct ctdb_context *ctdb = rec->ctdb;
2254 struct verify_recmaster_data *rmdata;
2255 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2256 struct ctdb_client_control_state *state;
2257 enum monitor_result status;
2260 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2261 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2265 rmdata->status = MONITOR_OK;
2267 /* loop over all active nodes and send an async getrecmaster call to
2269 for (j=0; j<nodemap->num; j++) {
/* skip banned/stopped/disconnected nodes */
2270 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2273 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2275 nodemap->nodes[j].pnn);
2276 if (state == NULL) {
2277 /* we failed to send the control, treat this as
2278 an error and try again next iteration
2280 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2281 talloc_free(mem_ctx);
2282 return MONITOR_FAILED;
2285 /* set up the callback functions */
2286 state->async.fn = verify_recmaster_callback;
2287 state->async.private_data = rmdata;
2289 /* one more control to wait for to complete */
2294 /* now wait for up to the maximum number of seconds allowed
2295 or until all nodes we expect a response from has replied
2297 while (rmdata->count > 0) {
2298 event_loop_once(ctdb->ev);
/* copy the result out before freeing the context that owns rmdata */
2301 status = rmdata->status;
2302 talloc_free(mem_ctx);
2307 /* called to check that the allocation of public ip addresses is ok.
/*
 * Verify that this node actually serves exactly the public ips it is
 * assigned.  Reads uptime, then the public ip list, then uptime again:
 * if a recovery started/finished in between (or one is in progress),
 * the check is skipped as the data is not trustworthy.  On a mismatch
 * we ask the recmaster for a takeover run via CTDB_SRVID_TAKEOVER_RUN.
 */
2309 static int verify_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn)
2311 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2312 struct ctdb_all_public_ips *ips = NULL;
2313 struct ctdb_uptime *uptime1 = NULL;
2314 struct ctdb_uptime *uptime2 = NULL;
/* snapshot recovery timestamps BEFORE reading the ip list */
2317 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2318 CTDB_CURRENT_NODE, &uptime1);
2320 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2321 talloc_free(mem_ctx);
2325 /* read the ip allocation from the local node */
2326 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2328 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2329 talloc_free(mem_ctx);
/* snapshot recovery timestamps again AFTER reading the ip list */
2333 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2334 CTDB_CURRENT_NODE, &uptime2);
2336 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2337 talloc_free(mem_ctx);
2341 /* skip the check if the startrecovery time has changed */
2342 if (timeval_compare(&uptime1->last_recovery_started,
2343 &uptime2->last_recovery_started) != 0) {
2344 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2345 talloc_free(mem_ctx);
2349 /* skip the check if the endrecovery time has changed */
2350 if (timeval_compare(&uptime1->last_recovery_finished,
2351 &uptime2->last_recovery_finished) != 0) {
2352 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2353 talloc_free(mem_ctx);
2357 /* skip the check if we have started but not finished recovery */
2358 if (timeval_compare(&uptime1->last_recovery_finished,
2359 &uptime1->last_recovery_started) != 1) {
2360 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2361 talloc_free(mem_ctx);
2366 /* verify that we have the ip addresses we should have
2367 and we dont have ones we shouldnt have.
2368 if we find an inconsistency we set recmode to
2369 active on the local node and wait for the recmaster
2370 to do a full blown recovery
2372 for (j=0; j<ips->num; j++) {
2373 if (ips->ips[j].pnn == pnn) {
/* assigned to us but missing from the interface */
2374 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2375 struct takeover_run_reply rd;
2378 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2379 ctdb_addr_to_str(&ips->ips[j].addr)));
2383 data.dptr = (uint8_t *)&rd;
2384 data.dsize = sizeof(rd);
/* ask the recmaster to run a takeover to repair it */
2386 ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2388 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
/* assigned to another node but still present on our interface */
2392 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2393 struct takeover_run_reply rd;
2396 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2397 ctdb_addr_to_str(&ips->ips[j].addr)));
2401 data.dptr = (uint8_t *)&rd;
2402 data.dsize = sizeof(rd);
2404 ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2406 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2412 talloc_free(mem_ctx);
/*
 * Async callback for GET_NODEMAP replies: store node_pnn's nodemap
 * into the remote_nodemaps array, stealing the reply buffer so it
 * outlives the control state.  Out-of-range pnns are rejected.
 */
2417 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2419 struct ctdb_node_map **remote_nodemaps = callback_data;
2421 if (node_pnn >= ctdb->num_nodes) {
2422 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2426 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Pull the nodemap from every active node in parallel; results are
 * deposited into remote_nodemaps[] (indexed by pnn) by
 * async_getnodemap_callback.  Returns non-zero on any failure.
 */
2430 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2431 struct ctdb_node_map *nodemap,
2432 struct ctdb_node_map **remote_nodemaps)
2436 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2437 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2439 CONTROL_TIMEOUT(), false, tdb_null,
2440 async_getnodemap_callback,
2442 remote_nodemaps) != 0) {
2443 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Result codes for the recovery-lock checker child process. */
2451 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
/* State for one asynchronous recovery-lock check: the forked child,
 * the pipe it reports through (fd field not visible in this excerpt),
 * the timeout timer, the pipe fd-event, and the current status. */
2452 struct ctdb_check_reclock_state {
2453 struct ctdb_context *ctdb;
2454 struct timeval start_time;
2457 struct timed_event *te;
2458 struct fd_event *fde;
2459 enum reclock_child_status status;
2462 /* when we free the reclock state we must kill any child process.
/*
 * talloc destructor for a reclock check: report the lock-check
 * latency, close both ends of the pipe, and kill the child process
 * so it can never outlive the state it reports into.
 */
2464 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2466 struct ctdb_context *ctdb = state->ctdb;
/* record how long the check took (for latency statistics) */
2468 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2470 if (state->fd[0] != -1) {
2471 close(state->fd[0]);
2474 if (state->fd[1] != -1) {
2475 close(state->fd[1]);
2478 kill(state->child, SIGKILL);
2483 called if our check_reclock child times out. this would happen if
2484 i/o to the reclock file blocks.
/*
 * Timed-event callback: the reclock child did not answer in time,
 * typically because i/o to the reclock file on the cluster filesystem
 * is blocking.  Record RECLOCK_TIMEOUT for the waiter to act on.
 */
2486 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2487 struct timeval t, void *private_data)
2489 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2490 struct ctdb_check_reclock_state);
2492 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2493 state->status = RECLOCK_TIMEOUT;
2496 /* this is called when the child process has completed checking the reclock
2497 file and has written data back to us through the pipe.
/*
 * fd-event callback: the reclock child wrote its one-byte verdict to
 * the pipe.  Cancel the timeout timer, read the byte, and translate
 * it into RECLOCK_OK or RECLOCK_FAILED on the shared state.
 */
2499 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2500 uint16_t flags, void *private_data)
2502 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2503 struct ctdb_check_reclock_state);
2507 /* we got a response from our child process so we can abort the
/* free the timeout timer - it must not fire after a reply arrived */
2510 talloc_free(state->te);
/* read(2) on a pipe is used here; anything but a single RECLOCK_OK
   byte counts as failure */
2513 ret = read(state->fd[0], &c, 1);
2514 if (ret != 1 || c != RECLOCK_OK) {
2515 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2516 state->status = RECLOCK_FAILED;
2521 state->status = RECLOCK_OK;
/*
 * Verify that the recovery lock we hold is not stale.  Forks a child
 * that pread()s one byte from recovery_lock_fd (which would block if the
 * cluster filesystem has revoked/hung the lock) and reports the verdict
 * through a pipe.  The parent waits, driving the event loop, until the
 * child answers or the watchdog fires.  Returns failure (and drops
 * recovery_lock_fd) if the check fails; return statements themselves are
 * not visible in this excerpt.
 */
2525 static int check_recovery_lock(struct ctdb_context *ctdb)
2528 struct ctdb_check_reclock_state *state;
/* captured before fork() so the child can poll its parent below */
2529 pid_t parent = getpid();
2531 if (ctdb->recovery_lock_fd == -1) {
2532 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2536 state = talloc(ctdb, struct ctdb_check_reclock_state);
2537 CTDB_NO_MEMORY(ctdb, state);
2540 state->start_time = timeval_current();
2541 state->status = RECLOCK_CHECKING;
2545 ret = pipe(state->fd);
2548 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2552 state->child = fork();
2553 if (state->child == (pid_t)-1) {
2554 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2555 close(state->fd[0]);
2557 close(state->fd[1]);
/* child: probe the lock file and report one status byte to the parent */
2563 if (state->child == 0) {
2564 char cc = RECLOCK_OK;
2565 close(state->fd[0]);
/* a blocking pread here is exactly what the watchdog timeout catches */
2568 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2569 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2570 cc = RECLOCK_FAILED;
/* NOTE(review): write() return values are ignored here and below */
2573 write(state->fd[1], &cc, 1);
2574 /* make sure we die when our parent dies */
2575 while (kill(parent, 0) == 0 || errno != ESRCH) {
2577 write(state->fd[1], &cc, 1);
/* parent: keep only the read end of the pipe */
2581 close(state->fd[1]);
2583 set_close_on_exec(state->fd[0]);
2585 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
2587 talloc_set_destructor(state, check_reclock_destructor);
/* hard-coded 15s watchdog on the child's answer */
2589 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2590 ctdb_check_reclock_timeout, state);
2591 if (state->te == NULL) {
2592 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
/* NOTE(review): AUTOCLOSE means the event layer owns fd[0] from here on;
 * the destructor also closes fd[0] unless it is reset to -1 — confirm
 * that reset exists in the lines not visible here (double-close risk). */
2597 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2598 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2599 reclock_child_handler,
2602 if (state->fde == NULL) {
2603 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
/* pump the event loop until the handler or the watchdog sets a verdict */
2608 while (state->status == RECLOCK_CHECKING) {
2609 event_loop_once(ctdb->ev);
2612 if (state->status == RECLOCK_FAILED) {
2613 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
/* drop the stale lock fd; a later pass must re-acquire the lock */
2614 close(ctdb->recovery_lock_fd);
2615 ctdb->recovery_lock_fd = -1;
/*
 * Re-synchronise our idea of the recovery lock file with the main
 * daemon's current setting.  Handles four cases: (1) query fails;
 * (2) reclock has been disabled — forget our copy, close the fd and
 * turn off verification; (3) we had none and one is now set; (4) the
 * path changed — adopt the new one and invalidate the old fd.  Return
 * statements are not visible in this excerpt.
 */
2624 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2626 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2627 const char *reclockfile;
2629 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2630 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2631 talloc_free(tmp_ctx);
/* case: reclock has been disabled on the daemon side */
2635 if (reclockfile == NULL) {
2636 if (ctdb->recovery_lock_file != NULL) {
2637 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2638 talloc_free(ctdb->recovery_lock_file);
2639 ctdb->recovery_lock_file = NULL;
2640 if (ctdb->recovery_lock_fd != -1) {
2641 close(ctdb->recovery_lock_fd);
2642 ctdb->recovery_lock_fd = -1;
/* no reclock file => nothing to verify */
2645 ctdb->tunable.verify_recovery_lock = 0;
2646 talloc_free(tmp_ctx);
/* case: we did not have a reclock file before — adopt the new one */
2650 if (ctdb->recovery_lock_file == NULL) {
2651 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2652 if (ctdb->recovery_lock_fd != -1) {
2653 close(ctdb->recovery_lock_fd);
2654 ctdb->recovery_lock_fd = -1;
2656 talloc_free(tmp_ctx);
/* case: unchanged path — nothing to do */
2661 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2662 talloc_free(tmp_ctx);
/* case: the path changed — switch over and drop the old fd.
 * NOTE(review): verify_recovery_lock is forced to 0 here; presumably it
 * is re-read with the tunables on the next pass — confirm. */
2666 talloc_free(ctdb->recovery_lock_file);
2667 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2668 ctdb->tunable.verify_recovery_lock = 0;
2669 if (ctdb->recovery_lock_fd != -1) {
2670 close(ctdb->recovery_lock_fd);
2671 ctdb->recovery_lock_fd = -1;
2674 talloc_free(tmp_ctx);
2679 the main monitoring loop
/*
 * Main loop of the recovery daemon.  Registers the srvid message
 * handlers once, then iterates forever: ping the main daemon, refresh
 * tunables / reclock settings / pnn / vnnmap / nodemap, make sure a
 * recmaster exists and is healthy, and — when we are the recmaster —
 * compare every remote node's nodemap, flags and vnnmap against our
 * own, triggering do_recovery()/force_election() on any inconsistency.
 * NOTE(review): many control-transfer lines (continue/goto/closing
 * braces) between the visible statements are not part of this excerpt.
 */
2681 static void monitor_cluster(struct ctdb_context *ctdb)
2684 TALLOC_CTX *mem_ctx=NULL;
2685 struct ctdb_node_map *nodemap=NULL;
2686 struct ctdb_node_map *recmaster_nodemap=NULL;
2687 struct ctdb_node_map **remote_nodemaps=NULL;
2688 struct ctdb_vnn_map *vnnmap=NULL;
2689 struct ctdb_vnn_map *remote_vnnmap=NULL;
2690 int32_t debug_level;
2692 struct ctdb_recoverd *rec;
2694 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2696 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2697 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2701 rec->priority_time = timeval_current();
2703 /* register a message port for sending memory dumps */
2704 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2706 /* register a message port for recovery elections */
2707 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2709 /* when nodes are disabled/enabled */
2710 ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2712 /* when we are asked to push out a flag change */
2713 ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2715 /* register a message port for vacuum fetch */
2716 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2718 /* register a message port for reloadnodes */
2719 ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2721 /* register a message port for performing a takeover run */
2722 ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2724 /* register a message port for disabling the ip check for a short while */
2725 ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
/* per-iteration scratch context: freed and recreated each pass */
2729 talloc_free(mem_ctx);
2732 mem_ctx = talloc_new(ctdb);
2734 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2738 /* we only check for recovery once every second */
2739 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2741 /* verify that the main daemon is still running */
2742 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2743 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2747 /* ping the local daemon to tell it we are alive */
2748 ctdb_ctrl_recd_ping(ctdb);
2750 if (rec->election_timeout) {
2751 /* an election is in progress */
2755 /* read the debug level from the parent and update locally */
2756 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2758 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2761 LogLevel = debug_level;
2764 /* We must check if we need to ban a node here but we want to do this
2765 as early as possible so we dont wait until we have pulled the node
2766 map from the local node. thats why we have the hardcoded value 20
2768 for (i=0; i<ctdb->num_nodes; i++) {
2769 struct ctdb_banning_state *ban_state;
2771 if (ctdb->nodes[i]->ban_state == NULL) {
2774 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
2775 if (ban_state->count < 20) {
2778 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2779 ctdb->nodes[i]->pnn, ban_state->count,
2780 ctdb->tunable.recovery_ban_period));
2781 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2782 ban_state->count = 0;
2785 /* get relevant tunables */
2786 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2788 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2792 /* get the current recovery lock file from the server */
2793 if (update_recovery_lock_file(ctdb) != 0) {
2794 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2798 /* Make sure that if recovery lock verification becomes disabled when
2801 if (ctdb->tunable.verify_recovery_lock == 0) {
2802 if (ctdb->recovery_lock_fd != -1) {
2803 close(ctdb->recovery_lock_fd);
2804 ctdb->recovery_lock_fd = -1;
2808 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2809 if (pnn == (uint32_t)-1) {
2810 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2814 /* get the vnnmap */
2815 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2817 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2822 /* get number of nodes */
2824 talloc_free(rec->nodemap);
2825 rec->nodemap = NULL;
2828 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2830 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2833 nodemap = rec->nodemap;
2835 /* check which node is the recovery master */
2836 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2838 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2842 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2843 if (rec->recmaster != pnn) {
2844 if (rec->ip_reallocate_ctx != NULL) {
2845 talloc_free(rec->ip_reallocate_ctx);
2846 rec->ip_reallocate_ctx = NULL;
2847 rec->reallocate_callers = NULL;
2850 /* if there are takeovers requested, perform it and notify the waiters */
2851 if (rec->reallocate_callers) {
2852 process_ipreallocate_requests(ctdb, rec);
2855 if (rec->recmaster == (uint32_t)-1) {
2856 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2857 force_election(rec, pnn, nodemap);
2862 /* if the local daemon is STOPPED, we verify that the databases are
2863 also frozen and that the recmode is set to active
2865 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2866 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2868 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2870 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2871 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
2873 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2875 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
2878 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2880 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
2887 /* If the local node is stopped, verify we are not the recmaster
2888 and yield this role if so
2890 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
2891 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
2892 force_election(rec, pnn, nodemap);
2896 /* check that we (recovery daemon) and the local ctdb daemon
2897 agrees on whether we are banned or not
2901 /* remember our own node flags */
2902 rec->node_flags = nodemap->nodes[pnn].flags;
2904 /* count how many active nodes there are */
2905 rec->num_active = 0;
2906 rec->num_connected = 0;
2907 for (i=0; i<nodemap->num; i++) {
2908 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2911 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2912 rec->num_connected++;
2917 /* verify that the recmaster node is still active */
2918 for (j=0; j<nodemap->num; j++) {
2919 if (nodemap->nodes[j].pnn==rec->recmaster) {
2924 if (j == nodemap->num) {
2925 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2926 force_election(rec, pnn, nodemap);
2930 /* if recovery master is disconnected we must elect a new recmaster */
2931 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2932 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2933 force_election(rec, pnn, nodemap);
2937 /* grab the nodemap from the recovery master to check if it is banned */
2938 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2939 mem_ctx, &recmaster_nodemap);
2941 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2942 nodemap->nodes[j].pnn));
/* the recmaster's own view of itself (index j) must still be active */
2947 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2948 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2949 force_election(rec, pnn, nodemap);
2954 /* verify that we have all ip addresses we should have and we dont
2955 * have addresses we shouldnt have.
2957 if (ctdb->do_checkpublicip) {
2958 if (rec->ip_check_disable_ctx == NULL) {
2959 if (verify_ip_allocation(ctdb, rec, pnn) != 0) {
2960 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2966 /* if we are not the recmaster then we do not need to check
2967 if recovery is needed
2969 if (pnn != rec->recmaster) {
/* ---- from here on we are the recmaster ---- */
2974 /* ensure our local copies of flags are right */
2975 ret = update_local_flags(rec, nodemap);
2976 if (ret == MONITOR_ELECTION_NEEDED) {
2977 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2978 force_election(rec, pnn, nodemap);
2981 if (ret != MONITOR_OK) {
2982 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2986 /* update the list of public ips that a node can handle for
2989 if (ctdb->num_nodes != nodemap->num) {
2990 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2991 reload_nodes_file(ctdb);
2994 for (j=0; j<nodemap->num; j++) {
2995 /* release any existing data */
2996 if (ctdb->nodes[j]->public_ips) {
2997 talloc_free(ctdb->nodes[j]->public_ips);
2998 ctdb->nodes[j]->public_ips = NULL;
3001 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3005 /* grab a new shiny list of public ips from the node */
3006 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
3007 ctdb->nodes[j]->pnn,
3009 &ctdb->nodes[j]->public_ips)) {
3010 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
3011 ctdb->nodes[j]->pnn));
3017 /* verify that all active nodes agree that we are the recmaster */
3018 switch (verify_recmaster(rec, nodemap, pnn)) {
3019 case MONITOR_RECOVERY_NEEDED:
3020 /* can not happen */
3022 case MONITOR_ELECTION_NEEDED:
3023 force_election(rec, pnn, nodemap);
3027 case MONITOR_FAILED:
3032 if (rec->need_recovery) {
3033 /* a previous recovery didn't finish */
3034 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3038 /* verify that all active nodes are in normal mode
3039 and not in recovery mode
3041 switch (verify_recmode(ctdb, nodemap)) {
3042 case MONITOR_RECOVERY_NEEDED:
3043 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3045 case MONITOR_FAILED:
3047 case MONITOR_ELECTION_NEEDED:
3048 /* can not happen */
3054 if (ctdb->tunable.verify_recovery_lock != 0) {
3055 /* we should have the reclock - check its not stale */
3056 ret = check_recovery_lock(ctdb);
3058 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3059 ctdb_set_culprit(rec, ctdb->pnn);
3060 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3065 /* get the nodemap for all active remote nodes
3067 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3068 if (remote_nodemaps == NULL) {
3069 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3072 for(i=0; i<nodemap->num; i++) {
3073 remote_nodemaps[i] = NULL;
3075 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3076 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3080 /* verify that all other nodes have the same nodemap as we have
3082 for (j=0; j<nodemap->num; j++) {
3083 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3087 if (remote_nodemaps[j] == NULL) {
3088 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3089 ctdb_set_culprit(rec, j);
3094 /* if the nodes disagree on how many nodes there are
3095 then this is a good reason to try recovery
3097 if (remote_nodemaps[j]->num != nodemap->num) {
3098 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3099 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3100 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3101 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3105 /* if the nodes disagree on which nodes exist and are
3106 active, then that is also a good reason to do recovery
3108 for (i=0;i<nodemap->num;i++) {
3109 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3110 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3111 nodemap->nodes[j].pnn, i,
3112 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3113 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3114 do_recovery(rec, mem_ctx, pnn, nodemap,
3120 /* verify the flags are consistent
3122 for (i=0; i<nodemap->num; i++) {
3123 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3127 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3128 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3129 nodemap->nodes[j].pnn,
3130 nodemap->nodes[i].pnn,
3131 remote_nodemaps[j]->nodes[i].flags,
/* NOTE(review): BUG — the comparison above is against
 * nodemap->nodes[i].flags, but the message prints nodes[j].flags.
 * "our" value should be nodemap->nodes[i].flags (fixed in later
 * upstream CTDB); the printed value here can be misleading. */
3132 nodemap->nodes[j].flags));
3134 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3135 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3136 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3137 do_recovery(rec, mem_ctx, pnn, nodemap,
3141 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3142 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3143 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3144 do_recovery(rec, mem_ctx, pnn, nodemap,
3153 /* there better be the same number of lmasters in the vnn map
3154 as there are active nodes or we will have to do a recovery
3156 if (vnnmap->size != rec->num_active) {
3157 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3158 vnnmap->size, rec->num_active));
3159 ctdb_set_culprit(rec, ctdb->pnn);
3160 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3164 /* verify that all active nodes in the nodemap also exist in
3167 for (j=0; j<nodemap->num; j++) {
3168 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3171 if (nodemap->nodes[j].pnn == pnn) {
3175 for (i=0; i<vnnmap->size; i++) {
3176 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3180 if (i == vnnmap->size) {
3181 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3182 nodemap->nodes[j].pnn));
3183 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3184 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3190 /* verify that all other nodes have the same vnnmap
3191 and are from the same generation
3193 for (j=0; j<nodemap->num; j++) {
3194 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3197 if (nodemap->nodes[j].pnn == pnn) {
3201 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3202 mem_ctx, &remote_vnnmap);
3204 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3205 nodemap->nodes[j].pnn));
3209 /* verify the vnnmap generation is the same */
3210 if (vnnmap->generation != remote_vnnmap->generation) {
3211 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3212 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3213 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3214 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3218 /* verify the vnnmap size is the same */
3219 if (vnnmap->size != remote_vnnmap->size) {
3220 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3221 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3222 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3223 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3227 /* verify the vnnmap is the same */
3228 for (i=0;i<vnnmap->size;i++) {
3229 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3230 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3231 nodemap->nodes[j].pnn));
3232 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3233 do_recovery(rec, mem_ctx, pnn, nodemap,
3240 /* we might need to change who has what IP assigned */
3241 if (rec->need_takeover_run) {
3242 rec->need_takeover_run = false;
3244 /* execute the "startrecovery" event script on all nodes */
3245 ret = run_startrecovery_eventscript(rec, nodemap);
3247 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3248 ctdb_set_culprit(rec, ctdb->pnn);
3249 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3252 ret = ctdb_takeover_run(ctdb, nodemap);
3254 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3255 ctdb_set_culprit(rec, ctdb->pnn);
3256 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3259 /* execute the "recovered" event script on all nodes */
3260 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3262 // we cant check whether the event completed successfully
3263 // since this script WILL fail if the node is in recovery mode
3264 // and if that race happens, the code here would just cause a second
3265 // cascading recovery.
3267 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3268 ctdb_set_culprit(rec, ctdb->pnn);
3269 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3280 event handler for when the main ctdbd dies
/*
 * Fd-event handler in the recovery daemon: fires when the pipe to the
 * main ctdbd becomes readable/closed, i.e. the parent died.  Logs and
 * exits (the exit itself is outside this excerpt).
 */
3282 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3283 uint16_t flags, void *private_data)
3285 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3290 called regularly to verify that the recovery daemon is still running
/*
 * Timed event in the MAIN daemon: verify that the recovery daemon child
 * is still alive (kill(pid, 0) probe).  If it is gone, tear the whole
 * daemon down cleanly — the cluster must not run without a recoverd.
 * Re-arms itself every 30 seconds.
 */
3292 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3293 struct timeval yt, void *p)
3295 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3297 if (kill(ctdb->recoverd_pid, 0) != 0) {
3298 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
/* orderly shutdown: stop subsystems, release IPs, run the
 * "shutdown" event script, then (presumably) exit */
3300 ctdb_stop_recoverd(ctdb);
3301 ctdb_stop_keepalive(ctdb);
3302 ctdb_stop_monitoring(ctdb);
3303 ctdb_release_all_ips(ctdb);
3304 if (ctdb->methods != NULL) {
3305 ctdb->methods->shutdown(ctdb);
3307 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
/* schedule the next liveness check in 30 seconds */
3312 event_add_timed(ctdb->ev, ctdb,
3313 timeval_current_ofs(30, 0),
3314 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap exited children
 * (e.g. the reclock-check child) so they do not linger as zombies.
 * NOTE(review): only a single waitpid() call is visible here — confirm
 * the non-visible lines loop until waitpid() returns <= 0, since one
 * signal can stand for several exited children.
 */
3317 static void recd_sig_child_handler(struct event_context *ev,
3318 struct signal_event *se, int signum, int count,
3322 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
/* WNOHANG: never block inside the signal event handler */
3327 pid = waitpid(-1, &status, WNOHANG);
3329 if (errno != ECHILD) {
3330 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3335 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3341 startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon off the main ctdbd.  The parent arms the
 * 30s ctdb_check_recd liveness timer and returns; the child switches
 * into client mode, watches the pipe for parent death, installs the
 * SIGCHLD handler and enters monitor_cluster() — which never returns
 * in normal operation.  Returns 0 in the parent on success (error
 * return paths are not visible in this excerpt).
 */
3343 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3346 struct signal_event *se;
/* the pipe doubles as a parent-death detector in the child */
3348 if (pipe(fd) != 0) {
3352 ctdb->ctdbd_pid = getpid();
3354 ctdb->recoverd_pid = fork();
3355 if (ctdb->recoverd_pid == -1) {
/* parent: start checking that the recovery daemon stays alive */
3359 if (ctdb->recoverd_pid != 0) {
3361 event_add_timed(ctdb->ev, ctdb,
3362 timeval_current_ofs(30, 0),
3363 ctdb_check_recd, ctdb);
/* child: give it its own random seed, distinct from the parent's */
3369 srandom(getpid() ^ time(NULL));
3371 if (switch_from_server_to_client(ctdb) != 0) {
3372 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3376 DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* pipe read end goes readable/EOF when the main daemon dies */
3378 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3379 ctdb_recoverd_parent, &fd[0]);
3381 /* set up a handler to pick up sigchld */
3382 se = event_add_signal(ctdb->ev, ctdb,
3384 recd_sig_child_handler,
3387 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* does not return unless something went badly wrong */
3391 monitor_cluster(ctdb);
3393 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3398 shutdown the recovery daemon
/*
 * Terminate the recovery daemon child with SIGTERM.
 * No-op when no recovery daemon was ever started (pid == 0).
 */
3400 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3402 if (ctdb->recoverd_pid == 0) {
3406 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3407 kill(ctdb->recoverd_pid, SIGTERM);