4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
37 struct ip_reallocate_list {
/* singly linked list; next caller waiting for the takeover run */
38 struct ip_reallocate_list *next;
/* reply info (presumably pnn/srvid) used to message the waiting client — TODO confirm rd_memdump_reply layout */
39 struct rd_memdump_reply *rd;
/* per-node record of accumulated "culprit" credits used to decide banning */
42 struct ctdb_banning_state {
/* when this node last misbehaved; used to expire old credits */
44 struct timeval last_reported_time;
/* private state of the recovery daemon */
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
/* number of nodes currently connected to us */
54 uint32_t num_connected;
/* the node most recently blamed for needing a recovery */
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
/* set when public IPs need to be re-balanced across the cluster */
58 bool need_takeover_run;
/* pending timed events for the election protocol */
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout;
/* list of in-progress vacuum-fetch operations */
63 struct vacuum_info *vacuum_info;
/* context and list of "ctdb ipreallocate" callers awaiting a takeover run */
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
/* non-NULL while the ip-check is temporarily disabled */
66 TALLOC_CTX *ip_check_disable_ctx;
/* timeouts for controls sent by the recovery daemon; both are driven by
   tunables so the administrator can adjust them at runtime */
69 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
70 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
74 ban a node for a period of time: validate the pnn, then ask that node
   to ban itself via a SET_BAN control
76 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
79 struct ctdb_context *ctdb = rec->ctdb;
80 struct ctdb_ban_time bantime;
82 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
/* refuse to act on a pnn that is not part of our node map */
84 if (!ctdb_validate_pnn(ctdb, pnn)) {
85 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* ban_time in seconds; the target node enforces the ban itself */
90 bantime.time = ban_time;
92 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
/* failure is only logged; the next recovery run will retry */
94 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* outcome of one monitoring pass over the cluster */
100 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
104 run the "recovered" eventscript on all nodes, by broadcasting an
    END_RECOVERY control to every active node
106 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
111 tmp_ctx = talloc_new(ctdb);
112 CTDB_NO_MEMORY(ctdb, tmp_ctx);
/* broadcast END_RECOVERY to every active node; "caller" is only used
   for the error message below */
114 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
115 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
117 CONTROL_TIMEOUT(), false, tdb_null,
120 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
122 talloc_free(tmp_ctx);
126 talloc_free(tmp_ctx);
131 remember the trouble maker
133 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
135 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
136 struct ctdb_banning_state *ban_state;
138 if (culprit > ctdb->num_nodes) {
139 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
143 if (ctdb->nodes[culprit]->ban_state == NULL) {
144 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
145 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
149 ban_state = ctdb->nodes[culprit]->ban_state;
150 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
151 /* this was the first time in a long while this node
152 misbehaved so we will forgive any old transgressions.
154 ban_state->count = 0;
157 ban_state->count += count;
158 ban_state->last_reported_time = timeval_current();
159 rec->last_culprit_node = culprit;
/* convenience wrapper: charge a single ban credit to "culprit" */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
171 /* this callback is called for every node that failed to execute the
    "startrecovery" event; it marks that node as a recovery culprit
174 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
176 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
178 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
/* one ban credit per failure */
180 ctdb_set_culprit(rec, node_pnn);
184 run the "startrecovery" eventscript on all nodes by broadcasting a
    START_RECOVERY control to every active node
186 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
190 struct ctdb_context *ctdb = rec->ctdb;
192 tmp_ctx = talloc_new(ctdb);
193 CTDB_NO_MEMORY(ctdb, tmp_ctx);
/* nodes that fail the event get blamed via startrecovery_fail_callback */
195 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
198 CONTROL_TIMEOUT(), false, tdb_null,
200 startrecovery_fail_callback,
202 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
203 talloc_free(tmp_ctx);
207 talloc_free(tmp_ctx);
211 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
213 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
214 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
217 if (node_pnn < ctdb->num_nodes) {
218 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
223 update the node capabilities for all connected nodes by broadcasting a
    GET_CAPABILITIES control; replies are cached by async_getcap_callback
225 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
230 tmp_ctx = talloc_new(ctdb);
231 CTDB_NO_MEMORY(ctdb, tmp_ctx);
233 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
234 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
238 async_getcap_callback, NULL,
240 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
241 talloc_free(tmp_ctx);
245 talloc_free(tmp_ctx);
/* called for each node that failed to freeze during recovery; charge it
   one credit per cluster node so repeated failures get it banned quickly */
249 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
251 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
253 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
254 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* called for each node that failed TRANSACTION_START; same heavy
   penalty as a freeze failure (num credits == cluster size) */
257 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
259 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
261 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
262 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
266 change recovery mode on all nodes; when entering ACTIVE mode the
    databases are frozen first, one priority level at a time
268 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
274 tmp_ctx = talloc_new(ctdb);
275 CTDB_NO_MEMORY(ctdb, tmp_ctx);
277 /* freeze all nodes */
278 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
279 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
/* freeze each db priority band separately; priorities are 1-based */
282 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
283 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
288 set_recmode_fail_callback,
290 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
291 talloc_free(tmp_ctx);
/* now broadcast the new recovery mode itself */
298 data.dsize = sizeof(uint32_t);
299 data.dptr = (unsigned char *)&rec_mode;
301 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
308 talloc_free(tmp_ctx);
312 talloc_free(tmp_ctx);
317 change recovery master on all nodes: broadcast SET_RECMASTER with our
    pnn as payload to every active node
319 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
325 tmp_ctx = talloc_new(ctdb);
326 CTDB_NO_MEMORY(ctdb, tmp_ctx);
/* payload is the pnn of the new recovery master */
328 data.dsize = sizeof(uint32_t);
329 data.dptr = (unsigned char *)&pnn;
331 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
334 CONTROL_TIMEOUT(), false, data,
337 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
338 talloc_free(tmp_ctx);
342 talloc_free(tmp_ctx);
346 /* update all remote nodes to use the same db priority that we have
347 this can fail if the remote node has not yet been upgraded to
348 support this function, so we always return success and never fail
349 a recovery if this call fails.
351 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
352 struct ctdb_node_map *nodemap,
353 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
358 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
360 /* step through all local databases */
361 for (db=0; db<dbmap->num;db++) {
363 struct ctdb_db_priority db_prio;
/* read this db's priority from the local node ... */
366 db_prio.db_id = dbmap->dbs[db].dbid;
367 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
369 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
373 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
/* ... and broadcast it to all active nodes */
375 data.dptr = (uint8_t *)&db_prio;
376 data.dsize = sizeof(db_prio);
378 if (ctdb_client_async_control(ctdb,
379 CTDB_CONTROL_SET_DB_PRIORITY,
381 CONTROL_TIMEOUT(), false, data,
/* failure is logged but deliberately not treated as fatal (see above) */
384 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
392 ensure all other nodes have attached to any databases that we have
394 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
395 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
398 struct ctdb_dbid_map *remote_dbmap;
400 /* verify that all other nodes have all our databases */
401 for (j=0; j<nodemap->num; j++) {
402 /* we dont need to check ourselves */
403 if (nodemap->nodes[j].pnn == pnn) {
406 /* dont check nodes that are unavailable */
407 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* fetch the remote node's database list */
411 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
412 mem_ctx, &remote_dbmap);
414 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
418 /* step through all local databases */
419 for (db=0; db<dbmap->num;db++) {
/* linear scan of the remote map for a matching dbid */
423 for (i=0;i<remote_dbmap->num;i++) {
424 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
428 /* the remote node already have this database */
429 if (i!=remote_dbmap->num) {
432 /* ok so we need to create this database */
433 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
436 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
/* attach the remote node with the same persistence flag as ours */
439 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
440 mem_ctx, name, dbmap->dbs[db].persistent);
442 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
453 ensure we are attached to any databases that anyone else is attached to
455 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
456 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
459 struct ctdb_dbid_map *remote_dbmap;
461 /* verify that we have all database any other node has */
462 for (j=0; j<nodemap->num; j++) {
463 /* we dont need to check ourselves */
464 if (nodemap->nodes[j].pnn == pnn) {
467 /* dont check nodes that are unavailable */
468 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
472 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
473 mem_ctx, &remote_dbmap);
475 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
479 /* step through all databases on the remote node */
480 for (db=0; db<remote_dbmap->num;db++) {
/* scan our own (dereferenced) dbmap for the remote dbid */
483 for (i=0;i<(*dbmap)->num;i++) {
484 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
488 /* we already have this db locally */
489 if (i!=(*dbmap)->num) {
492 /* ok so we need to create this database and
495 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
496 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
498 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
499 nodemap->nodes[j].pnn));
502 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
503 remote_dbmap->dbs[db].persistent);
/* re-read our dbmap since we just attached a new database;
   this is why the caller passes dbmap by reference */
505 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
508 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
521 pull the remote database contents from one node into the recdb,
    merging records by rsn (highest sequence number wins)
523 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
524 struct tdb_wrap *recdb, uint32_t dbid,
529 struct ctdb_marshall_buffer *reply;
530 struct ctdb_rec_data *rec;
532 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
/* fetch all records of this db from srcnode in one marshalled blob */
534 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
535 CONTROL_TIMEOUT(), &outdata);
537 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
538 talloc_free(tmp_ctx);
542 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the reply is at least large enough for the header */
544 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
545 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
546 talloc_free(tmp_ctx);
/* walk the marshalled records; each record is length-prefixed so the
   next record starts rec->length bytes further on */
550 rec = (struct ctdb_rec_data *)&reply->data[0];
554 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
556 struct ctdb_ltdb_header *hdr;
/* key bytes come first in rec->data, followed by the value */
559 key.dptr = &rec->data[0];
560 key.dsize = rec->keylen;
561 data.dptr = &rec->data[key.dsize];
562 data.dsize = rec->datalen;
564 hdr = (struct ctdb_ltdb_header *)data.dptr;
566 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
567 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
568 talloc_free(tmp_ctx);
572 /* fetch the existing record, if any */
573 existing = tdb_fetch(recdb->tdb, key);
575 if (existing.dptr != NULL) {
576 struct ctdb_ltdb_header header;
577 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
578 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
579 (unsigned)existing.dsize, srcnode));
581 talloc_free(tmp_ctx);
584 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing copy unless the incoming record has a newer rsn
   (or equal rsn from a node other than the recovery master) */
586 if (!(header.rsn < hdr->rsn ||
587 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
/* the incoming record wins: overwrite it in the recovery db */
592 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
593 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
594 talloc_free(tmp_ctx);
599 talloc_free(tmp_ctx);
605 pull all the remote database contents into the recdb
607 static int pull_remote_database(struct ctdb_context *ctdb,
608 struct ctdb_recoverd *rec,
609 struct ctdb_node_map *nodemap,
610 struct tdb_wrap *recdb, uint32_t dbid,
615 /* pull all records from all other nodes across onto this node
616 (this merges based on rsn)
618 for (j=0; j<nodemap->num; j++) {
619 /* dont merge from nodes that are unavailable */
620 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
623 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
624 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
625 nodemap->nodes[j].pnn));
/* a failed pull earns the source node a full round of ban credits */
626 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
636 update flags on all active nodes: push node "pnn"'s flags out via a
    MODIFY_FLAGS broadcast (set "flags", clear everything else)
638 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
642 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
644 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
652 ensure all nodes have the same vnnmap we do
654 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
655 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
659 /* push the new vnn map out to all the nodes */
660 for (j=0; j<nodemap->num; j++) {
661 /* dont push to nodes that are unavailable */
662 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
666 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
/* NOTE(review): the message prints "pnn" (our node) rather than the
   node that actually failed — looks like it should be nodes[j].pnn */
668 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* state for one in-flight vacuum-fetch operation (doubly linked into
   rec->vacuum_info) */
678 struct vacuum_info *next, *prev;
679 struct ctdb_recoverd *rec;
681 struct ctdb_db_context *ctdb_db;
/* marshalled records still to be fetched; r points at the current one */
682 struct ctdb_marshall_buffer *recs;
683 struct ctdb_rec_data *r;
/* forward declaration: the fetch loop and its completion callback call
   each other */
686 static void vacuum_fetch_next(struct vacuum_info *v);
689 called when a vacuum fetch has completed - just free it and do the next one
691 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
693 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
695 vacuum_fetch_next(v);
700 process the next element from the vacuum list: migrate the record to
    this node via a NULL call, skipping records we cannot lock or that
    are already local
702 static void vacuum_fetch_next(struct vacuum_info *v)
704 struct ctdb_call call;
705 struct ctdb_rec_data *r;
707 while (v->recs->count) {
708 struct ctdb_client_call_state *state;
710 struct ctdb_ltdb_header *hdr;
/* a NULL call with IMMEDIATE_MIGRATION just pulls the record here */
713 call.call_id = CTDB_NULL_FUNC;
714 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance v->r to the next marshalled record before issuing the call */
717 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
720 call.key.dptr = &r->data[0];
721 call.key.dsize = r->keylen;
723 /* ensure we don't block this daemon - just skip a record if we can't get
725 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
729 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
730 if (data.dptr == NULL) {
731 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
/* record too small to carry an ltdb header - skip it */
735 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
737 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
741 hdr = (struct ctdb_ltdb_header *)data.dptr;
742 if (hdr->dmaster == v->rec->ctdb->pnn) {
743 /* its already local */
745 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
/* issue the migration call asynchronously and wait for the callback */
751 state = ctdb_call_send(v->ctdb_db, &call);
752 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
754 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
758 state->async.fn = vacuum_fetch_callback;
759 state->async.private_data = v;
768 destroy a vacuum info structure: unlink it from the recoverd's list so
    the duplicate-source check stays accurate
770 static int vacuum_info_destructor(struct vacuum_info *v)
772 DLIST_REMOVE(v->rec->vacuum_info, v);
778 handler for vacuum fetch messages: another node asks us to take over a
    batch of records; we attach to the database and start fetching them
780 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
781 TDB_DATA data, void *private_data)
783 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
784 struct ctdb_marshall_buffer *recs;
786 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
788 struct ctdb_dbid_map *dbmap=NULL;
789 bool persistent = false;
790 struct ctdb_db_context *ctdb_db;
791 struct ctdb_rec_data *r;
793 struct vacuum_info *v;
/* the message payload is a marshalled record buffer */
795 recs = (struct ctdb_marshall_buffer *)data.dptr;
796 r = (struct ctdb_rec_data *)&recs->data[0];
798 if (recs->count == 0) {
799 talloc_free(tmp_ctx);
/* ignore the request if we are already vacuuming this db from the
   same source node */
805 for (v=rec->vacuum_info;v;v=v->next) {
806 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
807 /* we're already working on records from this node */
808 talloc_free(tmp_ctx);
813 /* work out if the database is persistent */
814 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
816 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
817 talloc_free(tmp_ctx);
821 for (i=0;i<dbmap->num;i++) {
822 if (dbmap->dbs[i].dbid == recs->db_id) {
823 persistent = dbmap->dbs[i].persistent;
827 if (i == dbmap->num) {
828 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
829 talloc_free(tmp_ctx);
833 /* find the name of this database */
834 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
835 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
836 talloc_free(tmp_ctx);
/* attach to the database so we can issue calls against it */
841 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
842 if (ctdb_db == NULL) {
843 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
844 talloc_free(tmp_ctx);
/* set up the vacuum_info tracking structure, owned by rec */
848 v = talloc_zero(rec, struct vacuum_info);
850 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
851 talloc_free(tmp_ctx);
856 v->srcnode = srcnode;
857 v->ctdb_db = ctdb_db;
/* copy the record buffer - "data" belongs to the message handler */
858 v->recs = talloc_memdup(v, recs, data.dsize);
859 if (v->recs == NULL) {
860 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
862 talloc_free(tmp_ctx);
865 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
867 DLIST_ADD(rec->vacuum_info, v);
/* the destructor unlinks v from rec->vacuum_info on free */
869 talloc_set_destructor(v, vacuum_info_destructor);
871 vacuum_fetch_next(v);
872 talloc_free(tmp_ctx);
877 called when ctdb_wait_timeout should finish: flips the caller's
    timed_out flag so its event loop exits
879 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
880 struct timeval yt, void *p)
882 uint32_t *timed_out = (uint32_t *)p;
887 wait for a given number of seconds while still servicing events
889 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
891 uint32_t timed_out = 0;
892 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
/* pump the event loop; ctdb_wait_handler sets timed_out when done */
894 event_loop_once(ctdb->ev);
899 called when an election times out (ends)
901 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
902 struct timeval t, void *p)
904 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
/* clearing this pointer is what makes ctdb_wait_election return */
905 rec->election_timeout = NULL;
907 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
912 wait for an election to finish. It finished election_timeout seconds after
913 the last election packet is received
915 static void ctdb_wait_election(struct ctdb_recoverd *rec)
917 struct ctdb_context *ctdb = rec->ctdb;
/* spin the event loop until ctdb_election_timeout clears the pointer */
918 while (rec->election_timeout) {
919 event_loop_once(ctdb->ev);
924 Update our local flags from all remote connected nodes.
925 This is only run when we are, or we believe we are, the recovery master
927 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
930 struct ctdb_context *ctdb = rec->ctdb;
931 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
933 /* get the nodemap for all active remote nodes and verify
934 they are the same as for this node
936 for (j=0; j<nodemap->num; j++) {
937 struct ctdb_node_map *remote_nodemap=NULL;
/* skip disconnected nodes and ourselves */
940 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
943 if (nodemap->nodes[j].pnn == ctdb->pnn) {
947 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
948 mem_ctx, &remote_nodemap);
950 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
951 nodemap->nodes[j].pnn));
952 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
953 talloc_free(mem_ctx);
954 return MONITOR_FAILED;
/* compare the remote node's own view of its flags with ours */
956 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
957 /* We should tell our daemon about this so it
958 updates its flags or else we will log the same
959 message again in the next iteration of recovery.
960 Since we are the recovery master we can just as
961 well update the flags on all nodes.
963 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
965 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
969 /* Update our local copy of the flags in the recovery
972 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
973 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
974 nodemap->nodes[j].flags));
975 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
977 talloc_free(remote_nodemap);
979 talloc_free(mem_ctx);
984 /* Create a new random generation ip.
985 The generation id can not be the INVALID_GENERATION id
987 static uint32_t new_generation(void)
992 generation = random();
994 if (generation != INVALID_GENERATION) {
1004 create a temporary working database used to merge all remote copies
     during recovery
1006 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1009 struct tdb_wrap *recdb;
1012 /* open up the temporary recovery database */
1013 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1014 ctdb->db_directory_state,
/* no locking needed - only the recovery daemon touches this tdb */
1021 tdb_flags = TDB_NOLOCK;
1022 if (ctdb->valgrinding) {
1023 tdb_flags |= TDB_NOMMAP;
1025 tdb_flags |= TDB_DISALLOW_NESTING;
/* O_EXCL: the stale file from a previous run must have been removed */
1027 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1028 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1029 if (recdb == NULL) {
1030 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1040 parameters for the traverse function that pulls all relevant records
     from the recdb into one marshalled buffer
1043 struct ctdb_context *ctdb;
/* growing marshall buffer of records collected so far */
1044 struct ctdb_marshall_buffer *recdata;
/* tdb traverse callback: marshall one recdb record into params->recdata,
   rewriting dmaster to this node for non-persistent databases */
1050 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1052 struct recdb_data *params = (struct recdb_data *)p;
1053 struct ctdb_rec_data *rec;
1054 struct ctdb_ltdb_header *hdr;
1056 /* skip empty records */
1057 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1061 /* update the dmaster field to point to us */
1062 hdr = (struct ctdb_ltdb_header *)data.dptr;
1063 if (!params->persistent) {
1064 hdr->dmaster = params->ctdb->pnn;
1067 /* add the record to the blob ready to send to the nodes */
1068 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1070 params->failed = true;
/* grow the buffer and append the marshalled record at the tail */
1073 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1074 if (params->recdata == NULL) {
1075 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1076 rec->length + params->len, params->recdata->count));
1077 params->failed = true;
1080 params->recdata->count++;
1081 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1082 params->len += rec->length;
1089 push the recdb database out to all nodes
1091 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1093 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1095 struct recdb_data params;
1096 struct ctdb_marshall_buffer *recdata;
1098 TALLOC_CTX *tmp_ctx;
1101 tmp_ctx = talloc_new(ctdb);
1102 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1104 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1105 CTDB_NO_MEMORY(ctdb, recdata);
1107 recdata->db_id = dbid;
1110 params.recdata = recdata;
1111 params.len = offsetof(struct ctdb_marshall_buffer, data);
1112 params.failed = false;
1113 params.persistent = persistent;
1115 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1116 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1117 talloc_free(params.recdata);
1118 talloc_free(tmp_ctx);
1122 if (params.failed) {
1123 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1124 talloc_free(params.recdata);
1125 talloc_free(tmp_ctx);
1129 recdata = params.recdata;
1131 outdata.dptr = (void *)recdata;
1132 outdata.dsize = params.len;
1134 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1135 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1137 CONTROL_TIMEOUT(), false, outdata,
1140 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1141 talloc_free(recdata);
1142 talloc_free(tmp_ctx);
1146 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1147 dbid, recdata->count));
1149 talloc_free(recdata);
1150 talloc_free(tmp_ctx);
1157 go through a full recovery on one database: pull every remote copy
     into a temporary recdb, wipe the db cluster-wide inside the given
     transaction, then push the merged contents back out
1159 static int recover_database(struct ctdb_recoverd *rec,
1160 TALLOC_CTX *mem_ctx,
1164 struct ctdb_node_map *nodemap,
1165 uint32_t transaction_id)
1167 struct tdb_wrap *recdb;
1169 struct ctdb_context *ctdb = rec->ctdb;
1171 struct ctdb_control_wipe_database w;
1174 recdb = create_recdb(ctdb, mem_ctx);
1175 if (recdb == NULL) {
1179 /* pull all remote databases onto the recdb */
1180 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1182 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1186 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1188 /* wipe all the remote databases. This is safe as we are in a transaction */
1190 w.transaction_id = transaction_id;
1192 data.dptr = (void *)&w;
1193 data.dsize = sizeof(w);
1195 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1198 CONTROL_TIMEOUT(), false, data,
1201 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1206 /* push out the correct database. This sets the dmaster and skips
1207 the empty records */
1208 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1214 /* all done with this database */
1221 reload the nodes file: re-read the on-disk nodes list into the ctdb
     context
1223 static void reload_nodes_file(struct ctdb_context *ctdb)
1226 ctdb_load_nodes_file(ctdb);
1231 we are the recmaster, and recovery is needed - start a recovery run
1233 static int do_recovery(struct ctdb_recoverd *rec,
1234 TALLOC_CTX *mem_ctx, uint32_t pnn,
1235 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1237 struct ctdb_context *ctdb = rec->ctdb;
1239 uint32_t generation;
1240 struct ctdb_dbid_map *dbmap;
1243 struct timeval start_time;
1245 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1247 /* if recovery fails, force it again */
1248 rec->need_recovery = true;
1250 for (i=0; i<ctdb->num_nodes; i++) {
1251 struct ctdb_banning_state *ban_state;
1253 if (ctdb->nodes[i]->ban_state == NULL) {
1256 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1257 if (ban_state->count < 2*ctdb->num_nodes) {
1260 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1261 ctdb->nodes[i]->pnn, ban_state->count,
1262 ctdb->tunable.recovery_ban_period));
1263 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1264 ban_state->count = 0;
1268 if (ctdb->tunable.verify_recovery_lock != 0) {
1269 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1270 start_time = timeval_current();
1271 if (!ctdb_recovery_lock(ctdb, true)) {
1272 ctdb_set_culprit(rec, pnn);
1273 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1276 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1277 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1280 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1282 /* get a list of all databases */
1283 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1285 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1289 /* we do the db creation before we set the recovery mode, so the freeze happens
1290 on all databases we will be dealing with. */
1292 /* verify that we have all the databases any other node has */
1293 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1295 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1299 /* verify that all other nodes have all our databases */
1300 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1302 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1305 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1307 /* update the database priority for all remote databases */
1308 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1310 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1312 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1315 /* set recovery mode to active on all nodes */
1316 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1318 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1322 /* execute the "startrecovery" event script on all nodes */
1323 ret = run_startrecovery_eventscript(rec, nodemap);
1325 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1330 update all nodes to have the same flags that we have
1332 for (i=0;i<nodemap->num;i++) {
1333 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1337 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1339 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1344 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1346 /* pick a new generation number */
1347 generation = new_generation();
1349 /* change the vnnmap on this node to use the new generation
1350 number but not on any other nodes.
1351 this guarantees that if we abort the recovery prematurely
1352 for some reason (a node stops responding?)
1353 that we can just return immediately and we will reenter
1354 recovery shortly again.
1355 I.e. we deliberately leave the cluster with an inconsistent
1356 generation id to allow us to abort recovery at any stage and
1357 just restart it from scratch.
1359 vnnmap->generation = generation;
1360 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1362 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1366 data.dptr = (void *)&generation;
1367 data.dsize = sizeof(uint32_t);
1369 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1370 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1372 CONTROL_TIMEOUT(), false, data,
1374 transaction_start_fail_callback,
1376 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1377 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1379 CONTROL_TIMEOUT(), false, tdb_null,
1383 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1388 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1390 for (i=0;i<dbmap->num;i++) {
1391 ret = recover_database(rec, mem_ctx,
1393 dbmap->dbs[i].persistent,
1394 pnn, nodemap, generation);
1396 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1401 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1403 /* commit all the changes */
1404 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1406 CONTROL_TIMEOUT(), false, data,
1409 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1413 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1416 /* update the capabilities for all nodes */
1417 ret = update_capabilities(ctdb, nodemap);
1419 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1423 /* build a new vnn map with all the currently active and
1425 generation = new_generation();
1426 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1427 CTDB_NO_MEMORY(ctdb, vnnmap);
1428 vnnmap->generation = generation;
1430 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1431 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1432 for (i=j=0;i<nodemap->num;i++) {
1433 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1436 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1437 /* this node can not be an lmaster */
1438 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1443 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1444 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1445 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1448 if (vnnmap->size == 0) {
1449 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1451 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1452 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1453 vnnmap->map[0] = pnn;
1456 /* update to the new vnnmap on all nodes */
1457 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1459 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1463 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1465 /* update recmaster to point to us for all nodes */
1466 ret = set_recovery_master(ctdb, nodemap, pnn);
1468 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1472 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1475 update all nodes to have the same flags that we have
1477 for (i=0;i<nodemap->num;i++) {
1478 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1482 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1484 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1489 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1491 /* disable recovery mode */
1492 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1494 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1498 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1501 tell nodes to takeover their public IPs
1503 rec->need_takeover_run = false;
1504 ret = ctdb_takeover_run(ctdb, nodemap);
1506 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1509 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1511 /* execute the "recovered" event script on all nodes */
1512 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1514 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1518 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1520 /* send a message to all clients telling them that the cluster
1521 has been reconfigured */
1522 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1524 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1526 rec->need_recovery = false;
1528 /* we managed to complete a full recovery, make sure to forgive
1529 any past sins by the nodes that could now participate in the
1532 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1533 for (i=0;i<nodemap->num;i++) {
1534 struct ctdb_banning_state *ban_state;
1536 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1540 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1541 if (ban_state == NULL) {
1545 ban_state->count = 0;
1549 /* We just finished a recovery successfully.
1550 We now wait for rerecovery_timeout before we allow
1551 another recovery to take place.
1553 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1554 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1555 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
1562 elections are won by first checking the number of connected nodes, then
1563 the priority time, then the pnn
/* Wire format of an election broadcast. Sent via CTDB_SRVID_RECOVERY;
   compared field-by-field in ctdb_election_win() in the order described
   in the comment above. */
1565 struct election_message {
1566 uint32_t num_connected; /* nodes the sender sees as not DISCONNECTED */
1567 struct timeval priority_time; /* sender's recoverd start time; older wins */
1569 uint32_t node_flags; /* sender's own flags (BANNED/STOPPED disqualify it) */
1573  form this nodes election data
/* Fill *em with this node's election credentials: pnn, daemon start time,
   own node flags and the number of connected nodes.  Also refreshes
   rec->node_flags from the current nodemap as a side effect. */
1575 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1578 struct ctdb_node_map *nodemap;
1579 struct ctdb_context *ctdb = rec->ctdb;
1583 em->pnn = rec->ctdb->pnn;
1584 em->priority_time = rec->priority_time;
1586 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1588 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1592 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1593 em->node_flags = rec->node_flags;
/* count every node that is not disconnected (including ourselves) */
1595 for (i=0;i<nodemap->num;i++) {
1596 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1597 em->num_connected++;
1601 /* we shouldnt try to win this election if we cant be a recmaster */
1602 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
/* deliberately make ourselves the worst candidate: zero connections
   and the youngest possible priority time */
1603 em->num_connected = 0;
1604 em->priority_time = timeval_current();
1607 talloc_free(nodemap);
1611  see if the given election data wins
/* Decide whether WE beat the election data in *em.  Eligibility checks
   first (capability, banned, stopped), then compare: most connected
   nodes, then oldest priority_time, then lowest pnn. */
1613 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1615 struct election_message myem;
1618 ctdb_election_data(rec, &myem);
1620 /* we cant win if we dont have the recmaster capability */
1621 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1625 /* we cant win if we are banned */
1626 if (rec->node_flags & NODE_FLAGS_BANNED) {
1630 /* we cant win if we are stopped */
1631 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1635 /* we will automatically win if the other node is banned */
1636 if (em->node_flags & NODE_FLAGS_BANNED) {
1640 /* we will automatically win if the other node is stopped */
1641 if (em->node_flags & NODE_FLAGS_STOPPED) {
1645 /* try to use the most connected node */
1647 cmp = (int)myem.num_connected - (int)em->num_connected;
1650 /* then the longest running node */
1652 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break: the lower pnn wins */
1656 cmp = (int)myem.pnn - (int)em->pnn;
1663  send out an election request
/* Broadcast our election data to all nodes on CTDB_SRVID_RECOVERY.
   If update_recmaster is true we optimistically set the recmaster of
   the local node (pnn) to ourselves before the election settles. */
1665 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1668 TDB_DATA election_data;
1669 struct election_message emsg;
1671 struct ctdb_context *ctdb = rec->ctdb;
1673 srvid = CTDB_SRVID_RECOVERY;
1675 ctdb_election_data(rec, &emsg);
1677 election_data.dsize = sizeof(struct election_message);
1678 election_data.dptr = (unsigned char *)&emsg;
1681 /* send an election message to all active nodes */
1682 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1683 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1686 /* A new node that is already frozen has entered the cluster.
1687 The existing nodes are not frozen and dont need to be frozen
1688 until the election has ended and we start the actual recovery
1690 if (update_recmaster == true) {
1691 /* first we assume we will win the election and set
1692 recoverymaster to be ourself on the current node
1694 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1696 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1706  this function will unban all nodes in the cluster
/* Clear the BANNED flag on every connected node that currently has it.
   Called e.g. when we lose the recovery lock and concede the election. */
1708 static void unban_all_nodes(struct ctdb_context *ctdb)
1711 struct ctdb_node_map *nodemap;
1712 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1714 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1716 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1720 for (i=0;i<nodemap->num;i++) {
1721 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1722 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
/* modflags with clear-mask NODE_FLAGS_BANNED and no set-mask */
1723 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1727 talloc_free(tmp_ctx);
1732  we think we are winning the election - send a broadcast election request
/* Timed-event callback: re-broadcast our election request (without
   touching the recmaster setting) and clear the one-shot timer. */
1734 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1736 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1739 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1741 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
/* the timer has fired; drop our reference so a new one can be armed */
1744 talloc_free(rec->send_election_te);
1745 rec->send_election_te = NULL;
1749  handler for memory dumps
/* SRVID handler: a client asked the recovery master for a talloc memory
   dump.  data carries a struct rd_memdump_reply with the pnn/srvid to
   send the dump back to. */
1751 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1752 TDB_DATA data, void *private_data)
1754 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1757 struct rd_memdump_reply *rd;
/* validate the reply-address payload before touching it */
1759 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1760 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1761 talloc_free(tmp_ctx);
1764 rd = (struct rd_memdump_reply *)data.dptr;
1766 dump = talloc_zero(tmp_ctx, TDB_DATA);
1768 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1769 talloc_free(tmp_ctx);
1772 ret = ctdb_dump_memory(ctdb, dump);
1774 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1775 talloc_free(tmp_ctx);
1779 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* ship the dump back to the requester at the address it supplied */
1781 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1783 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1784 talloc_free(tmp_ctx);
1788 talloc_free(tmp_ctx);
1792  handler for reload_nodes
/* SRVID handler: re-read the nodes file on request ("ctdb reloadnodes").
   The message payload is unused. */
1794 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1795 TDB_DATA data, void *private_data)
1797 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1799 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1801 reload_nodes_file(rec->ctdb);
/* Timed-event callback: the "disable ip check" window has expired.
   Freeing ip_check_disable_ctx (and NULLing it) re-enables the periodic
   public-IP verification done in verify_ip_allocation(). */
1805 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1806 struct timeval yt, void *p)
1808 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1810 talloc_free(rec->ip_check_disable_ctx);
1811 rec->ip_check_disable_ctx = NULL;
/* SRVID handler: disable the public-IP consistency check for the number
   of seconds carried in the (uint32_t) message payload.  The disable
   window is represented by rec->ip_check_disable_ctx: while it is
   non-NULL the check is off; a timed event (reenable_ip_check) frees it
   when the timeout expires.  An existing window is cancelled first so
   repeated requests restart the timer.
   (Fixes two typos in the log strings: "expexting" and "recaived".) */
1814 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1815 TDB_DATA data, void *private_data)
1817 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* cancel any currently active disable window before arming a new one */
1820 if (rec->ip_check_disable_ctx != NULL) {
1821 talloc_free(rec->ip_check_disable_ctx);
1822 rec->ip_check_disable_ctx = NULL;
/* validate the payload: exactly one uint32_t timeout in seconds */
1825 if (data.dsize != sizeof(uint32_t)) {
1826 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1827 "expecting %lu\n", (long unsigned)data.dsize,
1828 (long unsigned)sizeof(uint32_t)));
1831 if (data.dptr == NULL) {
1832 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1836 timeout = *((uint32_t *)data.dptr);
1837 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
1839 rec->ip_check_disable_ctx = talloc_new(rec);
1840 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
/* timer is parented to the disable context so freeing the context
   also cancels the timer */
1842 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1847  handler for ip reallocate, just add it to the list of callers and
1848  handle this later in the monitor_cluster loop so we do not recurse
1849  with other callers to takeover_run()
/* SRVID handler: queue an "ctdb ipreallocate" request.  The caller's
   reply address (rd_memdump_reply) is stolen onto a per-batch context
   and pushed onto rec->reallocate_callers; the actual takeover run and
   replies happen later in process_ipreallocate_requests(). */
1851 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1852 TDB_DATA data, void *private_data)
1854 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1855 struct ip_reallocate_list *caller;
1857 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1858 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
/* lazily create the batch context that owns all queued callers */
1862 if (rec->ip_reallocate_ctx == NULL) {
1863 rec->ip_reallocate_ctx = talloc_new(rec);
1864 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
1867 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1868 CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* take ownership of the message payload; it holds the reply address */
1870 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1871 caller->next = rec->reallocate_callers;
1872 rec->reallocate_callers = caller;
/* Drain the queue built by ip_reallocate_handler(): run a single
   takeover run, then send its int32 result to every queued caller that
   asked for a reply (srvid != 0), and reset the queue. */
1877 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1879 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1882 struct ip_reallocate_list *callers;
1884 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1885 ret = ctdb_takeover_run(ctdb, rec->nodemap);
/* the reply payload is the takeover run's return code */
1886 result.dsize = sizeof(int32_t);
1887 result.dptr = (uint8_t *)&ret;
1889 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1891 /* Someone that sent srvid==0 does not want a reply */
1892 if (callers->rd->srvid == 0) {
1895 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
1896 "%u:%llu\n", (unsigned)callers->rd->pnn,
1897 (unsigned long long)callers->rd->srvid));
1898 ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1900 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
1901 "message to %u:%llu\n",
1902 (unsigned)callers->rd->pnn,
1903 (unsigned long long)callers->rd->srvid));
1907 talloc_free(tmp_ctx);
/* freeing the batch context frees every queued caller in one go */
1908 talloc_free(rec->ip_reallocate_ctx);
1909 rec->ip_reallocate_ctx = NULL;
1910 rec->reallocate_callers = NULL;
1916  handler for recovery master elections
/* SRVID handler for incoming election packets.  Restarts the election
   timeout, then either (a) we win: schedule a delayed re-broadcast of
   our own election request, or (b) we lose: cancel any pending
   broadcast, release the recovery lock if we hold it, and record the
   sender as recmaster on the local node. */
1918 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1919 TDB_DATA data, void *private_data)
1921 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1923 struct election_message *em = (struct election_message *)data.dptr;
1924 TALLOC_CTX *mem_ctx;
1926 /* we got an election packet - update the timeout for the election */
1927 talloc_free(rec->election_timeout);
1928 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1929 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1930 ctdb_election_timeout, rec);
1932 mem_ctx = talloc_new(ctdb);
1934 /* someone called an election. check their election data
1935 and if we disagree and we would rather be the elected node,
1936 send a new election message to all other nodes
1938 if (ctdb_election_win(rec, em)) {
1939 if (!rec->send_election_te) {
/* delay 0.5s so near-simultaneous elections can settle */
1940 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1941 timeval_current_ofs(0, 500000),
1942 election_send_request, rec);
1944 talloc_free(mem_ctx);
1945 /*unban_all_nodes(ctdb);*/
/* we lost: stop campaigning */
1950 talloc_free(rec->send_election_te);
1951 rec->send_election_te = NULL;
1953 if (ctdb->tunable.verify_recovery_lock != 0) {
1954 /* release the recmaster lock */
1955 if (em->pnn != ctdb->pnn &&
1956 ctdb->recovery_lock_fd != -1) {
1957 close(ctdb->recovery_lock_fd);
1958 ctdb->recovery_lock_fd = -1;
1959 unban_all_nodes(ctdb);
1963 /* ok, let that guy become recmaster then */
1964 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1966 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1967 talloc_free(mem_ctx);
1971 talloc_free(mem_ctx);
1977  force the start of the election process
/* Kick off a recmaster election: freeze the cluster (recovery mode
   active), arm the election timeout, broadcast our election request
   (claiming recmaster on the local node), then block until the election
   window has passed. */
1979 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1980 struct ctdb_node_map *nodemap)
1983 struct ctdb_context *ctdb = rec->ctdb;
1985 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1987 /* set all nodes to recovery mode to stop all internode traffic */
1988 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1990 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1994 talloc_free(rec->election_timeout);
1995 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1996 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1997 ctdb_election_timeout, rec);
/* true: also optimistically set ourselves as recmaster locally */
1999 ret = send_election_request(rec, pnn, true);
2001 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2005 /* wait for a few seconds to collect all responses */
2006 ctdb_wait_election(rec);
2012  handler for when a node changes its flags
/* SRVID handler: a node's flags changed.  Look the node up in the local
   nodemap, record its new flags, and — if we are the recmaster and the
   cluster is in normal mode — flag a takeover run when the DISABLED bit
   changed (other flag changes are handled by the recovery path).
   (Fixes a typo in the log string: "non-existant".) */
2014 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2015 TDB_DATA data, void *private_data)
2018 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2019 struct ctdb_node_map *nodemap=NULL;
2020 TALLOC_CTX *tmp_ctx;
2021 uint32_t changed_flags;
2023 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2024 int disabled_flag_changed;
2026 if (data.dsize != sizeof(*c)) {
2027 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2031 tmp_ctx = talloc_new(ctdb);
2032 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2034 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2036 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2037 talloc_free(tmp_ctx);
/* locate the affected node by pnn in the local nodemap */
2042 for (i=0;i<nodemap->num;i++) {
2043 if (nodemap->nodes[i].pnn == c->pnn) break;
2046 if (i == nodemap->num) {
2047 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existent node %u\n", c->pnn));
2048 talloc_free(tmp_ctx);
2052 changed_flags = c->old_flags ^ c->new_flags;
2054 if (nodemap->nodes[i].flags != c->new_flags) {
2055 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* did the permanently-disabled bit flip relative to our view? */
2058 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2060 nodemap->nodes[i].flags = c->new_flags;
2062 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2063 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2066 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2067 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2071 ctdb->recovery_master == ctdb->pnn &&
2072 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2073 /* Only do the takeover run if the perm disabled or unhealthy
2074 flags changed since these will cause an ip failover but not
2076 If the node became disconnected or banned this will also
2077 lead to an ip address failover but that is handled
2080 if (disabled_flag_changed) {
2081 rec->need_takeover_run = true;
2085 talloc_free(tmp_ctx);
2089  handler for when we need to push out flag changes ot all other nodes
/* SRVID handler: take the authoritative flags for node c->pnn from the
   recmaster's nodemap and push them to every connected node with an
   async MODIFY_FLAGS control. */
2091 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2092 TDB_DATA data, void *private_data)
2095 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2096 struct ctdb_node_map *nodemap=NULL;
2097 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2101 /* find the recovery master */
2102 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2104 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2105 talloc_free(tmp_ctx);
2109 /* read the node flags from the recmaster */
2110 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2112 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2113 talloc_free(tmp_ctx);
2116 if (c->pnn >= nodemap->num) {
2117 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2118 talloc_free(tmp_ctx);
2122 /* send the flags update to all connected nodes */
2123 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2125 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2126 nodes, 0, CONTROL_TIMEOUT(),
2130 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2132 talloc_free(tmp_ctx);
2136 talloc_free(tmp_ctx);
/* Shared state for the async recmode poll in verify_recmode():
   outstanding-reply count plus the aggregated result. */
2140 struct verify_recmode_normal_data {
2142 enum monitor_result status; /* worst result seen so far (starts MONITOR_OK) */
/* Completion callback for one async getrecmode control.  Downgrades
   rmdata->status on transport failure or if the node reports it is not
   in normal recovery mode. */
2145 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2147 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2150 /* one more node has responded with recmode data*/
2153 /* if we failed to get the recmode, then return an error and let
2154 the main loop try again.
2156 if (state->state != CTDB_CONTROL_DONE) {
/* only downgrade OK -> FAILED; never overwrite a stronger verdict */
2157 if (rmdata->status == MONITOR_OK) {
2158 rmdata->status = MONITOR_FAILED;
2163 /* if we got a response, then the recmode will be stored in the
2166 if (state->status != CTDB_RECOVERY_NORMAL) {
2167 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2168 rmdata->status = MONITOR_RECOVERY_NEEDED;
2175 /* verify that all nodes are in normal recovery mode */
/* Poll every active node's recovery mode asynchronously, pump the event
   loop until all replies (or timeouts) arrive, and return the
   aggregated monitor_result. */
2176 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2178 struct verify_recmode_normal_data *rmdata;
2179 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2180 struct ctdb_client_control_state *state;
2181 enum monitor_result status;
2184 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2185 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2187 rmdata->status = MONITOR_OK;
2189 /* loop over all active nodes and send an async getrecmode call to
2191 for (j=0; j<nodemap->num; j++) {
2192 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2195 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2197 nodemap->nodes[j].pnn);
2198 if (state == NULL) {
2199 /* we failed to send the control, treat this as
2200 an error and try again next iteration
2202 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2203 talloc_free(mem_ctx);
2204 return MONITOR_FAILED;
2207 /* set up the callback functions */
2208 state->async.fn = verify_recmode_normal_callback;
2209 state->async.private_data = rmdata;
2211 /* one more control to wait for to complete */
2216 /* now wait for up to the maximum number of seconds allowed
2217 or until all nodes we expect a response from has replied
2219 while (rmdata->count > 0) {
2220 event_loop_once(ctdb->ev);
/* copy out before freeing the context that owns rmdata */
2223 status = rmdata->status;
2224 talloc_free(mem_ctx);
/* Shared state for the async recmaster poll in verify_recmaster():
   the recoverd, our pnn to compare replies against, and the result. */
2229 struct verify_recmaster_data {
2230 struct ctdb_recoverd *rec; /* needed to mark a disagreeing node as culprit */
2233 enum monitor_result status; /* aggregated verdict (starts MONITOR_OK) */
/* Completion callback for one async getrecmaster control.  A node that
   names someone other than us as recmaster triggers a new election
   (MONITOR_ELECTION_NEEDED) and is recorded as the culprit. */
2236 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2238 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2241 /* one more node has responded with recmaster data*/
2244 /* if we failed to get the recmaster, then return an error and let
2245 the main loop try again.
2247 if (state->state != CTDB_CONTROL_DONE) {
/* only downgrade OK -> FAILED; keep any stronger verdict */
2248 if (rmdata->status == MONITOR_OK) {
2249 rmdata->status = MONITOR_FAILED;
2254 /* if we got a response, then the recmaster will be stored in the
2257 if (state->status != rmdata->pnn) {
2258 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2259 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2260 rmdata->status = MONITOR_ELECTION_NEEDED;
2267 /* verify that all nodes agree that we are the recmaster */
/* Ask every active node who it thinks the recmaster is (async), pump
   the event loop until all replies arrive, and return the aggregated
   monitor_result (ELECTION_NEEDED if anyone disagrees with pnn). */
2268 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2270 struct ctdb_context *ctdb = rec->ctdb;
2271 struct verify_recmaster_data *rmdata;
2272 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2273 struct ctdb_client_control_state *state;
2274 enum monitor_result status;
2277 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2278 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2282 rmdata->status = MONITOR_OK;
2284 /* loop over all active nodes and send an async getrecmaster call to
2286 for (j=0; j<nodemap->num; j++) {
2287 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2290 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2292 nodemap->nodes[j].pnn);
2293 if (state == NULL) {
2294 /* we failed to send the control, treat this as
2295 an error and try again next iteration
2297 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2298 talloc_free(mem_ctx);
2299 return MONITOR_FAILED;
2302 /* set up the callback functions */
2303 state->async.fn = verify_recmaster_callback;
2304 state->async.private_data = rmdata;
2306 /* one more control to wait for to complete */
2311 /* now wait for up to the maximum number of seconds allowed
2312 or until all nodes we expect a response from has replied
2314 while (rmdata->count > 0) {
2315 event_loop_once(ctdb->ev);
/* copy out before freeing the context that owns rmdata */
2318 status = rmdata->status;
2319 talloc_free(mem_ctx);
2324 /* called to check that the allocation of public ip addresses is ok.
/* Compare the public IPs this node actually serves against what the
   ip-allocation table says it should serve.  The uptime is sampled
   before and after reading the IP list; if a recovery started/finished
   in between (or one is in progress) the check is skipped as stale.
   Any mismatch is reported to the recmaster via a TAKEOVER_RUN message
   rather than fixed locally. */
2326 static int verify_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn)
2328 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2329 struct ctdb_all_public_ips *ips = NULL;
2330 struct ctdb_uptime *uptime1 = NULL;
2331 struct ctdb_uptime *uptime2 = NULL;
2334 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2335 CTDB_CURRENT_NODE, &uptime1);
2337 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2338 talloc_free(mem_ctx);
2342 /* read the ip allocation from the local node */
2343 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2345 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2346 talloc_free(mem_ctx);
2350 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2351 CTDB_CURRENT_NODE, &uptime2);
2353 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2354 talloc_free(mem_ctx);
2358 /* skip the check if the startrecovery time has changed */
2359 if (timeval_compare(&uptime1->last_recovery_started,
2360 &uptime2->last_recovery_started) != 0) {
2361 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2362 talloc_free(mem_ctx);
2366 /* skip the check if the endrecovery time has changed */
2367 if (timeval_compare(&uptime1->last_recovery_finished,
2368 &uptime2->last_recovery_finished) != 0) {
2369 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2370 talloc_free(mem_ctx);
2374 /* skip the check if we have started but not finished recovery */
2375 if (timeval_compare(&uptime1->last_recovery_finished,
2376 &uptime1->last_recovery_started) != 1) {
2377 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2378 talloc_free(mem_ctx);
2383 /* verify that we have the ip addresses we should have
2384 and we dont have ones we shouldnt have.
2385 if we find an inconsistency we set recmode to
2386 active on the local node and wait for the recmaster
2387 to do a full blown recovery
2389 for (j=0; j<ips->num; j++) {
2390 if (ips->ips[j].pnn == pnn) {
/* assigned to us but not actually configured on an interface */
2391 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2392 struct takeover_run_reply rd;
2395 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2396 ctdb_addr_to_str(&ips->ips[j].addr)));
2400 data.dptr = (uint8_t *)&rd;
2401 data.dsize = sizeof(rd);
2403 ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2405 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
/* assigned elsewhere (or unassigned) but we still hold it */
2409 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2410 struct takeover_run_reply rd;
2413 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2414 ctdb_addr_to_str(&ips->ips[j].addr)));
2418 data.dptr = (uint8_t *)&rd;
2419 data.dsize = sizeof(rd);
2421 ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2423 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2429 talloc_free(mem_ctx);
/* Async-control callback for GET_NODEMAP: stash the nodemap returned by
   node_pnn into the remote_nodemaps[] array passed as callback_data,
   after bounds-checking the pnn.  Ownership of outdata is stolen. */
2434 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2436 struct ctdb_node_map **remote_nodemaps = callback_data;
2438 if (node_pnn >= ctdb->num_nodes) {
2439 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2443 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Fetch the nodemap from every active node in parallel; results land in
   remote_nodemaps[] (indexed by pnn) via async_getnodemap_callback. */
2447 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2448 struct ctdb_node_map *nodemap,
2449 struct ctdb_node_map **remote_nodemaps)
2453 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2454 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2456 CONTROL_TIMEOUT(), false, tdb_null,
2457 async_getnodemap_callback,
2459 remote_nodemaps) != 0) {
2460 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Result codes for the forked reclock-checking child; RECLOCK_OK is
   also the byte the child writes down the pipe on success. */
2468 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
/* State for one reclock check: the child pid, the pipe back from it,
   a timeout timer, an fd event for the pipe, and the outcome. */
2469 struct ctdb_check_reclock_state {
2470 struct ctdb_context *ctdb;
2471 struct timeval start_time; /* used to report lock latency on teardown */
2474 struct timed_event *te; /* fires if the child hangs on the reclock file */
2475 struct fd_event *fde; /* watches the read end of the pipe */
2476 enum reclock_child_status status;
2479 /* when we free the reclock state we must kill any child process.
/* talloc destructor: report how long the check took, close both pipe
   ends (if still open) and SIGKILL the child so it cannot outlive us. */
2481 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2483 struct ctdb_context *ctdb = state->ctdb;
2485 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2487 if (state->fd[0] != -1) {
2488 close(state->fd[0]);
2491 if (state->fd[1] != -1) {
2492 close(state->fd[1]);
2495 kill(state->child, SIGKILL);
2500  called if our check_reclock child times out. this would happen if
2501  i/o to the reclock file blocks.
/* Timed-event callback: the reclock child did not answer in time
   (blocked I/O on the cluster filesystem); mark the check as timed out. */
2503 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2504 struct timeval t, void *private_data)
2506 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2507 struct ctdb_check_reclock_state);
2509 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2510 state->status = RECLOCK_TIMEOUT;
2513 /* this is called when the child process has completed checking the reclock
2514    file and has written data back to us through the pipe.
     Reads one status byte from the pipe: RECLOCK_OK means the child could
     read the reclock file, anything else (or a short read) means failure.
2516 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2517 			     uint16_t flags, void *private_data)
2519 	struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2520 					     struct ctdb_check_reclock_state);
2524 	/* we got a response from our child process so we can abort the
     timeout event -- freeing the timed event cancels it. */
2527 	talloc_free(state->te);
2530 	ret = read(state->fd[0], &c, 1);
2531 	if (ret != 1 || c != RECLOCK_OK) {
2532 		DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2533 		state->status = RECLOCK_FAILED;
/* NOTE(review): an early return between the failure and success paths is
 * elided in this excerpt; the original returns before setting OK. */
2538 	state->status = RECLOCK_OK;
/*
 * Verify that the recovery lock we hold on the reclock file is not stale.
 *
 * Forks a child that pread()s one byte from recovery_lock_fd and reports
 * the result back through a pipe; the parent waits (with a 15s timeout)
 * in a private event loop until the child answers or times out.  A child
 * is used so that blocking i/o to a hung cluster filesystem cannot wedge
 * the recovery daemon itself.
 *
 * Returns 0 when the lock checks out; on failure the recovery_lock_fd is
 * closed and presumably -1 is returned (return statements are elided in
 * this excerpt).
 *
 * NOTE(review): several lines (returns, _exit in the child, sleep in the
 * child's keep-alive loop, closing braces) are elided from this view.
 */
2542 static int check_recovery_lock(struct ctdb_context *ctdb)
2545 	struct ctdb_check_reclock_state *state;
2546 	pid_t parent = getpid();
2548 	if (ctdb->recovery_lock_fd == -1) {
2549 		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2553 	state = talloc(ctdb, struct ctdb_check_reclock_state);
2554 	CTDB_NO_MEMORY(ctdb, state);
2557 	state->start_time = timeval_current();
2558 	state->status = RECLOCK_CHECKING;
2562 	ret = pipe(state->fd);
2565 		DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2569 	state->child = fork();
2570 	if (state->child == (pid_t)-1) {
2571 		DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2572 		close(state->fd[0]);
2574 		close(state->fd[1]);
/* ---- child process ---- */
2580 	if (state->child == 0) {
2581 		char cc = RECLOCK_OK;
2582 		close(state->fd[0]);
/* Probe the lock by reading one byte; failure to read means the lock
 * (or the filesystem under it) is no longer usable. */
2585 		if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2586 			DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2587 			cc = RECLOCK_FAILED;
2590 		write(state->fd[1], &cc, 1);
2591 		/* make sure we die when our parent dies */
2592 		while (kill(parent, 0) == 0 || errno != ESRCH) {
2594 			write(state->fd[1], &cc, 1);
/* ---- parent process ---- */
2598 	close(state->fd[1]);
2600 	set_close_on_exec(state->fd[0]);
2602 	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
/* Destructor kills the child and closes the pipe whatever path we exit by. */
2604 	talloc_set_destructor(state, check_reclock_destructor);
2606 	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2607 				    ctdb_check_reclock_timeout, state);
2608 	if (state->te == NULL) {
2609 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2614 	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2615 				EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2616 				reclock_child_handler,
2619 	if (state->fde == NULL) {
2620 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
/* Spin the event loop until the child answers or the 15s timer fires. */
2625 	while (state->status == RECLOCK_CHECKING) {
2626 		event_loop_once(ctdb->ev);
2629 	if (state->status == RECLOCK_FAILED) {
2630 		DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
/* Drop the stale lock fd so a later recovery re-takes the lock. */
2631 		close(ctdb->recovery_lock_fd);
2632 		ctdb->recovery_lock_fd = -1;
/*
 * Re-read the configured reclock file path from the main daemon and keep
 * our local copy (ctdb->recovery_lock_file / recovery_lock_fd) in sync.
 *
 * Three cases:
 *  - daemon reports no reclock file: disable lock verification and drop
 *    any open lock fd;
 *  - we had no local copy yet: adopt the daemon's path;
 *  - the path changed: adopt the new path, disable verification until the
 *    lock is re-taken, and drop the old fd.
 *
 * Returns 0 on success (return statements are elided in this excerpt;
 * presumably -1 when the daemon query fails).
 */
2641 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2643 	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2644 	const char *reclockfile;
2646 	if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2647 		DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2648 		talloc_free(tmp_ctx);
/* Case 1: reclock disabled on the daemon side. */
2652 	if (reclockfile == NULL) {
2653 		if (ctdb->recovery_lock_file != NULL) {
2654 			DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2655 			talloc_free(ctdb->recovery_lock_file);
2656 			ctdb->recovery_lock_file = NULL;
2657 			if (ctdb->recovery_lock_fd != -1) {
2658 				close(ctdb->recovery_lock_fd);
2659 				ctdb->recovery_lock_fd = -1;
2662 		ctdb->tunable.verify_recovery_lock = 0;
2663 		talloc_free(tmp_ctx);
/* Case 2: first time we learn the reclock path. */
2667 	if (ctdb->recovery_lock_file == NULL) {
2668 		ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2669 		if (ctdb->recovery_lock_fd != -1) {
2670 			close(ctdb->recovery_lock_fd);
2671 			ctdb->recovery_lock_fd = -1;
2673 		talloc_free(tmp_ctx);
/* Unchanged path: nothing to do. */
2678 	if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2679 		talloc_free(tmp_ctx);
/* Case 3: the path changed -- switch over and force the lock to be
 * re-acquired before verification resumes. */
2683 	talloc_free(ctdb->recovery_lock_file);
2684 	ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2685 	ctdb->tunable.verify_recovery_lock = 0;
2686 	if (ctdb->recovery_lock_fd != -1) {
2687 		close(ctdb->recovery_lock_fd);
2688 		ctdb->recovery_lock_fd = -1;
2691 	talloc_free(tmp_ctx);
2696 	the main monitoring loop
     Never returns in normal operation: after registering the recovery
     daemon's message handlers it loops forever, once per
     recover_interval, verifying cluster health and triggering elections
     or full recoveries as needed.
     NOTE(review): a large number of lines (loop header, gotos/continues,
     returns, closing braces) are elided from this excerpt; comments
     below describe only what the visible lines establish.
2698 static void monitor_cluster(struct ctdb_context *ctdb)
2701 	TALLOC_CTX *mem_ctx=NULL;
2702 	struct ctdb_node_map *nodemap=NULL;
2703 	struct ctdb_node_map *recmaster_nodemap=NULL;
2704 	struct ctdb_node_map **remote_nodemaps=NULL;
2705 	struct ctdb_vnn_map *vnnmap=NULL;
2706 	struct ctdb_vnn_map *remote_vnnmap=NULL;
2707 	int32_t debug_level;
2709 	struct ctdb_recoverd *rec;
2711 	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2713 	rec = talloc_zero(ctdb, struct ctdb_recoverd);
2714 	CTDB_NO_MEMORY_FATAL(ctdb, rec);
2718 	rec->priority_time = timeval_current();
2720 	/* register a message port for sending memory dumps */
2721 	ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2723 	/* register a message port for recovery elections */
2724 	ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2726 	/* when nodes are disabled/enabled */
2727 	ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2729 	/* when we are asked to puch out a flag change */
2730 	ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2732 	/* register a message port for vacuum fetch */
2733 	ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2735 	/* register a message port for reloadnodes */
2736 	ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2738 	/* register a message port for performing a takeover run */
2739 	ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2741 	/* register a message port for disabling the ip check for a short while */
2742 	ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
/* ---- top of the monitoring loop (loop construct elided in excerpt):
 * each iteration gets a fresh talloc context so per-iteration
 * allocations cannot accumulate. */
2746 	talloc_free(mem_ctx);
2749 	mem_ctx = talloc_new(ctdb);
2751 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2755 	/* we only check for recovery once every second */
2756 	ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2758 	/* verify that the main daemon is still running */
2759 	if (kill(ctdb->ctdbd_pid, 0) != 0) {
2760 		DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2764 	/* ping the local daemon to tell it we are alive */
2765 	ctdb_ctrl_recd_ping(ctdb);
2767 	if (rec->election_timeout) {
2768 		/* an election is in progress -- skip monitoring this round */
2772 	/* read the debug level from the parent and update locally */
2773 	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2775 		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2778 	LogLevel = debug_level;
2781 	/* We must check if we need to ban a node here but we want to do this
2782 	   as early as possible so we dont wait until we have pulled the node
2783 	   map from the local node. thats why we have the hardcoded value 20
2785 	for (i=0; i<ctdb->num_nodes; i++) {
2786 		struct ctdb_banning_state *ban_state;
2788 		if (ctdb->nodes[i]->ban_state == NULL) {
2791 		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
2792 		if (ban_state->count < 20) {
2795 		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2796 			ctdb->nodes[i]->pnn, ban_state->count,
2797 			ctdb->tunable.recovery_ban_period));
2798 		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2799 		ban_state->count = 0;
2802 	/* get relevant tunables */
2803 	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2805 		DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2809 	/* get the current recovery lock file from the server */
2810 	if (update_recovery_lock_file(ctdb) != 0) {
2811 		DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2815 	/* Make sure that if recovery lock verification becomes disabled when
     we close the file descriptor we hold for it */
2818 	if (ctdb->tunable.verify_recovery_lock == 0) {
2819 		if (ctdb->recovery_lock_fd != -1) {
2820 			close(ctdb->recovery_lock_fd);
2821 			ctdb->recovery_lock_fd = -1;
2825 	pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2826 	if (pnn == (uint32_t)-1) {
2827 		DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2831 	/* get the vnnmap */
2832 	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2834 		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2839 	/* get number of nodes -- refresh the cached nodemap on rec */
2841 		talloc_free(rec->nodemap);
2842 		rec->nodemap = NULL;
2845 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2847 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2850 	nodemap = rec->nodemap;
2852 	/* check which node is the recovery master */
2853 	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2855 		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2859 	/* if we are not the recmaster we can safely ignore any ip reallocate requests */
2860 	if (rec->recmaster != pnn) {
2861 		if (rec->ip_reallocate_ctx != NULL) {
2862 			talloc_free(rec->ip_reallocate_ctx);
2863 			rec->ip_reallocate_ctx = NULL;
2864 			rec->reallocate_callers = NULL;
2867 	/* if there are takeovers requested, perform it and notify the waiters */
2868 	if (rec->reallocate_callers) {
2869 		process_ipreallocate_requests(ctdb, rec);
2872 	if (rec->recmaster == (uint32_t)-1) {
2873 		DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2874 		force_election(rec, pnn, nodemap);
2879 	/* if the local daemon is STOPPED, we verify that the databases are
2880 	   also frozen and thet the recmode is set to active
2882 	if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2883 		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2885 			DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2887 		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2888 			DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
2890 			ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2892 				DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
2895 			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2897 				DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
2904 	/* If the local node is stopped, verify we are not the recmaster
2905 	   and yield this role if so
2907 	if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
2908 		DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
2909 		force_election(rec, pnn, nodemap);
2913 	/* check that we (recovery daemon) and the local ctdb daemon
2914 	   agrees on whether we are banned or not
2918 	/* remember our own node flags */
2919 	rec->node_flags = nodemap->nodes[pnn].flags;
2921 	/* count how many active nodes there are */
2922 	rec->num_active    = 0;
2923 	rec->num_connected = 0;
2924 	for (i=0; i<nodemap->num; i++) {
2925 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2928 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2929 			rec->num_connected++;
2934 	/* verify that the recmaster node is still active */
2935 	for (j=0; j<nodemap->num; j++) {
2936 		if (nodemap->nodes[j].pnn==rec->recmaster) {
2941 	if (j == nodemap->num) {
2942 		DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2943 		force_election(rec, pnn, nodemap);
2947 	/* if recovery master is disconnected we must elect a new recmaster */
2948 	if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2949 		DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2950 		force_election(rec, pnn, nodemap);
2954 	/* grap the nodemap from the recovery master to check if it is banned */
2955 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2956 				   mem_ctx, &recmaster_nodemap);
2958 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2959 			  nodemap->nodes[j].pnn));
2964 	if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2965 		DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2966 		force_election(rec, pnn, nodemap);
2971 	/* verify that we have all ip addresses we should have and we dont
2972 	 * have addresses we shouldnt have.
2974 	if (ctdb->do_checkpublicip) {
2975 		if (rec->ip_check_disable_ctx == NULL) {
2976 			if (verify_ip_allocation(ctdb, rec, pnn) != 0) {
2977 				DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2983 	/* if we are not the recmaster then we do not need to check
2984 	   if recovery is needed
2986 	if (pnn != rec->recmaster) {
/* ---- everything below this point runs only on the recmaster ---- */
2991 	/* ensure our local copies of flags are right */
2992 	ret = update_local_flags(rec, nodemap);
2993 	if (ret == MONITOR_ELECTION_NEEDED) {
2994 		DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2995 		force_election(rec, pnn, nodemap);
2998 	if (ret != MONITOR_OK) {
2999 		DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3003 	/* update the list of public ips that a node can handle for
     all connected nodes */
3006 	if (ctdb->num_nodes != nodemap->num) {
3007 		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3008 		reload_nodes_file(ctdb);
3011 	for (j=0; j<nodemap->num; j++) {
3012 		/* release any existing data */
3013 		if (ctdb->nodes[j]->public_ips) {
3014 			talloc_free(ctdb->nodes[j]->public_ips);
3015 			ctdb->nodes[j]->public_ips = NULL;
3018 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3022 		/* grab a new shiny list of public ips from the node */
3023 		if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
3024 			ctdb->nodes[j]->pnn,
3026 			&ctdb->nodes[j]->public_ips)) {
3027 			DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
3028 				ctdb->nodes[j]->pnn));
3034 	/* verify that all active nodes agree that we are the recmaster */
3035 	switch (verify_recmaster(rec, nodemap, pnn)) {
3036 	case MONITOR_RECOVERY_NEEDED:
3037 		/* can not happen */
3039 	case MONITOR_ELECTION_NEEDED:
3040 		force_election(rec, pnn, nodemap);
3044 	case MONITOR_FAILED:
3049 	if (rec->need_recovery) {
3050 		/* a previous recovery didn't finish */
3051 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3055 	/* verify that all active nodes are in normal mode
3056 	   and not in recovery mode
3058 	switch (verify_recmode(ctdb, nodemap)) {
3059 	case MONITOR_RECOVERY_NEEDED:
3060 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3062 	case MONITOR_FAILED:
3064 	case MONITOR_ELECTION_NEEDED:
3065 		/* can not happen */
3071 	if (ctdb->tunable.verify_recovery_lock != 0) {
3072 		/* we should have the reclock - check its not stale */
3073 		ret = check_recovery_lock(ctdb);
3075 			DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3076 			ctdb_set_culprit(rec, ctdb->pnn);
3077 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3082 	/* get the nodemap for all active remote nodes
     so we can cross-check membership and flags below */
3084 	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3085 	if (remote_nodemaps == NULL) {
3086 		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3089 	for(i=0; i<nodemap->num; i++) {
3090 		remote_nodemaps[i] = NULL;
3092 	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3093 		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3097 	/* verify that all other nodes have the same nodemap as we have
     -- any disagreement in count, membership or flags triggers recovery */
3099 	for (j=0; j<nodemap->num; j++) {
3100 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3104 		if (remote_nodemaps[j] == NULL) {
3105 			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3106 			ctdb_set_culprit(rec, j);
3111 		/* if the nodes disagree on how many nodes there are
3112 		   then this is a good reason to try recovery
3114 		if (remote_nodemaps[j]->num != nodemap->num) {
3115 			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3116 				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3117 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3118 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3122 		/* if the nodes disagree on which nodes exist and are
3123 		   active, then that is also a good reason to do recovery
3125 		for (i=0;i<nodemap->num;i++) {
3126 			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3127 				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3128 					  nodemap->nodes[j].pnn, i,
3129 					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3130 				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3131 				do_recovery(rec, mem_ctx, pnn, nodemap,
3137 		/* verify the flags are consistent
     across our view and the remote node's view of every node */
3139 		for (i=0; i<nodemap->num; i++) {
3140 			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3144 			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3145 				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3146 				  nodemap->nodes[j].pnn,
3147 				  nodemap->nodes[i].pnn,
3148 				  remote_nodemaps[j]->nodes[i].flags,
3149 				  nodemap->nodes[j].flags));
/* A node is authoritative about its OWN flags: push the remote node's
 * self-reported flags; otherwise push our (recmaster) view. */
3151 					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3152 					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3153 					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3154 					do_recovery(rec, mem_ctx, pnn, nodemap,
3158 					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3159 					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3160 					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3161 					do_recovery(rec, mem_ctx, pnn, nodemap,
3170 	/* there better be the same number of lmasters in the vnn map
3171 	   as there are active nodes or we will have to do a recovery
3173 	if (vnnmap->size != rec->num_active) {
3174 		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3175 			  vnnmap->size, rec->num_active));
3176 		ctdb_set_culprit(rec, ctdb->pnn);
3177 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3181 	/* verify that all active nodes in the nodemap also exist in
     the vnnmap */
3184 	for (j=0; j<nodemap->num; j++) {
3185 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3188 		if (nodemap->nodes[j].pnn == pnn) {
3192 		for (i=0; i<vnnmap->size; i++) {
3193 			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3197 		if (i == vnnmap->size) {
3198 			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3199 				  nodemap->nodes[j].pnn));
3200 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3201 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3207 	/* verify that all other nodes have the same vnnmap
3208 	   and are from the same generation
3210 	for (j=0; j<nodemap->num; j++) {
3211 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3214 		if (nodemap->nodes[j].pnn == pnn) {
3218 		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3219 					  mem_ctx, &remote_vnnmap);
3221 			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3222 				  nodemap->nodes[j].pnn));
3226 		/* verify the vnnmap generation is the same */
3227 		if (vnnmap->generation != remote_vnnmap->generation) {
3228 			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3229 				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3230 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3231 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3235 		/* verify the vnnmap size is the same */
3236 		if (vnnmap->size != remote_vnnmap->size) {
3237 			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3238 				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3239 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3240 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3244 		/* verify the vnnmap is the same */
3245 		for (i=0;i<vnnmap->size;i++) {
3246 			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3247 				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3248 					  nodemap->nodes[j].pnn));
3249 				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3250 				do_recovery(rec, mem_ctx, pnn, nodemap,
3257 	/* we might need to change who has what IP assigned */
3258 	if (rec->need_takeover_run) {
3259 		rec->need_takeover_run = false;
3261 		/* execute the "startrecovery" event script on all nodes */
3262 		ret = run_startrecovery_eventscript(rec, nodemap);
3264 			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3265 			ctdb_set_culprit(rec, ctdb->pnn);
3266 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3269 		ret = ctdb_takeover_run(ctdb, nodemap);
3271 			DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3272 			ctdb_set_culprit(rec, ctdb->pnn);
3273 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3276 		/* execute the "recovered" event script on all nodes */
3277 		ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3279 		// we cant check whether the event completed successfully
3280 		// since this script WILL fail if the node is in recovery mode
3281 		// and if that race happens, the code here would just cause a second
3282 		// cascading recovery.
3284 			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3285 			ctdb_set_culprit(rec, ctdb->pnn);
3286 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3297 	event handler for when the main ctdbd dies
     Fired on the parent-death pipe (see ctdb_start_recoverd): the main
     daemon closing its end wakes this fd event, and the recovery daemon
     logs and exits (exit call elided in this excerpt).
3299 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3300 				 uint16_t flags, void *private_data)
3302 	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3307 	called regularly to verify that the recovery daemon is still running
     Runs in the MAIN daemon every 30 seconds; if the recoverd child has
     died, performs an orderly shutdown of the whole daemon, then
     re-arms itself for the next check.
3309 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3310 			    struct timeval yt, void *p)
3312 	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* kill(pid, 0) is a liveness probe only -- no signal is delivered. */
3314 	if (kill(ctdb->recoverd_pid, 0) != 0) {
3315 		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
/* Shutdown sequence: stop subsystems, release IPs, shut down transport,
 * run the "shutdown" event script.  The final exit is elided here. */
3317 		ctdb_stop_recoverd(ctdb);
3318 		ctdb_stop_keepalive(ctdb);
3319 		ctdb_stop_monitoring(ctdb);
3320 		ctdb_release_all_ips(ctdb);
3321 		if (ctdb->methods != NULL) {
3322 			ctdb->methods->shutdown(ctdb);
3324 		ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
/* Schedule the next liveness check in 30 seconds. */
3329 	event_add_timed(ctdb->ev, ctdb,
3330 			timeval_current_ofs(30, 0),
3331 			ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reaps exited children
 * (e.g. the check_reclock child) with a non-blocking waitpid so no
 * zombies accumulate.  NOTE(review): the surrounding loop that repeats
 * until waitpid returns <= 0 is elided in this excerpt.
 */
3334 static void recd_sig_child_handler(struct event_context *ev,
3335 	struct signal_event *se, int signum, int count,
3339 //	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3344 		pid = waitpid(-1, &status, WNOHANG);
/* ECHILD just means nothing left to reap; any other errno is logged. */
3346 			if (errno != ECHILD) {
3347 				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3352 			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3358 	startup the recovery daemon as a child of the main ctdb daemon
     Forks: the parent arms the 30s ctdb_check_recd liveness timer and
     returns; the child switches into client mode, watches the pipe so it
     exits when the parent dies, installs a SIGCHLD handler, and enters
     monitor_cluster() (which should never return).
     Returns 0 on success in the parent, -1 on pipe/fork failure
     (return statements elided in this excerpt).
3360 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3363 	struct signal_event *se;
/* fd[] is the parent-death pipe: the child holds the read end and gets
 * an fd event when the parent's write end closes on exit. */
3365 	if (pipe(fd) != 0) {
3369 	ctdb->ctdbd_pid = getpid();
3371 	ctdb->recoverd_pid = fork();
3372 	if (ctdb->recoverd_pid == -1) {
/* ---- parent: arm the recoverd liveness check and return ---- */
3376 	if (ctdb->recoverd_pid != 0) {
3378 		event_add_timed(ctdb->ev, ctdb,
3379 				timeval_current_ofs(30, 0),
3380 				ctdb_check_recd, ctdb);
/* ---- child: become the recovery daemon ---- */
/* Re-seed the PRNG so parent and child diverge (used for election timing
 * elsewhere -- NOTE(review): confirm usage outside this excerpt). */
3386 	srandom(getpid() ^ time(NULL));
3388 	if (switch_from_server_to_client(ctdb) != 0) {
3389 		DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3393 	DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3395 	event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3396 		     ctdb_recoverd_parent, &fd[0]);
3398 	/* set up a handler to pick up sigchld */
3399 	se = event_add_signal(ctdb->ev, ctdb,
3401 				     recd_sig_child_handler,
3404 		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3408 	monitor_cluster(ctdb);
/* monitor_cluster() loops forever; reaching here is a bug. */
3410 	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3415 shutdown the recovery daemon
3417 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3419 if (ctdb->recoverd_pid == 0) {
3423 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3424 kill(ctdb->recoverd_pid, SIGTERM);