4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
/* NOTE(review): singly-linked list node recording one "ctdb ipreallocate"
 * caller that must be notified once the takeover run completes; the
 * struct's closing brace is not visible in this chunk. */
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
/* reply context used to send the completion message back to the caller
 * -- presumably identifies the requesting client; verify against the
 * ip_reallocate handler code (not visible here). */
39 struct rd_memdump_reply *rd;
/* Per-node misbehaviour bookkeeping used to decide when to ban a node.
 * A "count" member (credits of blame) is also part of this struct --
 * it is used below in ctdb_set_culprit_count() -- but its declaration
 * falls in lines missing from this chunk. */
42 struct ctdb_banning_state {
/* when this node last misbehaved; old transgressions are forgiven
 * after tunable "recovery_grace_period" seconds (see
 * ctdb_set_culprit_count below) */
44 struct timeval last_reported_time;
48 private state of recovery daemon
/* NOTE(review): top-level state object for the recovery daemon; several
 * members and the closing brace are missing from this chunk. */
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
/* node most recently blamed for causing a recovery (set in
 * ctdb_set_culprit_count) */
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
/* set when an IP takeover run must be (re)done */
58 bool need_takeover_run;
61 struct timed_event *send_election_te;
/* non-NULL while an election is in progress; cleared by
 * ctdb_election_timeout() */
62 struct timed_event *election_timeout;
/* list of in-progress vacuum-fetch operations (struct vacuum_info) */
63 struct vacuum_info *vacuum_info;
64 TALLOC_CTX *ip_reallocate_ctx;
/* "ctdb ipreallocate" callers waiting for the next takeover run */
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
/* Timeouts for controls sent by the recovery daemon, derived from the
 * "recover_timeout" / "recover_interval" tunables.  Both macros expect a
 * local variable named "ctdb" to be in scope at the point of expansion. */
69 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
70 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
74 ban a node for a period of time
/* Ban node "pnn" for "ban_time" seconds by sending a SET_BAN control.
 * Invalid PNNs are rejected and logged.
 * NOTE(review): the declaration of "ret", the bantime.pnn assignment,
 * the early return on bad pnn and the closing braces fall in lines
 * missing from this chunk. */
76 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
79 struct ctdb_context *ctdb = rec->ctdb;
80 struct ctdb_ban_time bantime;
82 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
84 if (!ctdb_validate_pnn(ctdb, pnn)) {
85 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
90 bantime.time = ban_time;
92 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
94 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Outcome of a cluster-monitoring pass: everything OK, a recovery is
 * needed, a new recmaster election is needed, or monitoring itself
 * failed. */
100 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
104 run the "recovered" eventscript on all nodes
/* Broadcast CTDB_CONTROL_END_RECOVERY to every active node so each runs
 * its "recovered" event scripts.  "caller" is used only in the error log
 * to say who requested this.  Returns 0 on success; the error-path
 * return value and some braces fall in lines missing from this chunk. */
106 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
111 tmp_ctx = talloc_new(ctdb);
112 CTDB_NO_MEMORY(ctdb, tmp_ctx);
114 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
115 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
117 CONTROL_TIMEOUT(), false, tdb_null,
120 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
122 talloc_free(tmp_ctx);
126 talloc_free(tmp_ctx);
131 remember the trouble maker
133 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
135 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
136 struct ctdb_banning_state *ban_state;
138 if (culprit > ctdb->num_nodes) {
139 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
143 if (ctdb->nodes[culprit]->ban_state == NULL) {
144 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
145 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
149 ban_state = ctdb->nodes[culprit]->ban_state;
150 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
151 /* this was the first time in a long while this node
152 misbehaved so we will forgive any old transgressions.
154 ban_state->count = 0;
157 ban_state->count += count;
158 ban_state->last_reported_time = timeval_current();
159 rec->last_culprit_node = culprit;
163 remember the trouble maker
/* Convenience wrapper: blame "culprit" with a single credit. */
165 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
167 ctdb_set_culprit_count(rec, culprit, 1);
171 /* this callback is called for every node that failed to execute the
/* "startrecovery" event; the failing node is marked as a recovery-fail
 * culprit (one credit). */
174 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
176 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
178 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
180 ctdb_set_culprit(rec, node_pnn);
184 run the "startrecovery" eventscript on all nodes
/* Broadcast CTDB_CONTROL_START_RECOVERY to every active node; any node
 * that fails is blamed via startrecovery_fail_callback.  Returns 0 on
 * success; error-path return values fall in lines missing from this
 * chunk. */
186 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
190 struct ctdb_context *ctdb = rec->ctdb;
192 tmp_ctx = talloc_new(ctdb);
193 CTDB_NO_MEMORY(ctdb, tmp_ctx);
195 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
198 CONTROL_TIMEOUT(), false, tdb_null,
200 startrecovery_fail_callback,
202 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
203 talloc_free(tmp_ctx);
207 talloc_free(tmp_ctx);
211 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
213 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
214 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
217 if (node_pnn < ctdb->num_nodes) {
218 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
223 update the node capabilities for all connected nodes
/* Send CTDB_CONTROL_GET_CAPABILITIES to all active nodes; the replies
 * are stored into ctdb->nodes[] by async_getcap_callback.  Returns 0 on
 * success; error-path return values fall in lines missing from this
 * chunk. */
225 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
230 tmp_ctx = talloc_new(ctdb);
231 CTDB_NO_MEMORY(ctdb, tmp_ctx);
233 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
234 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
238 async_getcap_callback, NULL,
240 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
241 talloc_free(tmp_ctx);
245 talloc_free(tmp_ctx);
/* Called for each node that failed to freeze during recovery: blame it
 * heavily (nodemap->num credits at once) so that repeated freeze
 * failures accumulate towards a ban quickly. */
249 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
251 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
253 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
254 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Called for each node that failed to start the recovery transaction:
 * blame it with nodemap->num credits, same weighting as a freeze
 * failure. */
257 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
259 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
261 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
262 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
266 change recovery mode on all nodes
/* Set the recovery mode on all active nodes.  When entering
 * CTDB_RECOVERY_ACTIVE, first freeze every database priority band
 * (1..NUM_DB_PRIORITIES) on every node -- failures there are blamed via
 * set_recmode_fail_callback -- then broadcast SET_RECMODE with the new
 * mode.  Returns 0 on success; error-path returns fall in lines missing
 * from this chunk. */
268 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
274 tmp_ctx = talloc_new(ctdb);
275 CTDB_NO_MEMORY(ctdb, tmp_ctx);
277 /* freeze all nodes */
278 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
279 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
282 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
283 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
288 set_recmode_fail_callback,
290 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
291 talloc_free(tmp_ctx);
/* broadcast the new recovery mode to all active nodes */
298 data.dsize = sizeof(uint32_t);
299 data.dptr = (unsigned char *)&rec_mode;
301 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
308 talloc_free(tmp_ctx);
312 talloc_free(tmp_ctx);
317 change recovery master on all node
/* Tell every active node that "pnn" is the recovery master by
 * broadcasting CTDB_CONTROL_SET_RECMASTER.  Returns 0 on success;
 * error-path returns fall in lines missing from this chunk. */
319 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
325 tmp_ctx = talloc_new(ctdb);
326 CTDB_NO_MEMORY(ctdb, tmp_ctx);
328 data.dsize = sizeof(uint32_t);
329 data.dptr = (unsigned char *)&pnn;
331 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
334 CONTROL_TIMEOUT(), false, data,
337 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
338 talloc_free(tmp_ctx);
342 talloc_free(tmp_ctx);
346 /* update all remote nodes to use the same db priority that we have
347 this can fail if the remove node has not yet been upgraded to
348 support this function, so we always return success and never fail
349 a recovery if this call fails.
/* Best-effort: read each local database's priority and push it to all
 * active nodes via SET_DB_PRIORITY.  Per the comment above, failures are
 * only logged -- the function is expected to report success regardless
 * (the final return falls in lines missing from this chunk). */
351 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
352 struct ctdb_node_map *nodemap,
353 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
358 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
360 /* step through all local databases */
361 for (db=0; db<dbmap->num;db++) {
363 struct ctdb_db_priority db_prio;
366 db_prio.db_id = dbmap->dbs[db].dbid;
367 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
369 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
373 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
375 data.dptr = (uint8_t *)&db_prio;
376 data.dsize = sizeof(db_prio);
378 if (ctdb_client_async_control(ctdb,
379 CTDB_CONTROL_SET_DB_PRIORITY,
381 CONTROL_TIMEOUT(), false, data,
384 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
392 ensure all other nodes have attached to any databases that we have
/* For every other available node, fetch its dbmap and create (attach)
 * any database we have locally that it is missing.  Returns 0 on
 * success; error-path returns and some braces fall in lines missing
 * from this chunk. */
394 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
395 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
398 struct ctdb_dbid_map *remote_dbmap;
400 /* verify that all other nodes have all our databases */
401 for (j=0; j<nodemap->num; j++) {
402 /* no need to check our own node */
403 if (nodemap->nodes[j].pnn == pnn) {
406 /* dont check nodes that are unavailable */
407 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
411 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
412 mem_ctx, &remote_dbmap);
414 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
418 /* step through all local databases */
419 for (db=0; db<dbmap->num;db++) {
/* look for this local db in the remote node's dbmap */
423 for (i=0;i<remote_dbmap->num;i++) {
424 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
428 /* the remote node already have this database */
429 if (i!=remote_dbmap->num) {
432 /* ok so we need to create this database */
433 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
436 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
439 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
440 mem_ctx, name, dbmap->dbs[db].persistent);
442 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
453 ensure we are attached to any databases that anyone else is attached to
/* Mirror of create_missing_remote_databases: for every other available
 * node, fetch its dbmap and attach locally to any database it has that
 * we are missing, then re-read our own dbmap (through the "dbmap"
 * out-parameter) so callers see the updated list.  Returns 0 on
 * success; error-path returns fall in lines missing from this chunk. */
455 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
456 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
459 struct ctdb_dbid_map *remote_dbmap;
461 /* verify that we have all database any other node has */
462 for (j=0; j<nodemap->num; j++) {
463 /* no need to check our own node */
464 if (nodemap->nodes[j].pnn == pnn) {
467 /* dont check nodes that are unavailable */
468 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
472 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
473 mem_ctx, &remote_dbmap);
475 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
479 /* step through all databases on the remote node */
480 for (db=0; db<remote_dbmap->num;db++) {
/* look for this remote db in our local dbmap */
483 for (i=0;i<(*dbmap)->num;i++) {
484 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
488 /* we already have this db locally */
489 if (i!=(*dbmap)->num) {
492 /* ok so we need to create this database and
495 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
496 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
498 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
499 nodemap->nodes[j].pnn));
502 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
503 remote_dbmap->dbs[db].persistent);
505 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our view of the local dbmap now that we attached */
508 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
521 pull the remote database contents from one node into the recdb
/* Pull database "dbid" from node "srcnode" (CTDB PULL_DB control) and
 * merge each record into the temporary recovery tdb "recdb".  A record
 * already present in recdb is kept unless the incoming copy has a
 * higher rsn, or an equal rsn while the existing record's dmaster is
 * not the recovery master (see the condition at the bottom).  Returns 0
 * on success; several returns and loop headers fall in lines missing
 * from this chunk. */
523 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
524 struct tdb_wrap *recdb, uint32_t dbid)
528 struct ctdb_marshall_buffer *reply;
529 struct ctdb_rec_data *rec;
531 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
533 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
534 CONTROL_TIMEOUT(), &outdata);
536 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
537 talloc_free(tmp_ctx);
541 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the marshalled reply before walking it */
543 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
544 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
545 talloc_free(tmp_ctx);
549 rec = (struct ctdb_rec_data *)&reply->data[0];
/* walk the variable-length record list: each entry is advanced by its
 * own "length" field */
553 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
555 struct ctdb_ltdb_header *hdr;
/* key bytes come first in rec->data, followed by the value */
558 key.dptr = &rec->data[0];
559 key.dsize = rec->keylen;
560 data.dptr = &rec->data[key.dsize];
561 data.dsize = rec->datalen;
563 hdr = (struct ctdb_ltdb_header *)data.dptr;
565 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
566 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
567 talloc_free(tmp_ctx);
571 /* fetch the existing record, if any */
572 existing = tdb_fetch(recdb->tdb, key);
574 if (existing.dptr != NULL) {
575 struct ctdb_ltdb_header header;
576 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
577 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
578 (unsigned)existing.dsize, srcnode));
580 talloc_free(tmp_ctx);
583 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing copy unless the incoming record wins the
 * rsn/dmaster comparison described in the header comment */
585 if (!(header.rsn < hdr->rsn ||
586 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
591 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
592 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
593 talloc_free(tmp_ctx);
598 talloc_free(tmp_ctx);
604 pull all the remote database contents into the recdb
/* Merge database "dbid" from every available node into the temporary
 * recovery tdb; a node that fails to deliver its copy is blamed with
 * nodemap->num credits.  Returns 0 on success; error-path returns fall
 * in lines missing from this chunk. */
606 static int pull_remote_database(struct ctdb_context *ctdb,
607 struct ctdb_recoverd *rec,
608 struct ctdb_node_map *nodemap,
609 struct tdb_wrap *recdb, uint32_t dbid)
613 /* pull all records from all other nodes across onto this node
614 (this merges based on rsn)
616 for (j=0; j<nodemap->num; j++) {
617 /* dont merge from nodes that are unavailable */
618 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
621 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
622 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
623 nodemap->nodes[j].pnn));
624 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
634 update flags on all active nodes
/* Push node "pnn"'s flags out cluster-wide via the MODFLAGS control
 * (set "flags", clear everything else).  Returns 0 on success; the
 * return statements fall in lines missing from this chunk. */
636 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
640 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
642 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
650 ensure all nodes have the same vnnmap we do
/* Push "vnnmap" to every available node with SETVNNMAP.  Returns 0 on
 * success; error-path returns fall in lines missing from this chunk. */
652 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
653 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
657 /* push the new vnn map out to all the nodes */
658 for (j=0; j<nodemap->num; j++) {
659 /* dont push to nodes that are unavailable */
660 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
664 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
666 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* NOTE(review): these are the members of "struct vacuum_info" -- one
 * in-flight vacuum-fetch operation.  The opening "struct vacuum_info {"
 * line, the srcnode member (used by vacuum_fetch_handler below) and the
 * closing brace fall in lines missing from this chunk. */
676 struct vacuum_info *next, *prev;
677 struct ctdb_recoverd *rec;
679 struct ctdb_db_context *ctdb_db;
/* private copy of the marshalled records being processed */
680 struct ctdb_marshall_buffer *recs;
/* cursor into recs->data, advanced by vacuum_fetch_next() */
681 struct ctdb_rec_data *r;
/* forward declaration: process the next queued record */
684 static void vacuum_fetch_next(struct vacuum_info *v);
687 called when a vacuum fetch has completed - just free it and do the next one
/* Completion callback for the migration call issued by
 * vacuum_fetch_next(): continue with the next queued record. */
689 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
691 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
693 vacuum_fetch_next(v);
698 process the next element from the vacuum list
/* Walk v->recs and, for each record, issue a CTDB_NULL_FUNC call with
 * CTDB_IMMEDIATE_MIGRATION to migrate the record to this node.  Records
 * are skipped when the chain lock cannot be taken without blocking,
 * when the local fetch fails, or when this node is already the dmaster.
 * When a call is actually sent, vacuum_fetch_callback resumes the loop
 * asynchronously.  Some loop/return lines are missing from this chunk. */
700 static void vacuum_fetch_next(struct vacuum_info *v)
702 struct ctdb_call call;
703 struct ctdb_rec_data *r;
705 while (v->recs->count) {
706 struct ctdb_client_call_state *state;
708 struct ctdb_ltdb_header *hdr;
711 call.call_id = CTDB_NULL_FUNC;
712 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance the cursor past the current variable-length record */
715 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
718 call.key.dptr = &r->data[0];
719 call.key.dsize = r->keylen;
721 /* ensure we don't block this daemon - just skip a record if we can't get
723 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
727 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
728 if (data.dptr == NULL) {
729 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
733 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
735 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
739 hdr = (struct ctdb_ltdb_header *)data.dptr;
740 if (hdr->dmaster == v->rec->ctdb->pnn) {
741 /* its already local */
743 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
749 state = ctdb_call_send(v->ctdb_db, &call);
750 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
752 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
/* resume this loop from the callback once the call completes */
756 state->async.fn = vacuum_fetch_callback;
757 state->async.private_data = v;
766 destroy a vacuum info structure
/* talloc destructor: unlink the vacuum_info from the recovery daemon's
 * active list when it is freed. */
768 static int vacuum_info_destructor(struct vacuum_info *v)
770 DLIST_REMOVE(v->rec->vacuum_info, v);
776 handler for vacuum fetch
/* Message handler for vacuum-fetch requests: "data" carries a
 * marshalled buffer of records to migrate to this node.  Ignores the
 * request when it is empty or when we are already processing records
 * from the same source node and database; otherwise looks up the
 * database's persistence and name, attaches to it, takes a private copy
 * of the record buffer and kicks off vacuum_fetch_next().  Declarations
 * of ret/i/name/srcnode and several returns fall in lines missing from
 * this chunk. */
778 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
779 TDB_DATA data, void *private_data)
781 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
782 struct ctdb_marshall_buffer *recs;
784 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
786 struct ctdb_dbid_map *dbmap=NULL;
787 bool persistent = false;
788 struct ctdb_db_context *ctdb_db;
789 struct ctdb_rec_data *r;
791 struct vacuum_info *v;
793 recs = (struct ctdb_marshall_buffer *)data.dptr;
794 r = (struct ctdb_rec_data *)&recs->data[0];
796 if (recs->count == 0) {
797 talloc_free(tmp_ctx);
803 for (v=rec->vacuum_info;v;v=v->next) {
804 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
805 /* we're already working on records from this node */
806 talloc_free(tmp_ctx);
811 /* work out if the database is persistent */
812 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
814 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
815 talloc_free(tmp_ctx);
819 for (i=0;i<dbmap->num;i++) {
820 if (dbmap->dbs[i].dbid == recs->db_id) {
821 persistent = dbmap->dbs[i].persistent;
825 if (i == dbmap->num) {
826 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
827 talloc_free(tmp_ctx);
831 /* find the name of this database */
832 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
833 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
834 talloc_free(tmp_ctx);
/* attach so we can fetch/migrate the records locally */
839 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
840 if (ctdb_db == NULL) {
841 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
842 talloc_free(tmp_ctx);
846 v = talloc_zero(rec, struct vacuum_info);
848 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
849 talloc_free(tmp_ctx);
854 v->srcnode = srcnode;
855 v->ctdb_db = ctdb_db;
/* private copy: the incoming buffer belongs to the message layer */
856 v->recs = talloc_memdup(v, recs, data.dsize);
857 if (v->recs == NULL) {
858 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
860 talloc_free(tmp_ctx);
863 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
865 DLIST_ADD(rec->vacuum_info, v);
867 talloc_set_destructor(v, vacuum_info_destructor);
869 vacuum_fetch_next(v);
870 talloc_free(tmp_ctx);
875 called when ctdb_wait_timeout should finish
/* Timer callback for ctdb_wait_timeout(): flags completion through the
 * uint32_t pointed to by "p" (the update itself falls in lines missing
 * from this chunk). */
877 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
878 struct timeval yt, void *p)
880 uint32_t *timed_out = (uint32_t *)p;
885 wait for a given number of seconds
/* Block in the event loop for "secs" seconds: arm a one-shot timer that
 * sets timed_out, then run event_loop_once() until it fires (the loop
 * condition falls in lines missing from this chunk). */
887 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
889 uint32_t timed_out = 0;
890 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
892 event_loop_once(ctdb->ev);
897 called when an election times out (ends)
/* Timer callback: clearing rec->election_timeout is what lets
 * ctdb_wait_election() below fall out of its loop. */
899 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
900 struct timeval t, void *p)
902 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
903 rec->election_timeout = NULL;
905 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
910 wait for an election to finish. It finished election_timeout seconds after
911 the last election packet is received
/* Spin the event loop until ctdb_election_timeout() clears
 * rec->election_timeout. */
913 static void ctdb_wait_election(struct ctdb_recoverd *rec)
915 struct ctdb_context *ctdb = rec->ctdb;
916 while (rec->election_timeout) {
917 event_loop_once(ctdb->ev);
922 Update our local flags from all remote connected nodes.
923 This is only run when we are or we believe we are the recovery master
/* Fetch every remote node's nodemap and compare its own flags entry
 * with ours.  On mismatch, push the authoritative flags cluster-wide
 * via MODFLAGS and copy them into our local nodemap.  A node we cannot
 * query is blamed and the function returns MONITOR_FAILED; the success
 * return falls in lines missing from this chunk. */
925 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
928 struct ctdb_context *ctdb = rec->ctdb;
929 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
931 /* get the nodemap for all active remote nodes and verify
932 they are the same as for this node
934 for (j=0; j<nodemap->num; j++) {
935 struct ctdb_node_map *remote_nodemap=NULL;
/* skip disconnected nodes and our own entry */
938 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
941 if (nodemap->nodes[j].pnn == ctdb->pnn) {
945 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
946 mem_ctx, &remote_nodemap);
948 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
949 nodemap->nodes[j].pnn));
950 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
951 talloc_free(mem_ctx);
952 return MONITOR_FAILED;
954 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
955 /* We should tell our daemon about this so it
956 updates its flags or else we will log the same
957 message again in the next iteration of recovery.
958 Since we are the recovery master we can just as
959 well update the flags on all nodes.
961 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
963 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
967 /* Update our local copy of the flags in the recovery
970 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
971 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
972 nodemap->nodes[j].flags));
973 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
975 talloc_free(remote_nodemap);
977 talloc_free(mem_ctx);
982 /* Create a new random generation ip.
983 The generation id can not be the INVALID_GENERATION id
/* Draw random() values until one differs from INVALID_GENERATION and
 * return it (the enclosing retry loop and the return statement fall in
 * lines missing from this chunk). */
985 static uint32_t new_generation(void)
990 generation = random();
992 if (generation != INVALID_GENERATION) {
1002 create a temporary working database
/* Create the temporary recovery tdb ("recdb.tdb" in the ctdb database
 * directory) used to merge all nodes' copies of a database during
 * recovery.  Opened with TDB_NOLOCK (single-threaded use) and O_EXCL;
 * TDB_NOMMAP is added when do_setsched is off.  Returns the wrap or
 * NULL on failure; the unlink of any stale file and the returns fall in
 * lines missing from this chunk. */
1004 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1007 struct tdb_wrap *recdb;
1010 /* open up the temporary recovery database */
1011 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
1017 tdb_flags = TDB_NOLOCK;
1018 if (!ctdb->do_setsched) {
1019 tdb_flags |= TDB_NOMMAP;
1022 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1023 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1024 if (recdb == NULL) {
1025 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1035 a traverse function for pulling all relevant records from recdb
/* NOTE(review): these are the members of the traverse state struct
 * (named "recdb_data" judging from traverse_recdb below); its opening
 * line and remaining members (recdata length, failed flag) fall in
 * lines missing from this chunk. */
1038 struct ctdb_context *ctdb;
1039 struct ctdb_marshall_buffer *recdata;
/* tdb_traverse_read callback used by push_recdb_database(): skip
 * header-only (empty) records, rewrite each record's dmaster to this
 * node, marshal it and append it to the growing recdata blob.  Sets
 * params->failed on marshalling/allocation errors; return statements
 * fall in lines missing from this chunk. */
1044 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1046 struct recdb_data *params = (struct recdb_data *)p;
1047 struct ctdb_rec_data *rec;
1048 struct ctdb_ltdb_header *hdr;
1050 /* skip empty records */
1051 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1055 /* update the dmaster field to point to us */
1056 hdr = (struct ctdb_ltdb_header *)data.dptr;
1057 hdr->dmaster = params->ctdb->pnn;
1059 /* add the record to the blob ready to send to the nodes */
1060 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1062 params->failed = true;
/* grow the marshall buffer to make room for this record */
1065 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1066 if (params->recdata == NULL) {
1067 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1068 rec->length + params->len, params->recdata->count));
1069 params->failed = true;
1072 params->recdata->count++;
1073 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1074 params->len += rec->length;
1081 push the recdb database out to all nodes
/* Marshal the whole temporary recovery tdb (via traverse_recdb) into a
 * single blob and broadcast it to all active nodes with
 * CTDB_CONTROL_PUSH_DB.  Returns 0 on success; the error/success
 * returns fall in lines missing from this chunk. */
1083 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1084 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1086 struct recdb_data params;
1087 struct ctdb_marshall_buffer *recdata;
1089 TALLOC_CTX *tmp_ctx;
1092 tmp_ctx = talloc_new(ctdb);
1093 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1095 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1096 CTDB_NO_MEMORY(ctdb, recdata);
1098 recdata->db_id = dbid;
1101 params.recdata = recdata;
/* blob length starts at the marshall-buffer header size */
1102 params.len = offsetof(struct ctdb_marshall_buffer, data);
1103 params.failed = false;
1105 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1106 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1107 talloc_free(params.recdata);
1108 talloc_free(tmp_ctx);
1112 if (params.failed) {
1113 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1114 talloc_free(params.recdata);
1115 talloc_free(tmp_ctx);
/* the traverse may have reallocated the buffer - pick up the final
 * pointer and length */
1119 recdata = params.recdata;
1121 outdata.dptr = (void *)recdata;
1122 outdata.dsize = params.len;
1124 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1125 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1127 CONTROL_TIMEOUT(), false, outdata,
1130 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1131 talloc_free(recdata);
1132 talloc_free(tmp_ctx);
1136 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1137 dbid, recdata->count));
1139 talloc_free(recdata);
1140 talloc_free(tmp_ctx);
1147 go through a full recovery on one database
/* Recover one database: pull every node's copy into a temporary recdb
 * (merging by rsn), wipe the database on all active nodes inside the
 * cluster-wide recovery transaction identified by "transaction_id",
 * then push the merged contents back out.  Returns 0 on success; some
 * returns/braces fall in lines missing from this chunk. */
1149 static int recover_database(struct ctdb_recoverd *rec,
1150 TALLOC_CTX *mem_ctx,
1153 struct ctdb_node_map *nodemap,
1154 uint32_t transaction_id)
1156 struct tdb_wrap *recdb;
1158 struct ctdb_context *ctdb = rec->ctdb;
1160 struct ctdb_control_wipe_database w;
1163 recdb = create_recdb(ctdb, mem_ctx);
1164 if (recdb == NULL) {
1168 /* pull all remote databases onto the recdb */
1169 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
1171 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1175 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1177 /* wipe all the remote databases. This is safe as we are in a transaction */
1179 w.transaction_id = transaction_id;
1181 data.dptr = (void *)&w;
1182 data.dsize = sizeof(w);
1184 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1185 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1187 CONTROL_TIMEOUT(), false, data,
1190 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1195 /* push out the correct database. This sets the dmaster and skips
1196 the empty records */
1197 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1203 /* all done with this database */
1210 reload the nodes file
/* Re-read the cluster nodes file into the ctdb context. */
1212 static void reload_nodes_file(struct ctdb_context *ctdb)
1215 ctdb_load_nodes_file(ctdb);
1220 we are the recmaster, and recovery is needed - start a recovery run
1222 static int do_recovery(struct ctdb_recoverd *rec,
1223 TALLOC_CTX *mem_ctx, uint32_t pnn,
1224 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1226 struct ctdb_context *ctdb = rec->ctdb;
1228 uint32_t generation;
1229 struct ctdb_dbid_map *dbmap;
1232 struct timeval start_time;
1234 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1236 /* if recovery fails, force it again */
1237 rec->need_recovery = true;
1239 for (i=0; i<ctdb->num_nodes; i++) {
1240 struct ctdb_banning_state *ban_state;
1242 if (ctdb->nodes[i]->ban_state == NULL) {
1245 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1246 if (ban_state->count < 2*ctdb->num_nodes) {
1249 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1250 ctdb->nodes[i]->pnn, ban_state->count,
1251 ctdb->tunable.recovery_ban_period));
1252 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1253 ban_state->count = 0;
1257 if (ctdb->tunable.verify_recovery_lock != 0) {
1258 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1259 start_time = timeval_current();
1260 if (!ctdb_recovery_lock(ctdb, true)) {
1261 ctdb_set_culprit(rec, pnn);
1262 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1265 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1266 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1269 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1271 /* get a list of all databases */
1272 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1274 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1278 /* we do the db creation before we set the recovery mode, so the freeze happens
1279 on all databases we will be dealing with. */
1281 /* verify that we have all the databases any other node has */
1282 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1284 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1288 /* verify that all other nodes have all our databases */
1289 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1291 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1294 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1296 /* update the database priority for all remote databases */
1297 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1299 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1301 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1304 /* set recovery mode to active on all nodes */
1305 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1311 /* execute the "startrecovery" event script on all nodes */
1312 ret = run_startrecovery_eventscript(rec, nodemap);
1314 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1318 /* pick a new generation number */
1319 generation = new_generation();
1321 /* change the vnnmap on this node to use the new generation
1322 number but not on any other nodes.
1323 this guarantees that if we abort the recovery prematurely
1324 for some reason (a node stops responding?)
1325 that we can just return immediately and we will reenter
1326 recovery shortly again.
1327 I.e. we deliberately leave the cluster with an inconsistent
1328 generation id to allow us to abort recovery at any stage and
1329 just restart it from scratch.
1331 vnnmap->generation = generation;
1332 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1334 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1338 data.dptr = (void *)&generation;
1339 data.dsize = sizeof(uint32_t);
1341 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1342 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1344 CONTROL_TIMEOUT(), false, data,
1346 transaction_start_fail_callback,
1348 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1349 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1351 CONTROL_TIMEOUT(), false, tdb_null,
1355 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1360 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1362 for (i=0;i<dbmap->num;i++) {
1363 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1364 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1369 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1371 /* commit all the changes */
1372 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1374 CONTROL_TIMEOUT(), false, data,
1377 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1381 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1384 /* update the capabilities for all nodes */
1385 ret = update_capabilities(ctdb, nodemap);
1387 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1391 /* build a new vnn map with all the currently active and
1393 generation = new_generation();
1394 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1395 CTDB_NO_MEMORY(ctdb, vnnmap);
1396 vnnmap->generation = generation;
1398 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1399 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1400 for (i=j=0;i<nodemap->num;i++) {
1401 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1404 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1405 /* this node can not be an lmaster */
1406 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1411 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1412 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1413 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1416 if (vnnmap->size == 0) {
1417 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1419 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1420 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1421 vnnmap->map[0] = pnn;
1424 /* update to the new vnnmap on all nodes */
1425 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1427 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1431 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1433 /* update recmaster to point to us for all nodes */
1434 ret = set_recovery_master(ctdb, nodemap, pnn);
1436 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1440 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1443 update all nodes to have the same flags that we have
1445 for (i=0;i<nodemap->num;i++) {
1446 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1450 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1452 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1457 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1459 /* disable recovery mode */
1460 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1462 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1466 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1469 tell nodes to takeover their public IPs
1471 rec->need_takeover_run = false;
1472 ret = ctdb_takeover_run(ctdb, nodemap);
1474 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1477 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1479 /* execute the "recovered" event script on all nodes */
1480 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1482 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1486 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1488 /* send a message to all clients telling them that the cluster
1489 has been reconfigured */
1490 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1492 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1494 rec->need_recovery = false;
1496 /* we managed to complete a full recovery, make sure to forgive
1497 any past sins by the nodes that could now participate in the
1500 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1501 for (i=0;i<nodemap->num;i++) {
1502 struct ctdb_banning_state *ban_state;
1504 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1508 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1509 if (ban_state == NULL) {
1513 ban_state->count = 0;
1517 /* We just finished a recovery successfully.
1518 We now wait for rerecovery_timeout before we allow
1519 another recovery to take place.
1521 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1522 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1523 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
/* Election data exchanged between recovery daemons when electing a
 * recovery master.
 * NOTE(review): this listing omits lines (the pnn member and closing
 * brace are not shown) — confirm against the full source. */
1530 elections are won by first checking the number of connected nodes, then
1531 the priority time, then the pnn
1533 struct election_message {
1534 uint32_t num_connected;
1535 struct timeval priority_time;
1537 uint32_t node_flags;
/* Fill in *em with this node's election data: our pnn, the time this
 * recovery daemon started (priority_time), our node flags, and the
 * number of non-disconnected nodes we can see.  A node lacking the
 * RECMASTER capability deliberately zeroes num_connected and resets
 * priority_time so it loses any election.
 * NOTE(review): listing omits lines (declarations of ret/i, braces,
 * the early-return on getnodemap failure) — confirm against the full
 * source. */
1541 form this nodes election data
1543 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1546 struct ctdb_node_map *nodemap;
1547 struct ctdb_context *ctdb = rec->ctdb;
1551 em->pnn = rec->ctdb->pnn;
1552 em->priority_time = rec->priority_time;
1554 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1556 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"))
1560 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1561 em->node_flags = rec->node_flags;
1563 for (i=0;i<nodemap->num;i++) {
1564 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1565 em->num_connected++;
1569 /* we shouldnt try to win this election if we cant be a recmaster */
1570 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1571 em->num_connected = 0;
1572 em->priority_time = timeval_current();
1575 talloc_free(nodemap);
/* Decide whether our own election data beats *em (data received from
 * another candidate).  Precedence: recmaster capability, our own
 * banned/stopped state, the other node's banned/stopped state, most
 * connected nodes, longest-running daemon (older priority_time wins),
 * and finally lowest pnn as the tie-break.
 * NOTE(review): listing omits the return statements and braces for
 * each branch — confirm against the full source. */
1579 see if the given election data wins
1581 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1583 struct election_message myem;
1586 ctdb_election_data(rec, &myem);
1588 /* we cant win if we dont have the recmaster capability */
1589 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1593 /* we cant win if we are banned */
1594 if (rec->node_flags & NODE_FLAGS_BANNED) {
1598 /* we cant win if we are stopped */
1599 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1603 /* we will automatically win if the other node is banned */
1604 if (em->node_flags & NODE_FLAGS_BANNED) {
1608 /* we will automatically win if the other node is stopped */
1609 if (em->node_flags & NODE_FLAGS_STOPPED) {
1613 /* try to use the most connected node */
1615 cmp = (int)myem.num_connected - (int)em->num_connected;
1618 /* then the longest running node */
1620 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1624 cmp = (int)myem.pnn - (int)em->pnn;
/* Broadcast an election request carrying our election data to all
 * nodes on CTDB_SRVID_RECOVERY.  When update_recmaster is true we
 * optimistically record ourselves as recovery master on the local
 * node after the broadcast (we assume we will win).
 * NOTE(review): listing omits the error-return branches — confirm
 * against the full source. */
1631 send out an election request
1633 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1636 TDB_DATA election_data;
1637 struct election_message emsg;
1639 struct ctdb_context *ctdb = rec->ctdb;
1641 srvid = CTDB_SRVID_RECOVERY;
1643 ctdb_election_data(rec, &emsg);
1645 election_data.dsize = sizeof(struct election_message);
1646 election_data.dptr = (unsigned char *)&emsg;
1649 /* send an election message to all active nodes */
1650 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1651 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1654 /* A new node that is already frozen has entered the cluster.
1655 The existing nodes are not frozen and dont need to be frozen
1656 until the election has ended and we start the actual recovery
1658 if (update_recmaster == true) {
1659 /* first we assume we will win the election and set
1660 recoverymaster to be ourself on the current node
1662 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1664 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
/* Clear the BANNED flag on every connected node in the cluster.
 * Failure to fetch the nodemap is logged and the function returns
 * without unbanning anything.  The per-node modflags calls are
 * best-effort: their return values are intentionally ignored. */
1674 this function will unban all nodes in the cluster
1676 static void unban_all_nodes(struct ctdb_context *ctdb)
1679 struct ctdb_node_map *nodemap;
1680 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1682 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1684 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1688 for (i=0;i<nodemap->num;i++) {
1689 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1690 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1691 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1695 talloc_free(tmp_ctx);
/* Timed-event callback: we believe we are winning the election, so
 * send a broadcast election request (without touching the local
 * recmaster setting).  The one-shot timer handle send_election_te is
 * freed and cleared afterwards so a new one can be scheduled. */
1700 we think we are winning the election - send a broadcast election request
1702 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1704 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1707 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1709 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1712 talloc_free(rec->send_election_te);
1713 rec->send_election_te = NULL;
/* SRVID handler: another daemon asked the recovery master for a
 * memory (talloc) report.  Validates that the payload is a
 * rd_memdump_reply reply-address, collects the dump and sends it back
 * to the requester's pnn/srvid.  All temporary allocations hang off
 * tmp_ctx, which is freed on every visible exit path. */
1717 handler for memory dumps
1719 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1720 TDB_DATA data, void *private_data)
1722 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1725 struct rd_memdump_reply *rd;
1727 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1728 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1729 talloc_free(tmp_ctx);
1732 rd = (struct rd_memdump_reply *)data.dptr;
1734 dump = talloc_zero(tmp_ctx, TDB_DATA);
1736 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1737 talloc_free(tmp_ctx);
1740 ret = ctdb_dump_memory(ctdb, dump);
1742 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1743 talloc_free(tmp_ctx);
1747 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1749 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1751 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1752 talloc_free(tmp_ctx);
1756 talloc_free(tmp_ctx);
/* SRVID handler: the recovery daemon was asked to reload the nodes
 * file; delegate to reload_nodes_file().  srvid and data are unused. */
1760 handler for reload_nodes
1762 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1763 TDB_DATA data, void *private_data)
1765 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1767 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1769 reload_nodes_file(rec->ctdb);
/* Timed-event callback: re-enable the public-ip consistency check.
 * The check is disabled exactly while ip_check_disable_ctx is
 * non-NULL, so freeing and clearing it re-enables the check. */
1773 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1774 struct timeval yt, void *p)
1776 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1778 talloc_free(rec->ip_check_disable_ctx);
1779 rec->ip_check_disable_ctx = NULL;
/* SRVID handler: disable the public-ip consistency check for
 * 'timeout' seconds (payload is a single uint32_t).  Any previous
 * disable window is cancelled first; a fresh talloc context plus a
 * timed event (reenable_ip_check) implements the new window.
 * NOTE(review): the DEBUG strings contain the typos "expexting" and
 * "recaived"; they are runtime strings and are left untouched here. */
1782 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1783 TDB_DATA data, void *private_data)
1785 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1788 if (rec->ip_check_disable_ctx != NULL) {
1789 talloc_free(rec->ip_check_disable_ctx);
1790 rec->ip_check_disable_ctx = NULL;
1793 if (data.dsize != sizeof(uint32_t)) {
1794 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu expexting %lu\n", data.dsize, sizeof(uint32_t)));
1797 if (data.dptr == NULL) {
1798 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1802 timeout = *((uint32_t *)data.dptr);
1803 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
1805 rec->ip_check_disable_ctx = talloc_new(rec);
1806 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
1808 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1813 handler for ip reallocate, just add it to the list of callers and
1814 handle this later in the monitor_cluster loop so we do not recurse
1815 with other callers to takeover_run()
1817 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1818 TDB_DATA data, void *private_data)
1820 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1821 struct ip_reallocate_list *caller;
1823 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1824 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1828 if (rec->ip_reallocate_ctx == NULL) {
1829 rec->ip_reallocate_ctx = talloc_new(rec);
1830 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1833 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1834 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1836 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1837 caller->next = rec->reallocate_callers;
1838 rec->reallocate_callers = caller;
/* Run a takeover (ip reallocation) on behalf of all queued
 * "ctdb ipreallocate" callers and send each of them the int32 result
 * of ctdb_takeover_run().  The queue (ip_reallocate_ctx /
 * reallocate_callers) is torn down at the end regardless of outcome.
 * NOTE(review): tmp_ctx appears unused in the visible lines other
 * than the final free — confirm against the full source. */
1843 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1845 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1848 struct ip_reallocate_list *callers;
1850 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1851 ret = ctdb_takeover_run(ctdb, rec->nodemap);
1852 result.dsize = sizeof(int32_t);
1853 result.dptr = (uint8_t *)&ret;
1855 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1856 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1857 ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1859 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1863 talloc_free(tmp_ctx);
1864 talloc_free(rec->ip_reallocate_ctx);
1865 rec->ip_reallocate_ctx = NULL;
1866 rec->reallocate_callers = NULL;
/* SRVID handler for recovery-master election packets.  Every packet
 * pushes the election timeout further out.  If our own election data
 * beats the sender's we contest: schedule a delayed (0.5s) broadcast
 * via election_send_request and return.  Otherwise we concede: cancel
 * any pending broadcast, release the recovery lock if another node
 * won (and unban everyone so the new master starts clean), and record
 * the winner as recmaster on the local node. */
1872 handler for recovery master elections
1874 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1875 TDB_DATA data, void *private_data)
1877 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1879 struct election_message *em = (struct election_message *)data.dptr;
1880 TALLOC_CTX *mem_ctx;
1882 /* we got an election packet - update the timeout for the election */
1883 talloc_free(rec->election_timeout);
1884 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1885 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1886 ctdb_election_timeout, rec);
1888 mem_ctx = talloc_new(ctdb);
1890 /* someone called an election. check their election data
1891 and if we disagree and we would rather be the elected node,
1892 send a new election message to all other nodes
1894 if (ctdb_election_win(rec, em)) {
1895 if (!rec->send_election_te) {
1896 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1897 timeval_current_ofs(0, 500000),
1898 election_send_request, rec);
1900 talloc_free(mem_ctx);
1901 /*unban_all_nodes(ctdb);*/
1906 talloc_free(rec->send_election_te);
1907 rec->send_election_te = NULL;
1909 if (ctdb->tunable.verify_recovery_lock != 0) {
1910 /* release the recmaster lock */
1911 if (em->pnn != ctdb->pnn &&
1912 ctdb->recovery_lock_fd != -1) {
1913 close(ctdb->recovery_lock_fd);
1914 ctdb->recovery_lock_fd = -1;
1915 unban_all_nodes(ctdb);
1919 /* ok, let that guy become recmaster then */
1920 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1922 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1923 talloc_free(mem_ctx);
1927 talloc_free(mem_ctx);
/* Force a new recmaster election: put the whole cluster into ACTIVE
 * recovery mode to stop internode traffic, (re)arm the election
 * timeout, broadcast our election request while also claiming
 * recmaster locally (update_recmaster=true), and then block for a few
 * seconds collecting responses via ctdb_wait_election(). */
1933 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1936 struct ctdb_node_map *nodemap)
1939 struct ctdb_context *ctdb = rec->ctdb;
1941 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1943 /* set all nodes to recovery mode to stop all internode traffic */
1944 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1946 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1950 talloc_free(rec->election_timeout);
1951 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1952 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1953 ctdb_election_timeout, rec);
1955 ret = send_election_request(rec, pnn, true);
1957 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1961 /* wait for a few seconds to collect all responses */
1962 ctdb_wait_election(rec);
/* SRVID handler invoked when a node's flags change.  Validates the
 * ctdb_node_flag_change payload, locates the node in a freshly
 * fetched nodemap, updates our cached flags, refreshes the local view
 * of recmaster/recmode, and — only if we are the recmaster and the
 * cluster is in NORMAL mode — schedules a deferred takeover run when
 * the (permanently-)DISABLED bit changed.  Deferring via
 * rec->need_takeover_run avoids recursing into takeover_run() from a
 * message handler. */
1968 handler for when a node changes its flags
1970 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1971 TDB_DATA data, void *private_data)
1974 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1975 struct ctdb_node_map *nodemap=NULL;
1976 TALLOC_CTX *tmp_ctx;
1977 uint32_t changed_flags;
1979 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1980 int disabled_flag_changed;
1982 if (data.dsize != sizeof(*c)) {
1983 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1987 tmp_ctx = talloc_new(ctdb);
1988 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1990 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1992 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1993 talloc_free(tmp_ctx);
1998 for (i=0;i<nodemap->num;i++) {
1999 if (nodemap->nodes[i].pnn == c->pnn) break;
2002 if (i == nodemap->num) {
2003 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2004 talloc_free(tmp_ctx);
2008 changed_flags = c->old_flags ^ c->new_flags;
2010 if (nodemap->nodes[i].flags != c->new_flags) {
2011 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2014 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2016 nodemap->nodes[i].flags = c->new_flags;
2018 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2019 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2022 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2023 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2027 ctdb->recovery_master == ctdb->pnn &&
2028 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2029 /* Only do the takeover run if the perm disabled or unhealthy
2030 flags changed since these will cause an ip failover but not
2032 If the node became disconnected or banned this will also
2033 lead to an ip address failover but that is handled
2036 if (disabled_flag_changed) {
2037 rec->need_takeover_run = true;
2041 talloc_free(tmp_ctx);
/* SRVID handler: push a node's new flags out to every other node via
 * a single modflags control (set new_flags, clear everything else). */
2045 handler for when we need to push out flag changes ot all other nodes
2047 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2048 TDB_DATA data, void *private_data)
2051 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2053 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
2055 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Aggregation state for the async getrecmode checks: the number of
 * outstanding replies (count member not shown in this listing) and
 * the worst status seen so far. */
2060 struct verify_recmode_normal_data {
2062 enum monitor_result status;
/* Async callback for one node's getrecmode reply.  A node that failed
 * to answer downgrades status to MONITOR_FAILED (unless something
 * worse is already recorded); a node still in recovery mode records
 * MONITOR_RECOVERY_NEEDED.  NOTE(review): the count decrement and
 * early returns are omitted from this listing. */
2065 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2067 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2070 /* one more node has responded with recmode data*/
2073 /* if we failed to get the recmode, then return an error and let
2074 the main loop try again.
2076 if (state->state != CTDB_CONTROL_DONE) {
2077 if (rmdata->status == MONITOR_OK) {
2078 rmdata->status = MONITOR_FAILED;
2083 /* if we got a response, then the recmode will be stored in the
2086 if (state->status != CTDB_RECOVERY_NORMAL) {
2087 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2088 rmdata->status = MONITOR_RECOVERY_NEEDED;
/* Verify that every active node is in NORMAL recovery mode: send an
 * async getrecmode control to each, then pump the event loop until
 * all replies (or failures) have been folded into rmdata->status by
 * verify_recmode_normal_callback, and return that aggregate. */
2095 /* verify that all nodes are in normal recovery mode */
2096 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2098 struct verify_recmode_normal_data *rmdata;
2099 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2100 struct ctdb_client_control_state *state;
2101 enum monitor_result status;
2104 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2105 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2107 rmdata->status = MONITOR_OK;
2109 /* loop over all active nodes and send an async getrecmode call to
2111 for (j=0; j<nodemap->num; j++) {
2112 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2115 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2117 nodemap->nodes[j].pnn);
2118 if (state == NULL) {
2119 /* we failed to send the control, treat this as
2120 an error and try again next iteration
2122 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2123 talloc_free(mem_ctx);
2124 return MONITOR_FAILED;
2127 /* set up the callback functions */
2128 state->async.fn = verify_recmode_normal_callback;
2129 state->async.private_data = rmdata;
2131 /* one more control to wait for to complete */
2136 /* now wait for up to the maximum number of seconds allowed
2137 or until all nodes we expect a response from has replied
2139 while (rmdata->count > 0) {
2140 event_loop_once(ctdb->ev);
2143 status = rmdata->status;
2144 talloc_free(mem_ctx);
/* Aggregation state for the async getrecmaster checks: back-pointer
 * to the recovery daemon (for culprit tracking), expected pnn and
 * outstanding count (pnn/count members not shown in this listing),
 * and the aggregated status. */
2149 struct verify_recmaster_data {
2150 struct ctdb_recoverd *rec;
2153 enum monitor_result status;
/* Async callback for one node's getrecmaster reply.  A node that
 * failed to answer downgrades status to MONITOR_FAILED; a node that
 * names a different recmaster than us is recorded as culprit and
 * triggers MONITOR_ELECTION_NEEDED. */
2156 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2158 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2161 /* one more node has responded with recmaster data*/
2164 /* if we failed to get the recmaster, then return an error and let
2165 the main loop try again.
2167 if (state->state != CTDB_CONTROL_DONE) {
2168 if (rmdata->status == MONITOR_OK) {
2169 rmdata->status = MONITOR_FAILED;
2174 /* if we got a response, then the recmaster will be stored in the
2177 if (state->status != rmdata->pnn) {
2178 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2179 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2180 rmdata->status = MONITOR_ELECTION_NEEDED;
/* Verify that every active node agrees we (pnn) are the recmaster:
 * send an async getrecmaster control to each, pump the event loop
 * until all replies are in, and return the aggregate status computed
 * by verify_recmaster_callback. */
2187 /* verify that all nodes agree that we are the recmaster */
2188 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2190 struct ctdb_context *ctdb = rec->ctdb;
2191 struct verify_recmaster_data *rmdata;
2192 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2193 struct ctdb_client_control_state *state;
2194 enum monitor_result status;
2197 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2198 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2202 rmdata->status = MONITOR_OK;
2204 /* loop over all active nodes and send an async getrecmaster call to
2206 for (j=0; j<nodemap->num; j++) {
2207 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2210 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2212 nodemap->nodes[j].pnn);
2213 if (state == NULL) {
2214 /* we failed to send the control, treat this as
2215 an error and try again next iteration
2217 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2218 talloc_free(mem_ctx);
2219 return MONITOR_FAILED;
2222 /* set up the callback functions */
2223 state->async.fn = verify_recmaster_callback;
2224 state->async.private_data = rmdata;
2226 /* one more control to wait for to complete */
2231 /* now wait for up to the maximum number of seconds allowed
2232 or until all nodes we expect a response from has replied
2234 while (rmdata->count > 0) {
2235 event_loop_once(ctdb->ev);
2238 status = rmdata->status;
2239 talloc_free(mem_ctx);
/* Check that the set of public IPs this node actually holds matches
 * what the daemon believes it should hold.  Uptime is sampled before
 * and after reading the ip list so the check can be skipped when a
 * recovery started or finished in between (or is still running).  On
 * any mismatch — missing an ip we should serve, or still serving one
 * we should not — the node freezes itself and flips recovery mode to
 * ACTIVE so the recmaster performs a full recovery. */
2244 /* called to check that the allocation of public ip addresses is ok.
2246 static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
2248 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2249 struct ctdb_all_public_ips *ips = NULL;
2250 struct ctdb_uptime *uptime1 = NULL;
2251 struct ctdb_uptime *uptime2 = NULL;
2254 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2255 CTDB_CURRENT_NODE, &uptime1);
2257 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2258 talloc_free(mem_ctx);
2262 /* read the ip allocation from the local node */
2263 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2265 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2266 talloc_free(mem_ctx);
2270 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2271 CTDB_CURRENT_NODE, &uptime2);
2273 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2274 talloc_free(mem_ctx);
2278 /* skip the check if the startrecovery time has changed */
2279 if (timeval_compare(&uptime1->last_recovery_started,
2280 &uptime2->last_recovery_started) != 0) {
2281 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2282 talloc_free(mem_ctx);
2286 /* skip the check if the endrecovery time has changed */
2287 if (timeval_compare(&uptime1->last_recovery_finished,
2288 &uptime2->last_recovery_finished) != 0) {
2289 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2290 talloc_free(mem_ctx);
2294 /* skip the check if we have started but not finished recovery */
2295 if (timeval_compare(&uptime1->last_recovery_finished,
2296 &uptime1->last_recovery_started) != 1) {
2297 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
2298 talloc_free(mem_ctx);
2303 /* verify that we have the ip addresses we should have
2304 and we dont have ones we shouldnt have.
2305 if we find an inconsistency we set recmode to
2306 active on the local node and wait for the recmaster
2307 to do a full blown recovery
2309 for (j=0; j<ips->num; j++) {
2310 if (ips->ips[j].pnn == pnn) {
2311 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2312 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2313 ctdb_addr_to_str(&ips->ips[j].addr)));
2314 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2316 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2318 talloc_free(mem_ctx);
2321 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2323 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2325 talloc_free(mem_ctx);
2330 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2331 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2332 ctdb_addr_to_str(&ips->ips[j].addr)));
2334 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2336 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2338 talloc_free(mem_ctx);
2341 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2343 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2345 talloc_free(mem_ctx);
2352 talloc_free(mem_ctx);
/* Async callback: stash the nodemap returned by node 'node_pnn' into
 * the remote_nodemaps[] array, stealing ownership of the outdata
 * buffer.  Replies claiming an out-of-range pnn are rejected. */
2357 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2359 struct ctdb_node_map **remote_nodemaps = callback_data;
2361 if (node_pnn >= ctdb->num_nodes) {
2362 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2366 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Fetch the nodemap from every active node in parallel with a single
 * async GET_NODEMAP broadcast; results are stored into
 * remote_nodemaps[] by async_getnodemap_callback.  Returns nonzero if
 * the broadcast fails (exact return lines omitted in this listing). */
2370 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2371 struct ctdb_node_map *nodemap,
2372 struct ctdb_node_map **remote_nodemaps)
2376 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2377 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2379 CONTROL_TIMEOUT(), false, tdb_null,
2380 async_getnodemap_callback,
2382 remote_nodemaps) != 0) {
2383 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Status of the reclock-checking child process, and the state kept
 * while the check runs: the pipe fds and child pid (members not shown
 * in this listing), the guard timer, the pipe fd event, and the
 * current status. */
2391 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
2392 struct ctdb_check_reclock_state {
2393 struct ctdb_context *ctdb;
2394 struct timeval start_time;
2397 struct timed_event *te;
2398 struct fd_event *fde;
2399 enum reclock_child_status status;
/* Destructor for the reclock-check state: report how long the check
 * took, close both pipe ends, and SIGKILL the child so it can never
 * outlive the state object. */
2402 /* when we free the reclock state we must kill any child process.
2404 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2406 struct ctdb_context *ctdb = state->ctdb;
2408 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2410 if (state->fd[0] != -1) {
2411 close(state->fd[0]);
2414 if (state->fd[1] != -1) {
2415 close(state->fd[1]);
2418 kill(state->child, SIGKILL);
/* Timed-event callback: the reclock child did not answer within the
 * timeout — most likely i/o to the reclock file is blocking on a slow
 * cluster filesystem.  Record RECLOCK_TIMEOUT so the waiting loop in
 * check_recovery_lock() can finish. */
2423 called if our check_reclock child times out. this would happen if
2424 i/o to the reclock file blocks.
2426 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2427 struct timeval t, void *private_data)
2429 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2430 struct ctdb_check_reclock_state);
2432 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2433 state->status = RECLOCK_TIMEOUT;
/* fd-event callback: the reclock child wrote its one-byte verdict
 * into the pipe.  Cancel the guard timer, then translate the byte
 * into RECLOCK_OK or RECLOCK_FAILED (a short read also counts as
 * failure). */
2436 /* this is called when the child process has completed checking the reclock
2437 file and has written data back to us through the pipe.
2439 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2440 uint16_t flags, void *private_data)
2442 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2443 struct ctdb_check_reclock_state);
2447 /* we got a response from our child process so we can abort the
2450 talloc_free(state->te);
2453 ret = read(state->fd[0], &c, 1);
2454 if (ret != 1 || c != RECLOCK_OK) {
2455 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2456 state->status = RECLOCK_FAILED;
2461 state->status = RECLOCK_OK;
/* Verify we still hold the recovery lock.  Because pread() on the
 * reclock file can block indefinitely on a sick cluster filesystem,
 * the read is done in a forked child that reports a one-byte verdict
 * through a pipe; a 15-second timer (ctdb_check_reclock_timeout)
 * guards against a hung child, and the state destructor kills the
 * child.  On RECLOCK_FAILED the lock fd is closed so the next
 * recovery must retake the lock.
 * NOTE(review): the child's write() return values are ignored — the
 * parent detects child death via the pipe, but the unchecked writes
 * would be flagged by -Wunused-result; confirm against full source. */
2465 static int check_recovery_lock(struct ctdb_context *ctdb)
2468 struct ctdb_check_reclock_state *state;
2469 pid_t parent = getpid();
2471 if (ctdb->recovery_lock_fd == -1) {
2472 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2476 state = talloc(ctdb, struct ctdb_check_reclock_state);
2477 CTDB_NO_MEMORY(ctdb, state);
2480 state->start_time = timeval_current();
2481 state->status = RECLOCK_CHECKING;
2485 ret = pipe(state->fd);
2488 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2492 state->child = fork();
2493 if (state->child == (pid_t)-1) {
2494 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2495 close(state->fd[0]);
2497 close(state->fd[1]);
2503 if (state->child == 0) {
2504 char cc = RECLOCK_OK;
2505 close(state->fd[0]);
2508 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2509 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2510 cc = RECLOCK_FAILED;
2513 write(state->fd[1], &cc, 1);
2514 /* make sure we die when our parent dies */
2515 while (kill(parent, 0) == 0 || errno != ESRCH) {
2517 write(state->fd[1], &cc, 1);
2521 close(state->fd[1]);
2524 talloc_set_destructor(state, check_reclock_destructor);
2526 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2527 ctdb_check_reclock_timeout, state);
2528 if (state->te == NULL) {
2529 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2534 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2535 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2536 reclock_child_handler,
2539 if (state->fde == NULL) {
2540 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
2545 while (state->status == RECLOCK_CHECKING) {
2546 event_loop_once(ctdb->ev);
2549 if (state->status == RECLOCK_FAILED) {
2550 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2551 close(ctdb->recovery_lock_fd);
2552 ctdb->recovery_lock_fd = -1;
2561 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2563 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2564 const char *reclockfile;
2566 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2567 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2568 talloc_free(tmp_ctx);
2572 if (reclockfile == NULL) {
2573 if (ctdb->recovery_lock_file != NULL) {
2574 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2575 talloc_free(ctdb->recovery_lock_file);
2576 ctdb->recovery_lock_file = NULL;
2577 if (ctdb->recovery_lock_fd != -1) {
2578 close(ctdb->recovery_lock_fd);
2579 ctdb->recovery_lock_fd = -1;
2582 ctdb->tunable.verify_recovery_lock = 0;
2583 talloc_free(tmp_ctx);
2587 if (ctdb->recovery_lock_file == NULL) {
2588 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2589 if (ctdb->recovery_lock_fd != -1) {
2590 close(ctdb->recovery_lock_fd);
2591 ctdb->recovery_lock_fd = -1;
2593 talloc_free(tmp_ctx);
2598 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2599 talloc_free(tmp_ctx);
2603 talloc_free(ctdb->recovery_lock_file);
2604 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2605 ctdb->tunable.verify_recovery_lock = 0;
2606 if (ctdb->recovery_lock_fd != -1) {
2607 close(ctdb->recovery_lock_fd);
2608 ctdb->recovery_lock_fd = -1;
2611 talloc_free(tmp_ctx);
/*
  the main monitoring loop of the recovery daemon.

  After registering handlers for the various SRVID messages, it loops
  forever: once per recover_interval it verifies the main ctdbd is alive,
  pings it, syncs the debug level, bans repeat-offender nodes, refreshes
  tunables / reclock / pnn / vnnmap / nodemap / recmaster, and - when this
  node is the recovery master - runs a battery of cluster consistency
  checks, calling force_election() or do_recovery() when they fail.

  NOTE(review): this listing has been mechanically condensed (brace-only
  and control-transfer lines such as `goto again` / `continue` / `return`
  are missing, and original line numbers are fused into the text).  The
  code lines below are preserved verbatim; too intricate to reconstruct
  safely, so only comments are added.
*/
2618 static void monitor_cluster(struct ctdb_context *ctdb)
2621 TALLOC_CTX *mem_ctx=NULL;
2622 struct ctdb_node_map *nodemap=NULL;
2623 struct ctdb_node_map *recmaster_nodemap=NULL;
2624 struct ctdb_node_map **remote_nodemaps=NULL;
2625 struct ctdb_vnn_map *vnnmap=NULL;
2626 struct ctdb_vnn_map *remote_vnnmap=NULL;
2627 int32_t debug_level;
2629 struct ctdb_recoverd *rec;
2631 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2633 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2634 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2638 rec->priority_time = timeval_current();
2640 /* register a message port for sending memory dumps */
2641 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2643 /* register a message port for recovery elections */
2644 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2646 /* when nodes are disabled/enabled */
2647 ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2649 /* when we are asked to puch out a flag change */
2650 ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2652 /* register a message port for vacuum fetch */
2653 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2655 /* register a message port for reloadnodes */
2656 ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2658 /* register a message port for performing a takeover run */
2659 ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2661 /* register a message port for disabling the ip check for a short while */
2662 ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
/* top of the endless monitoring loop: the scratch talloc context is
   torn down and recreated on every iteration */
2666 talloc_free(mem_ctx);
2669 mem_ctx = talloc_new(ctdb);
2671 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2675 /* we only check for recovery once every second */
2676 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2678 /* verify that the main daemon is still running */
2679 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2680 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2684 /* ping the local daemon to tell it we are alive */
2685 ctdb_ctrl_recd_ping(ctdb);
2687 if (rec->election_timeout) {
2688 /* an election is in progress */
2692 /* read the debug level from the parent and update locally */
2693 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2695 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2698 LogLevel = debug_level;
2701 /* We must check if we need to ban a node here but we want to do this
2702 as early as possible so we dont wait until we have pulled the node
2703 map from the local node. thats why we have the hardcoded value 20
2705 for (i=0; i<ctdb->num_nodes; i++) {
2706 struct ctdb_banning_state *ban_state;
2708 if (ctdb->nodes[i]->ban_state == NULL) {
2711 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* only ban once a node has caused 20 recoveries (hardcoded threshold,
   see the comment above) */
2712 if (ban_state->count < 20) {
2715 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2716 ctdb->nodes[i]->pnn, ban_state->count,
2717 ctdb->tunable.recovery_ban_period));
2718 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2719 ban_state->count = 0;
2722 /* get relevant tunables */
2723 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2725 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2729 /* get the current recovery lock file from the server */
2730 if (update_recovery_lock_file(ctdb) != 0) {
2731 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2735 /* Make sure that if recovery lock verification becomes disabled when
2738 if (ctdb->tunable.verify_recovery_lock == 0) {
2739 if (ctdb->recovery_lock_fd != -1) {
2740 close(ctdb->recovery_lock_fd);
2741 ctdb->recovery_lock_fd = -1;
2745 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2746 if (pnn == (uint32_t)-1) {
2747 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2751 /* get the vnnmap */
2752 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2754 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2759 /* get number of nodes */
2761 talloc_free(rec->nodemap);
2762 rec->nodemap = NULL;
2765 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2767 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2770 nodemap = rec->nodemap;
2772 /* check which node is the recovery master */
2773 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2775 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2779 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2780 if (rec->recmaster != pnn) {
2781 if (rec->ip_reallocate_ctx != NULL) {
2782 talloc_free(rec->ip_reallocate_ctx);
2783 rec->ip_reallocate_ctx = NULL;
2784 rec->reallocate_callers = NULL;
2787 /* if there are takeovers requested, perform it and notify the waiters */
2788 if (rec->reallocate_callers) {
2789 process_ipreallocate_requests(ctdb, rec);
/* no recovery master known yet - trigger the initial election */
2792 if (rec->recmaster == (uint32_t)-1) {
2793 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2794 force_election(rec, pnn, nodemap);
2799 /* if the local daemon is STOPPED, we verify that the databases are
2800 also frozen and thet the recmode is set to active
2802 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2803 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2805 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2807 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2808 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
2810 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2812 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
2815 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2817 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
2824 /* If the local node is stopped, verify we are not the recmaster
2825 and yield this role if so
2827 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
2828 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
2829 force_election(rec, pnn, nodemap);
2833 /* check that we (recovery daemon) and the local ctdb daemon
2834 agrees on whether we are banned or not
2838 /* remember our own node flags */
2839 rec->node_flags = nodemap->nodes[pnn].flags;
2841 /* count how many active nodes there are */
2842 rec->num_active = 0;
2843 rec->num_connected = 0;
2844 for (i=0; i<nodemap->num; i++) {
2845 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2848 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2849 rec->num_connected++;
2854 /* verify that the recmaster node is still active */
2855 for (j=0; j<nodemap->num; j++) {
2856 if (nodemap->nodes[j].pnn==rec->recmaster) {
2861 if (j == nodemap->num) {
2862 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2863 force_election(rec, pnn, nodemap);
2867 /* if recovery master is disconnected we must elect a new recmaster */
2868 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2869 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2870 force_election(rec, pnn, nodemap);
2874 /* grap the nodemap from the recovery master to check if it is banned */
2875 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2876 mem_ctx, &recmaster_nodemap);
2878 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2879 nodemap->nodes[j].pnn));
2884 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2885 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2886 force_election(rec, pnn, nodemap);
2891 /* verify that we have all ip addresses we should have and we dont
2892 * have addresses we shouldnt have.
2894 if (ctdb->do_checkpublicip) {
2895 if (rec->ip_check_disable_ctx == NULL) {
2896 if (verify_ip_allocation(ctdb, pnn) != 0) {
2897 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2904 /* if we are not the recmaster then we do not need to check
2905 if recovery is needed
2907 if (pnn != rec->recmaster) {
/* ---- everything below only runs on the recovery master ---- */
2912 /* ensure our local copies of flags are right */
2913 ret = update_local_flags(rec, nodemap);
2914 if (ret == MONITOR_ELECTION_NEEDED) {
2915 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2916 force_election(rec, pnn, nodemap);
2919 if (ret != MONITOR_OK) {
2920 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2924 /* update the list of public ips that a node can handle for
2927 if (ctdb->num_nodes != nodemap->num) {
2928 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2929 reload_nodes_file(ctdb);
2932 for (j=0; j<nodemap->num; j++) {
2933 /* release any existing data */
2934 if (ctdb->nodes[j]->public_ips) {
2935 talloc_free(ctdb->nodes[j]->public_ips);
2936 ctdb->nodes[j]->public_ips = NULL;
2939 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2943 /* grab a new shiny list of public ips from the node */
2944 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2945 ctdb->nodes[j]->pnn,
2947 &ctdb->nodes[j]->public_ips)) {
2948 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
2949 ctdb->nodes[j]->pnn));
2955 /* verify that all active nodes agree that we are the recmaster */
2956 switch (verify_recmaster(rec, nodemap, pnn)) {
2957 case MONITOR_RECOVERY_NEEDED:
2958 /* can not happen */
2960 case MONITOR_ELECTION_NEEDED:
2961 force_election(rec, pnn, nodemap);
2965 case MONITOR_FAILED:
2970 if (rec->need_recovery) {
2971 /* a previous recovery didn't finish */
2972 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2976 /* verify that all active nodes are in normal mode
2977 and not in recovery mode
2979 switch (verify_recmode(ctdb, nodemap)) {
2980 case MONITOR_RECOVERY_NEEDED:
2981 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2983 case MONITOR_FAILED:
2985 case MONITOR_ELECTION_NEEDED:
2986 /* can not happen */
2992 if (ctdb->tunable.verify_recovery_lock != 0) {
2993 /* we should have the reclock - check its not stale */
2994 ret = check_recovery_lock(ctdb);
2996 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
2997 ctdb_set_culprit(rec, ctdb->pnn);
2998 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3003 /* get the nodemap for all active remote nodes
3005 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3006 if (remote_nodemaps == NULL) {
3007 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3010 for(i=0; i<nodemap->num; i++) {
3011 remote_nodemaps[i] = NULL;
3013 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3014 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3018 /* verify that all other nodes have the same nodemap as we have
3020 for (j=0; j<nodemap->num; j++) {
3021 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3025 if (remote_nodemaps[j] == NULL) {
3026 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3027 ctdb_set_culprit(rec, j);
3032 /* if the nodes disagree on how many nodes there are
3033 then this is a good reason to try recovery
3035 if (remote_nodemaps[j]->num != nodemap->num) {
3036 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3037 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3038 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3039 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3043 /* if the nodes disagree on which nodes exist and are
3044 active, then that is also a good reason to do recovery
3046 for (i=0;i<nodemap->num;i++) {
3047 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3048 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3049 nodemap->nodes[j].pnn, i,
3050 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3051 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3052 do_recovery(rec, mem_ctx, pnn, nodemap,
3058 /* verify the flags are consistent
3060 for (i=0; i<nodemap->num; i++) {
3061 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3065 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3066 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3067 nodemap->nodes[j].pnn,
3068 nodemap->nodes[i].pnn,
3069 remote_nodemaps[j]->nodes[i].flags,
3070 nodemap->nodes[j].flags));
/* a node's own opinion of its flags wins; otherwise the local
   recmaster view is pushed to the cluster */
3072 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3073 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3074 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3075 do_recovery(rec, mem_ctx, pnn, nodemap,
3079 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3080 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3081 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3082 do_recovery(rec, mem_ctx, pnn, nodemap,
3091 /* there better be the same number of lmasters in the vnn map
3092 as there are active nodes or we will have to do a recovery
3094 if (vnnmap->size != rec->num_active) {
3095 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3096 vnnmap->size, rec->num_active));
3097 ctdb_set_culprit(rec, ctdb->pnn);
3098 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3102 /* verify that all active nodes in the nodemap also exist in
3105 for (j=0; j<nodemap->num; j++) {
3106 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3109 if (nodemap->nodes[j].pnn == pnn) {
3113 for (i=0; i<vnnmap->size; i++) {
3114 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3118 if (i == vnnmap->size) {
3119 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3120 nodemap->nodes[j].pnn));
3121 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3122 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3128 /* verify that all other nodes have the same vnnmap
3129 and are from the same generation
3131 for (j=0; j<nodemap->num; j++) {
3132 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3135 if (nodemap->nodes[j].pnn == pnn) {
3139 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3140 mem_ctx, &remote_vnnmap);
3142 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3143 nodemap->nodes[j].pnn));
3147 /* verify the vnnmap generation is the same */
3148 if (vnnmap->generation != remote_vnnmap->generation) {
3149 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3150 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3151 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3152 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3156 /* verify the vnnmap size is the same */
3157 if (vnnmap->size != remote_vnnmap->size) {
3158 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3159 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3160 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3161 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3165 /* verify the vnnmap is the same */
3166 for (i=0;i<vnnmap->size;i++) {
3167 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3168 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3169 nodemap->nodes[j].pnn));
3170 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3171 do_recovery(rec, mem_ctx, pnn, nodemap,
3178 /* we might need to change who has what IP assigned */
3179 if (rec->need_takeover_run) {
3180 rec->need_takeover_run = false;
3182 /* execute the "startrecovery" event script on all nodes */
3183 ret = run_startrecovery_eventscript(rec, nodemap);
3185 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3186 ctdb_set_culprit(rec, ctdb->pnn);
3187 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3190 ret = ctdb_takeover_run(ctdb, nodemap);
3192 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3193 ctdb_set_culprit(rec, ctdb->pnn);
3194 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3197 /* execute the "recovered" event script on all nodes */
3198 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3200 // we cant check whether the event completed successfully
3201 // since this script WILL fail if the node is in recovery mode
3202 // and if that race happens, the code here would just cause a second
3203 // cascading recovery.
3205 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3206 ctdb_set_culprit(rec, ctdb->pnn);
3207 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3218 event handler for when the main ctdbd dies
3220 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3221 uint16_t flags, void *private_data)
3223 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3228 called regularly to verify that the recovery daemon is still running
3230 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3231 struct timeval yt, void *p)
3233 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3235 if (kill(ctdb->recoverd_pid, 0) != 0) {
3236 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
3238 ctdb_stop_recoverd(ctdb);
3239 ctdb_stop_keepalive(ctdb);
3240 ctdb_stop_monitoring(ctdb);
3241 ctdb_release_all_ips(ctdb);
3242 if (ctdb->methods != NULL) {
3243 ctdb->methods->shutdown(ctdb);
3245 ctdb_event_script(ctdb, "shutdown");
3250 event_add_timed(ctdb->ev, ctdb,
3251 timeval_current_ofs(30, 0),
3252 ctdb_check_recd, ctdb);
3255 static void recd_sig_child_handler(struct event_context *ev,
3256 struct signal_event *se, int signum, int count,
3260 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3265 pid = waitpid(-1, &status, WNOHANG);
3267 if (errno != ECHILD) {
3268 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3273 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3279 startup the recovery daemon as a child of the main ctdb daemon
3281 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3284 struct signal_event *se;
3286 if (pipe(fd) != 0) {
3290 ctdb->ctdbd_pid = getpid();
3292 ctdb->recoverd_pid = fork();
3293 if (ctdb->recoverd_pid == -1) {
3297 if (ctdb->recoverd_pid != 0) {
3299 event_add_timed(ctdb->ev, ctdb,
3300 timeval_current_ofs(30, 0),
3301 ctdb_check_recd, ctdb);
3307 srandom(getpid() ^ time(NULL));
3309 if (switch_from_server_to_client(ctdb) != 0) {
3310 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3314 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3315 ctdb_recoverd_parent, &fd[0]);
3317 /* set up a handler to pick up sigchld */
3318 se = event_add_signal(ctdb->ev, ctdb,
3320 recd_sig_child_handler,
3323 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3327 monitor_cluster(ctdb);
3329 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3334 shutdown the recovery daemon
3336 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3338 if (ctdb->recoverd_pid == 0) {
3342 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3343 kill(ctdb->recoverd_pid, SIGTERM);