/*
   ctdb recovery daemon

   Copyright (C) Ronnie Sahlberg 2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "lib/events/events.h"
#include "system/filesys.h"
#include "system/time.h"
#include "system/network.h"
#include "system/wait.h"

#include "../include/ctdb_client.h"
#include "../include/ctdb_private.h"

#include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
39 struct rd_memdump_reply *rd;
42 struct ctdb_banning_state {
44 struct timeval last_reported_time;
48 private state of recovery daemon
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
58 bool need_takeover_run;
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout;
63 struct vacuum_info *vacuum_info;
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
67 struct ctdb_control_get_ifaces *ifaces;
70 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
71 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
75 ban a node for a period of time
77 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
80 struct ctdb_context *ctdb = rec->ctdb;
81 struct ctdb_ban_time bantime;
83 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
85 if (!ctdb_validate_pnn(ctdb, pnn)) {
86 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
91 bantime.time = ban_time;
93 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
95 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
101 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
105 run the "recovered" eventscript on all nodes
107 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
112 tmp_ctx = talloc_new(ctdb);
113 CTDB_NO_MEMORY(ctdb, tmp_ctx);
115 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
116 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
118 CONTROL_TIMEOUT(), false, tdb_null,
121 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
123 talloc_free(tmp_ctx);
127 talloc_free(tmp_ctx);
132 remember the trouble maker
134 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
136 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
137 struct ctdb_banning_state *ban_state;
139 if (culprit > ctdb->num_nodes) {
140 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
144 if (ctdb->nodes[culprit]->ban_state == NULL) {
145 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
146 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
150 ban_state = ctdb->nodes[culprit]->ban_state;
151 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
152 /* this was the first time in a long while this node
153 misbehaved so we will forgive any old transgressions.
155 ban_state->count = 0;
158 ban_state->count += count;
159 ban_state->last_reported_time = timeval_current();
160 rec->last_culprit_node = culprit;
164 remember the trouble maker
166 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
168 ctdb_set_culprit_count(rec, culprit, 1);
172 /* this callback is called for every node that failed to execute the
175 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
177 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
179 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
181 ctdb_set_culprit(rec, node_pnn);
185 run the "startrecovery" eventscript on all nodes
187 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
191 struct ctdb_context *ctdb = rec->ctdb;
193 tmp_ctx = talloc_new(ctdb);
194 CTDB_NO_MEMORY(ctdb, tmp_ctx);
196 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
197 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
199 CONTROL_TIMEOUT(), false, tdb_null,
201 startrecovery_fail_callback,
203 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
204 talloc_free(tmp_ctx);
208 talloc_free(tmp_ctx);
212 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
214 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
215 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
218 if (node_pnn < ctdb->num_nodes) {
219 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
224 update the node capabilities for all connected nodes
226 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
231 tmp_ctx = talloc_new(ctdb);
232 CTDB_NO_MEMORY(ctdb, tmp_ctx);
234 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
235 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
239 async_getcap_callback, NULL,
241 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
242 talloc_free(tmp_ctx);
246 talloc_free(tmp_ctx);
250 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
252 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
254 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
255 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
258 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
260 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
262 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
263 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
267 change recovery mode on all nodes
269 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
275 tmp_ctx = talloc_new(ctdb);
276 CTDB_NO_MEMORY(ctdb, tmp_ctx);
278 /* freeze all nodes */
279 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
280 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
283 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
284 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
289 set_recmode_fail_callback,
291 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
292 talloc_free(tmp_ctx);
299 data.dsize = sizeof(uint32_t);
300 data.dptr = (unsigned char *)&rec_mode;
302 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
308 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
309 talloc_free(tmp_ctx);
313 talloc_free(tmp_ctx);
318 change recovery master on all node
320 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
326 tmp_ctx = talloc_new(ctdb);
327 CTDB_NO_MEMORY(ctdb, tmp_ctx);
329 data.dsize = sizeof(uint32_t);
330 data.dptr = (unsigned char *)&pnn;
332 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
333 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
335 CONTROL_TIMEOUT(), false, data,
338 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
339 talloc_free(tmp_ctx);
343 talloc_free(tmp_ctx);
347 /* update all remote nodes to use the same db priority that we have
348 this can fail if the remove node has not yet been upgraded to
349 support this function, so we always return success and never fail
350 a recovery if this call fails.
352 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
353 struct ctdb_node_map *nodemap,
354 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
359 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
361 /* step through all local databases */
362 for (db=0; db<dbmap->num;db++) {
364 struct ctdb_db_priority db_prio;
367 db_prio.db_id = dbmap->dbs[db].dbid;
368 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
370 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
374 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
376 data.dptr = (uint8_t *)&db_prio;
377 data.dsize = sizeof(db_prio);
379 if (ctdb_client_async_control(ctdb,
380 CTDB_CONTROL_SET_DB_PRIORITY,
382 CONTROL_TIMEOUT(), false, data,
385 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
393 ensure all other nodes have attached to any databases that we have
395 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
396 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
399 struct ctdb_dbid_map *remote_dbmap;
401 /* verify that all other nodes have all our databases */
402 for (j=0; j<nodemap->num; j++) {
403 /* we dont need to ourself ourselves */
404 if (nodemap->nodes[j].pnn == pnn) {
407 /* dont check nodes that are unavailable */
408 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
412 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
413 mem_ctx, &remote_dbmap);
415 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
419 /* step through all local databases */
420 for (db=0; db<dbmap->num;db++) {
424 for (i=0;i<remote_dbmap->num;i++) {
425 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
429 /* the remote node already have this database */
430 if (i!=remote_dbmap->num) {
433 /* ok so we need to create this database */
434 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
437 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
440 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
441 mem_ctx, name, dbmap->dbs[db].persistent);
443 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
454 ensure we are attached to any databases that anyone else is attached to
456 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
457 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
460 struct ctdb_dbid_map *remote_dbmap;
462 /* verify that we have all database any other node has */
463 for (j=0; j<nodemap->num; j++) {
464 /* we dont need to ourself ourselves */
465 if (nodemap->nodes[j].pnn == pnn) {
468 /* dont check nodes that are unavailable */
469 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
473 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
474 mem_ctx, &remote_dbmap);
476 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
480 /* step through all databases on the remote node */
481 for (db=0; db<remote_dbmap->num;db++) {
484 for (i=0;i<(*dbmap)->num;i++) {
485 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
489 /* we already have this db locally */
490 if (i!=(*dbmap)->num) {
493 /* ok so we need to create this database and
496 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
497 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
499 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
500 nodemap->nodes[j].pnn));
503 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
504 remote_dbmap->dbs[db].persistent);
506 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
509 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
511 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
522 pull the remote database contents from one node into the recdb
524 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
525 struct tdb_wrap *recdb, uint32_t dbid,
530 struct ctdb_marshall_buffer *reply;
531 struct ctdb_rec_data *rec;
533 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
535 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
536 CONTROL_TIMEOUT(), &outdata);
538 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
539 talloc_free(tmp_ctx);
543 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
545 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
546 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
547 talloc_free(tmp_ctx);
551 rec = (struct ctdb_rec_data *)&reply->data[0];
555 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
557 struct ctdb_ltdb_header *hdr;
560 key.dptr = &rec->data[0];
561 key.dsize = rec->keylen;
562 data.dptr = &rec->data[key.dsize];
563 data.dsize = rec->datalen;
565 hdr = (struct ctdb_ltdb_header *)data.dptr;
567 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
568 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
569 talloc_free(tmp_ctx);
573 /* fetch the existing record, if any */
574 existing = tdb_fetch(recdb->tdb, key);
576 if (existing.dptr != NULL) {
577 struct ctdb_ltdb_header header;
578 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
579 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
580 (unsigned)existing.dsize, srcnode));
582 talloc_free(tmp_ctx);
585 header = *(struct ctdb_ltdb_header *)existing.dptr;
587 if (!(header.rsn < hdr->rsn ||
588 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
593 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
594 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
595 talloc_free(tmp_ctx);
600 talloc_free(tmp_ctx);
606 pull all the remote database contents into the recdb
608 static int pull_remote_database(struct ctdb_context *ctdb,
609 struct ctdb_recoverd *rec,
610 struct ctdb_node_map *nodemap,
611 struct tdb_wrap *recdb, uint32_t dbid,
616 /* pull all records from all other nodes across onto this node
617 (this merges based on rsn)
619 for (j=0; j<nodemap->num; j++) {
620 /* dont merge from nodes that are unavailable */
621 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
624 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
625 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
626 nodemap->nodes[j].pnn));
627 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
637 update flags on all active nodes
639 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
643 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
645 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
653 ensure all nodes have the same vnnmap we do
655 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
656 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
660 /* push the new vnn map out to all the nodes */
661 for (j=0; j<nodemap->num; j++) {
662 /* dont push to nodes that are unavailable */
663 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
667 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
669 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
679 struct vacuum_info *next, *prev;
680 struct ctdb_recoverd *rec;
682 struct ctdb_db_context *ctdb_db;
683 struct ctdb_marshall_buffer *recs;
684 struct ctdb_rec_data *r;
687 static void vacuum_fetch_next(struct vacuum_info *v);
690 called when a vacuum fetch has completed - just free it and do the next one
692 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
694 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
696 vacuum_fetch_next(v);
701 process the next element from the vacuum list
703 static void vacuum_fetch_next(struct vacuum_info *v)
705 struct ctdb_call call;
706 struct ctdb_rec_data *r;
708 while (v->recs->count) {
709 struct ctdb_client_call_state *state;
711 struct ctdb_ltdb_header *hdr;
714 call.call_id = CTDB_NULL_FUNC;
715 call.flags = CTDB_IMMEDIATE_MIGRATION;
718 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
721 call.key.dptr = &r->data[0];
722 call.key.dsize = r->keylen;
724 /* ensure we don't block this daemon - just skip a record if we can't get
726 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
730 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
731 if (data.dptr == NULL) {
732 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
736 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
738 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
742 hdr = (struct ctdb_ltdb_header *)data.dptr;
743 if (hdr->dmaster == v->rec->ctdb->pnn) {
744 /* its already local */
746 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
752 state = ctdb_call_send(v->ctdb_db, &call);
753 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
755 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
759 state->async.fn = vacuum_fetch_callback;
760 state->async.private_data = v;
769 destroy a vacuum info structure
771 static int vacuum_info_destructor(struct vacuum_info *v)
773 DLIST_REMOVE(v->rec->vacuum_info, v);
779 handler for vacuum fetch
781 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
782 TDB_DATA data, void *private_data)
784 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
785 struct ctdb_marshall_buffer *recs;
787 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
789 struct ctdb_dbid_map *dbmap=NULL;
790 bool persistent = false;
791 struct ctdb_db_context *ctdb_db;
792 struct ctdb_rec_data *r;
794 struct vacuum_info *v;
796 recs = (struct ctdb_marshall_buffer *)data.dptr;
797 r = (struct ctdb_rec_data *)&recs->data[0];
799 if (recs->count == 0) {
800 talloc_free(tmp_ctx);
806 for (v=rec->vacuum_info;v;v=v->next) {
807 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
808 /* we're already working on records from this node */
809 talloc_free(tmp_ctx);
814 /* work out if the database is persistent */
815 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
817 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
818 talloc_free(tmp_ctx);
822 for (i=0;i<dbmap->num;i++) {
823 if (dbmap->dbs[i].dbid == recs->db_id) {
824 persistent = dbmap->dbs[i].persistent;
828 if (i == dbmap->num) {
829 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
830 talloc_free(tmp_ctx);
834 /* find the name of this database */
835 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
836 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
837 talloc_free(tmp_ctx);
842 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
843 if (ctdb_db == NULL) {
844 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
845 talloc_free(tmp_ctx);
849 v = talloc_zero(rec, struct vacuum_info);
851 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
852 talloc_free(tmp_ctx);
857 v->srcnode = srcnode;
858 v->ctdb_db = ctdb_db;
859 v->recs = talloc_memdup(v, recs, data.dsize);
860 if (v->recs == NULL) {
861 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
863 talloc_free(tmp_ctx);
866 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
868 DLIST_ADD(rec->vacuum_info, v);
870 talloc_set_destructor(v, vacuum_info_destructor);
872 vacuum_fetch_next(v);
873 talloc_free(tmp_ctx);
878 called when ctdb_wait_timeout should finish
880 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
881 struct timeval yt, void *p)
883 uint32_t *timed_out = (uint32_t *)p;
888 wait for a given number of seconds
890 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
892 uint32_t timed_out = 0;
893 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
895 event_loop_once(ctdb->ev);
900 called when an election times out (ends)
902 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
903 struct timeval t, void *p)
905 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
906 rec->election_timeout = NULL;
908 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
913 wait for an election to finish. It finished election_timeout seconds after
914 the last election packet is received
916 static void ctdb_wait_election(struct ctdb_recoverd *rec)
918 struct ctdb_context *ctdb = rec->ctdb;
919 while (rec->election_timeout) {
920 event_loop_once(ctdb->ev);
925 Update our local flags from all remote connected nodes.
926 This is only run when we are or we belive we are the recovery master
928 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
931 struct ctdb_context *ctdb = rec->ctdb;
932 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
934 /* get the nodemap for all active remote nodes and verify
935 they are the same as for this node
937 for (j=0; j<nodemap->num; j++) {
938 struct ctdb_node_map *remote_nodemap=NULL;
941 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
944 if (nodemap->nodes[j].pnn == ctdb->pnn) {
948 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
949 mem_ctx, &remote_nodemap);
951 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
952 nodemap->nodes[j].pnn));
953 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
954 talloc_free(mem_ctx);
955 return MONITOR_FAILED;
957 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
958 /* We should tell our daemon about this so it
959 updates its flags or else we will log the same
960 message again in the next iteration of recovery.
961 Since we are the recovery master we can just as
962 well update the flags on all nodes.
964 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
966 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
970 /* Update our local copy of the flags in the recovery
973 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
974 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
975 nodemap->nodes[j].flags));
976 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
978 talloc_free(remote_nodemap);
980 talloc_free(mem_ctx);
985 /* Create a new random generation ip.
986 The generation id can not be the INVALID_GENERATION id
988 static uint32_t new_generation(void)
993 generation = random();
995 if (generation != INVALID_GENERATION) {
1005 create a temporary working database
1007 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1010 struct tdb_wrap *recdb;
1013 /* open up the temporary recovery database */
1014 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1015 ctdb->db_directory_state,
1022 tdb_flags = TDB_NOLOCK;
1023 if (ctdb->valgrinding) {
1024 tdb_flags |= TDB_NOMMAP;
1026 tdb_flags |= TDB_DISALLOW_NESTING;
1028 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1029 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1030 if (recdb == NULL) {
1031 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1041 a traverse function for pulling all relevent records from recdb
1044 struct ctdb_context *ctdb;
1045 struct ctdb_marshall_buffer *recdata;
1051 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1053 struct recdb_data *params = (struct recdb_data *)p;
1054 struct ctdb_rec_data *rec;
1055 struct ctdb_ltdb_header *hdr;
1057 /* skip empty records */
1058 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1062 /* update the dmaster field to point to us */
1063 hdr = (struct ctdb_ltdb_header *)data.dptr;
1064 if (!params->persistent) {
1065 hdr->dmaster = params->ctdb->pnn;
1068 /* add the record to the blob ready to send to the nodes */
1069 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1071 params->failed = true;
1074 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1075 if (params->recdata == NULL) {
1076 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1077 rec->length + params->len, params->recdata->count));
1078 params->failed = true;
1081 params->recdata->count++;
1082 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1083 params->len += rec->length;
1090 push the recdb database out to all nodes
1092 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1094 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1096 struct recdb_data params;
1097 struct ctdb_marshall_buffer *recdata;
1099 TALLOC_CTX *tmp_ctx;
1102 tmp_ctx = talloc_new(ctdb);
1103 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1105 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1106 CTDB_NO_MEMORY(ctdb, recdata);
1108 recdata->db_id = dbid;
1111 params.recdata = recdata;
1112 params.len = offsetof(struct ctdb_marshall_buffer, data);
1113 params.failed = false;
1114 params.persistent = persistent;
1116 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1117 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1118 talloc_free(params.recdata);
1119 talloc_free(tmp_ctx);
1123 if (params.failed) {
1124 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1125 talloc_free(params.recdata);
1126 talloc_free(tmp_ctx);
1130 recdata = params.recdata;
1132 outdata.dptr = (void *)recdata;
1133 outdata.dsize = params.len;
1135 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1136 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1138 CONTROL_TIMEOUT(), false, outdata,
1141 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1142 talloc_free(recdata);
1143 talloc_free(tmp_ctx);
1147 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1148 dbid, recdata->count));
1150 talloc_free(recdata);
1151 talloc_free(tmp_ctx);
1158 go through a full recovery on one database
1160 static int recover_database(struct ctdb_recoverd *rec,
1161 TALLOC_CTX *mem_ctx,
1165 struct ctdb_node_map *nodemap,
1166 uint32_t transaction_id)
1168 struct tdb_wrap *recdb;
1170 struct ctdb_context *ctdb = rec->ctdb;
1172 struct ctdb_control_wipe_database w;
1175 recdb = create_recdb(ctdb, mem_ctx);
1176 if (recdb == NULL) {
1180 /* pull all remote databases onto the recdb */
1181 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1183 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1187 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1189 /* wipe all the remote databases. This is safe as we are in a transaction */
1191 w.transaction_id = transaction_id;
1193 data.dptr = (void *)&w;
1194 data.dsize = sizeof(w);
1196 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1197 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1199 CONTROL_TIMEOUT(), false, data,
1202 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1207 /* push out the correct database. This sets the dmaster and skips
1208 the empty records */
1209 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1215 /* all done with this database */
1222 reload the nodes file
1224 static void reload_nodes_file(struct ctdb_context *ctdb)
1227 ctdb_load_nodes_file(ctdb);
1230 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1231 struct ctdb_recoverd *rec,
1232 struct ctdb_node_map *nodemap,
1238 if (ctdb->num_nodes != nodemap->num) {
1239 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1240 ctdb->num_nodes, nodemap->num));
1242 *culprit = ctdb->pnn;
1247 for (j=0; j<nodemap->num; j++) {
1248 /* release any existing data */
1249 if (ctdb->nodes[j]->known_public_ips) {
1250 talloc_free(ctdb->nodes[j]->known_public_ips);
1251 ctdb->nodes[j]->known_public_ips = NULL;
1253 if (ctdb->nodes[j]->available_public_ips) {
1254 talloc_free(ctdb->nodes[j]->available_public_ips);
1255 ctdb->nodes[j]->available_public_ips = NULL;
1258 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1262 /* grab a new shiny list of public ips from the node */
1263 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1265 ctdb->nodes[j]->pnn,
1268 &ctdb->nodes[j]->known_public_ips);
1270 DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
1271 ctdb->nodes[j]->pnn));
1273 *culprit = ctdb->nodes[j]->pnn;
1278 if (rec->ip_check_disable_ctx == NULL) {
1279 if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
1280 DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
1281 rec->need_takeover_run = true;
1285 /* grab a new shiny list of public ips from the node */
1286 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1288 ctdb->nodes[j]->pnn,
1290 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1291 &ctdb->nodes[j]->available_public_ips);
1293 DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
1294 ctdb->nodes[j]->pnn));
1296 *culprit = ctdb->nodes[j]->pnn;
1305 /* when we start a recovery, make sure all nodes use the same reclock file
1308 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1310 struct ctdb_context *ctdb = rec->ctdb;
1311 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1315 if (ctdb->recovery_lock_file == NULL) {
1319 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1320 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1323 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1324 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1330 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1331 talloc_free(tmp_ctx);
1335 talloc_free(tmp_ctx);
1341 we are the recmaster, and recovery is needed - start a recovery run
/* Perform a full cluster recovery. Called only while this node is the
 * recovery master and a recovery is needed.
 *
 * High-level sequence: ban repeat-offender nodes, take the recovery
 * lock, create missing databases everywhere, freeze the cluster
 * (recovery mode ACTIVE), pull/push database contents under a
 * transaction, build and distribute a new vnnmap, thaw the cluster,
 * re-run the public IP takeover, and finally fire the "recovered"
 * event and notify clients.
 *
 *  rec     - recovery daemon state
 *  mem_ctx - talloc context for temporary allocations
 *  pnn     - our own physical node number
 *  nodemap - current node map as seen by this node
 *  vnnmap  - current vnnmap; generation is bumped in place
 *
 * NOTE(review): error paths are not all visible in this excerpt;
 * assumed to return non-zero per the file's convention. */
1343 static int do_recovery(struct ctdb_recoverd *rec,
1344 		       TALLOC_CTX *mem_ctx, uint32_t pnn,
1345 		       struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1347 	struct ctdb_context *ctdb = rec->ctdb;
1349 	uint32_t generation;
1350 	struct ctdb_dbid_map *dbmap;
1353 	struct timeval start_time;
1354 	uint32_t culprit = (uint32_t)-1;
1356 	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1358 	/* if recovery fails, force it again */
1359 	rec->need_recovery = true;
/* Phase 1: ban any node that has been blamed for too many recent
 * recoveries (threshold: twice the cluster size). */
1361 	for (i=0; i<ctdb->num_nodes; i++) {
1362 		struct ctdb_banning_state *ban_state;
1364 		if (ctdb->nodes[i]->ban_state == NULL) {
1367 		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1368 		if (ban_state->count < 2*ctdb->num_nodes) {
1371 		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1372 			ctdb->nodes[i]->pnn, ban_state->count,
1373 			ctdb->tunable.recovery_ban_period));
1374 		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
/* Reset so the node is not instantly re-banned after the ban expires. */
1375 		ban_state->count = 0;
/* Phase 2: take the cluster-wide recovery lock (if configured) so two
 * recovery masters can never run a recovery concurrently. */
1379 	if (ctdb->tunable.verify_recovery_lock != 0) {
1380 		DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1381 		start_time = timeval_current();
1382 		if (!ctdb_recovery_lock(ctdb, true)) {
/* Blame ourselves: failing to get the lock counts against this node. */
1383 			ctdb_set_culprit(rec, pnn);
1384 			DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1387 		ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1388 		DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1391 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1393 	/* get a list of all databases */
1394 	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1396 		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1400 	/* we do the db creation before we set the recovery mode, so the freeze happens
1401 	   on all databases we will be dealing with. */
1403 	/* verify that we have all the databases any other node has */
1404 	ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1406 		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1410 	/* verify that all other nodes have all our databases */
1411 	ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1413 		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1416 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1418 	/* update the database priority for all remote databases */
1419 	ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1421 		DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1423 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1426 	/* update all other nodes to use the same setting for reclock files
1427 	   as the local recovery master.
/* Best effort - the return value is deliberately not checked here. */
1429 	sync_recovery_lock_file_across_cluster(rec);
/* Phase 3: freeze the cluster. */
1431 	/* set recovery mode to active on all nodes */
1432 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1434 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1438 	/* execute the "startrecovery" event script on all nodes */
1439 	ret = run_startrecovery_eventscript(rec, nodemap);
1441 		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1446 	  update all nodes to have the same flags that we have
/* First flags sync: done before the database work so all nodes agree
 * on node state during recovery; repeated again near the end. */
1448 	for (i=0;i<nodemap->num;i++) {
1449 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1453 		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1455 			DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1460 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1462 	/* pick a new generation number */
1463 	generation = new_generation();
1465 	/* change the vnnmap on this node to use the new generation
1466 	   number but not on any other nodes.
1467 	   this guarantees that if we abort the recovery prematurely
1468 	   for some reason (a node stops responding?)
1469 	   that we can just return immediately and we will reenter
1470 	   recovery shortly again.
1471 	   I.e. we deliberately leave the cluster with an inconsistent
1472 	   generation id to allow us to abort recovery at any stage and
1473 	   just restart it from scratch.
1475 	vnnmap->generation = generation;
1476 	ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1478 		DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* Phase 4: recover database contents inside a cluster-wide
 * transaction keyed on the new generation number. */
1482 	data.dptr = (void *)&generation;
1483 	data.dsize = sizeof(uint32_t);
1485 	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1486 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1488 					CONTROL_TIMEOUT(), false, data,
1490 					transaction_start_fail_callback,
1492 		DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
/* Roll back any partially started transactions before aborting. */
1493 		if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1495 					CONTROL_TIMEOUT(), false, tdb_null,
1499 			DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1504 	DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* Pull/push every database so all nodes hold identical contents. */
1506 	for (i=0;i<dbmap->num;i++) {
1507 		ret = recover_database(rec, mem_ctx,
1509 				       dbmap->dbs[i].persistent,
1510 				       pnn, nodemap, generation);
1512 			DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1517 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1519 	/* commit all the changes */
1520 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1522 					CONTROL_TIMEOUT(), false, data,
1525 		DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1529 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1532 	/* update the capabilities for all nodes */
1533 	ret = update_capabilities(ctdb, nodemap);
1535 		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
/* Phase 5: build a fresh vnnmap containing only active nodes that
 * have the LMASTER capability, under yet another new generation. */
1539 	/* build a new vnn map with all the currently active and
1541 	generation = new_generation();
1542 	vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1543 	CTDB_NO_MEMORY(ctdb, vnnmap);
1544 	vnnmap->generation = generation;
1546 	vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1547 	CTDB_NO_MEMORY(ctdb, vnnmap->map);
1548 	for (i=j=0;i<nodemap->num;i++) {
1549 		if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1552 		if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1553 			/* this node can not be an lmaster */
1554 			DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1559 		vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1560 		CTDB_NO_MEMORY(ctdb, vnnmap->map);
1561 		vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* Degenerate case: no lmaster-capable node found - fall back to
 * putting only ourselves (the recmaster) in the map. */
1564 	if (vnnmap->size == 0) {
1565 		DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1567 		vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1568 		CTDB_NO_MEMORY(ctdb, vnnmap->map);
1569 		vnnmap->map[0] = pnn;
1572 	/* update to the new vnnmap on all nodes */
1573 	ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1575 		DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1579 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1581 	/* update recmaster to point to us for all nodes */
1582 	ret = set_recovery_master(ctdb, nodemap, pnn);
1584 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1588 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1591 	  update all nodes to have the same flags that we have
/* Second flags sync: flags may have changed during the recovery. */
1593 	for (i=0;i<nodemap->num;i++) {
1594 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1598 		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1600 			DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1605 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
/* Phase 6: thaw the cluster and redistribute public IP addresses. */
1607 	/* disable recovery mode */
1608 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1610 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1614 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1617 	  tell nodes to takeover their public IPs
1619 	ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1621 		DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1625 	rec->need_takeover_run = false;
1626 	ret = ctdb_takeover_run(ctdb, nodemap);
1628 		DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1631 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1633 	/* execute the "recovered" event script on all nodes */
1634 	ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1636 		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1640 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1642 	/* send a message to all clients telling them that the cluster
1643 	   has been reconfigured */
1644 	ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1646 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1648 	rec->need_recovery = false;
1650 	/* we managed to complete a full recovery, make sure to forgive
1651 	   any past sins by the nodes that could now participate in the
/* Clear the per-node ban counters for all connected nodes. */
1654 	DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1655 	for (i=0;i<nodemap->num;i++) {
1656 		struct ctdb_banning_state *ban_state;
1658 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1662 		ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1663 		if (ban_state == NULL) {
1667 		ban_state->count = 0;
1671 	/* We just finished a recovery successfully.
1672 	   We now wait for rerecovery_timeout before we allow
1673 	   another recovery to take place.
/* NOTE(review): "supressed" typo lives in the runtime log string. */
1675 	DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1676 	ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1677 	DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
1684 elections are won by first checking the number of connected nodes, then
1685 the priority time, then the pnn
/* Payload broadcast on CTDB_SRVID_RECOVERY during a recovery-master
 * election; fields are compared in ctdb_election_win() in the order:
 * connectivity, uptime (priority_time), then pnn as tie-breaker. */
1687 struct election_message {
1688 	uint32_t num_connected; /* how many nodes this candidate can see */
1689 	struct timeval priority_time; /* daemon start time; older wins */
1691 	uint32_t node_flags; /* candidate's own flags (banned/stopped lose) */
1695 form this nodes election data
/* Fill in *em with this node's election credentials (pnn, start time,
 * flags and number of connected nodes). A node lacking the RECMASTER
 * capability deliberately zeroes its score so it can never win. */
1697 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1700 	struct ctdb_node_map *nodemap;
1701 	struct ctdb_context *ctdb = rec->ctdb;
1705 	em->pnn = rec->ctdb->pnn;
1706 	em->priority_time = rec->priority_time;
1708 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1710 		DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
/* Cache our own flags; ctdb_election_win() consults rec->node_flags. */
1714 	rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1715 	em->node_flags = rec->node_flags;
/* Count every node we are not disconnected from (includes ourselves). */
1717 	for (i=0;i<nodemap->num;i++) {
1718 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1719 			em->num_connected++;
1723 	/* we shouldnt try to win this election if we cant be a recmaster */
1724 	if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
/* A current priority_time is the worst possible (youngest) score. */
1725 		em->num_connected = 0;
1726 		em->priority_time = timeval_current();
1729 	talloc_free(nodemap);
1733 see if the given election data wins
/* Decide whether WE beat the election bid in *em.
 * Hard rules first: we lose if we lack the recmaster capability or are
 * banned/stopped; we win if the other node is banned/stopped.
 * Otherwise compare: most connected nodes, then longest-running
 * (earliest priority_time), then lowest pnn as the final tie-break. */
1735 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1737 	struct election_message myem;
1740 	ctdb_election_data(rec, &myem);
1742 	/* we cant win if we dont have the recmaster capability */
1743 	if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1747 	/* we cant win if we are banned */
1748 	if (rec->node_flags & NODE_FLAGS_BANNED) {
1752 	/* we cant win if we are stopped */
1753 	if (rec->node_flags & NODE_FLAGS_STOPPED) {
1757 	/* we will automatically win if the other node is banned */
1758 	if (em->node_flags & NODE_FLAGS_BANNED) {
/* NOTE(review): comment below repeats "banned" but tests STOPPED. */
1762 	/* we will automatically win if the other node is banned */
1763 	if (em->node_flags & NODE_FLAGS_STOPPED) {
1767 	/* try to use the most connected node */
1769 	cmp = (int)myem.num_connected - (int)em->num_connected;
1772 	/* then the longest running node */
1774 		cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* Final deterministic tie-break on node number. */
1778 		cmp = (int)myem.pnn - (int)em->pnn;
1785 send out an election request
/* Broadcast our election bid to all nodes on CTDB_SRVID_RECOVERY.
 * If update_recmaster is true we also optimistically record ourselves
 * as recmaster on the local node (assuming we will win).
 * Returns non-zero if setting the local recmaster fails. */
1787 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1790 	TDB_DATA election_data;
1791 	struct election_message emsg;
1793 	struct ctdb_context *ctdb = rec->ctdb;
1795 	srvid = CTDB_SRVID_RECOVERY;
1797 	ctdb_election_data(rec, &emsg);
1799 	election_data.dsize = sizeof(struct election_message);
1800 	election_data.dptr  = (unsigned char *)&emsg;
1803 	/* send an election message to all active nodes */
1804 	DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1805 	ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1808 	/* A new node that is already frozen has entered the cluster.
1809 	   The existing nodes are not frozen and dont need to be frozen
1810 	   until the election has ended and we start the actual recovery
1812 	if (update_recmaster == true) {
1813 		/* first we assume we will win the election and set
1814 		   recoverymaster to be ourself on the current node
1816 		ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1818 			DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1828 this function will unban all nodes in the cluster
/* Clear the BANNED flag on every connected node in the cluster.
 * Failures from the modflags control are silently ignored (best effort). */
1830 static void unban_all_nodes(struct ctdb_context *ctdb)
1833 	struct ctdb_node_map *nodemap;
1834 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1836 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1838 		DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
/* Only reachable (connected) banned nodes can be unbanned. */
1842 	for (i=0;i<nodemap->num;i++) {
1843 		if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1844 		  && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1845 			ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1849 	talloc_free(tmp_ctx);
1854 we think we are winning the election - send a broadcast election request
/* Timed-event callback fired while we believe we are winning an
 * election: rebroadcast our bid (without touching the local recmaster
 * setting), then clear the one-shot timer so it is not rearmed. */
1856 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1858 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1861 	ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1863 		DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
/* The event has fired; drop our reference so it is not reused. */
1866 	talloc_free(rec->send_election_te);
1867 	rec->send_election_te = NULL;
1871 handler for memory dumps
/* Message handler: another node asked the recovery master for a talloc
 * memory dump. The request payload is a struct rd_memdump_reply giving
 * the pnn/srvid to send the dump back to. All temporary allocations
 * hang off tmp_ctx and are freed on every exit path. */
1873 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1874 			     TDB_DATA data, void *private_data)
1876 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1879 	struct rd_memdump_reply *rd;
/* Validate payload size before casting. */
1881 	if (data.dsize != sizeof(struct rd_memdump_reply)) {
1882 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1883 		talloc_free(tmp_ctx);
1886 	rd = (struct rd_memdump_reply *)data.dptr;
1888 	dump = talloc_zero(tmp_ctx, TDB_DATA);
1890 		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1891 		talloc_free(tmp_ctx);
1894 	ret = ctdb_dump_memory(ctdb, dump);
1896 		DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1897 		talloc_free(tmp_ctx);
1901 	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* Reply to the address embedded in the request. */
1903 	ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1905 		DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1906 		talloc_free(tmp_ctx);
1910 	talloc_free(tmp_ctx);
1914 handler for reload_nodes
/* Message handler: re-read the nodes file on this node when asked to
 * by a "ctdb reloadnodes" style request. */
1916 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1917 			     TDB_DATA data, void *private_data)
1919 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1921 	DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1923 	reload_nodes_file(rec->ctdb);
/* Timed-event callback: the "disable ip check" window has expired -
 * freeing ip_check_disable_ctx (which owns this very timer) re-enables
 * the periodic public IP verification. */
1927 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1928 		struct timeval yt, void *p)
1930 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1932 	talloc_free(rec->ip_check_disable_ctx);
1933 	rec->ip_check_disable_ctx = NULL;
/* Message handler: a node reports an updated public IP assignment.
 * Only the current recovery master records it (in the IP assignment
 * tree); other nodes ignore the message. */
1937 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
1938 			     TDB_DATA data, void *private_data)
1940 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1941 	struct ctdb_public_ip *ip;
1943 	if (rec->recmaster != rec->ctdb->pnn) {
1944 		DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
/* Validate payload size before casting. */
1948 	if (data.dsize != sizeof(struct ctdb_public_ip)) {
1949 		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
1953 	ip = (struct ctdb_public_ip *)data.dptr;
1955 	update_ip_assignment_tree(rec->ctdb, ip);
/* Message handler: temporarily disable the public IP verification for
 * the number of seconds carried in the (uint32_t) payload. The timer
 * that re-enables the check is parented on ip_check_disable_ctx, so
 * freeing that context cancels or resets the window. */
1959 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1960 			     TDB_DATA data, void *private_data)
1962 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* A new request supersedes any window currently in effect. */
1965 	if (rec->ip_check_disable_ctx != NULL) {
1966 		talloc_free(rec->ip_check_disable_ctx);
1967 		rec->ip_check_disable_ctx = NULL;
1970 	if (data.dsize != sizeof(uint32_t)) {
/* NOTE(review): "expexting" typo is in the runtime log string. */
1971 		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1972 				 "expexting %lu\n", (long unsigned)data.dsize,
1973 				 (long unsigned)sizeof(uint32_t)));
1976 	if (data.dptr == NULL) {
/* NOTE(review): "recaived" typo is in the runtime log string. */
1977 		DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1981 	timeout = *((uint32_t *)data.dptr);
1982 	DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
1984 	rec->ip_check_disable_ctx = talloc_new(rec);
1985 	CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
/* reenable_ip_check() fires after 'timeout' seconds and frees the ctx. */
1987 	event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1992 handler for ip reallocate, just add it to the list of callers and
1993 handle this later in the monitor_cluster loop so we do not recurse
1994 with other callers to takeover_run()
/* Message handler for "ctdb ipreallocate": do NOT run the takeover
 * here (that could recurse with other takeover_run() callers); instead
 * queue the caller's reply address on rec->reallocate_callers so
 * process_ipreallocate_requests() can service the batch later from the
 * monitor loop. */
1996 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1997 			     TDB_DATA data, void *private_data)
1999 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2000 	struct ip_reallocate_list *caller;
/* The payload is a reply address (same shape as a memdump reply). */
2002 	if (data.dsize != sizeof(struct rd_memdump_reply)) {
2003 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
/* Lazily create the context that owns the whole pending-caller list. */
2007 	if (rec->ip_reallocate_ctx == NULL) {
2008 		rec->ip_reallocate_ctx = talloc_new(rec);
2009 		CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
2012 	caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
2013 	CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* Take ownership of the message payload and push onto the list head. */
2015 	caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
2016 	caller->next = rec->reallocate_callers;
2017 	rec->reallocate_callers = caller;
/* Drain the queue built by ip_reallocate_handler(): refresh the
 * remote public IP lists, run one takeover pass, then send the result
 * code back to every queued caller that asked for a reply. Called from
 * the monitor loop on the recovery master. */
2022 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
2024 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2027 	struct ip_reallocate_list *callers;
2030 	DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2032 	/* update the list of public ips that a node can handle for
2035 	ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2037 		DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
/* On failure, remember that a takeover run is still owed. */
2039 		rec->need_takeover_run = true;
2042 	ret = ctdb_takeover_run(ctdb, rec->nodemap);
/* NOTE(review): this message looks copy-pasted from the reload-ips
 * failure above; a takeover_run failure message would fit better. */
2044 		DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2046 		rec->need_takeover_run = true;
/* Reply payload is the int32 result code of the takeover run. */
2050 	result.dsize = sizeof(int32_t);
2051 	result.dptr  = (uint8_t *)&ret;
2053 	for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2055 		/* Someone that sent srvid==0 does not want a reply */
2056 		if (callers->rd->srvid == 0) {
2059 		DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2060 				  "%u:%llu\n", (unsigned)callers->rd->pnn,
2061 				  (unsigned long long)callers->rd->srvid));
2062 		ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2064 			DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2065 					 "message to %u:%llu\n",
2066 					 (unsigned)callers->rd->pnn,
2067 					 (unsigned long long)callers->rd->srvid));
/* Freeing ip_reallocate_ctx releases every queued caller at once. */
2071 	talloc_free(tmp_ctx);
2072 	talloc_free(rec->ip_reallocate_ctx);
2073 	rec->ip_reallocate_ctx = NULL;
2074 	rec->reallocate_callers = NULL;
2080 handler for recovery master elections
/* Message handler: an election packet arrived from another node.
 * Restart the election-timeout window, then either contest the
 * election (if we would win) or concede: drop any pending rebroadcast,
 * release the recovery lock if we hold it, and record the sender as
 * recmaster on the local node. */
2082 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2083 			     TDB_DATA data, void *private_data)
2085 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2087 	struct election_message *em = (struct election_message *)data.dptr;
2088 	TALLOC_CTX *mem_ctx;
2090 	/* we got an election packet - update the timeout for the election */
2091 	talloc_free(rec->election_timeout);
2092 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2093 						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2094 						ctdb_election_timeout, rec);
2096 	mem_ctx = talloc_new(ctdb);
2098 	/* someone called an election. check their election data
2099 	   and if we disagree and we would rather be the elected node,
2100 	   send a new election message to all other nodes
2102 	if (ctdb_election_win(rec, em)) {
/* Rebroadcast our own bid after a short (0.5s) delay, once. */
2103 		if (!rec->send_election_te) {
2104 			rec->send_election_te = event_add_timed(ctdb->ev, rec,
2105 								timeval_current_ofs(0, 500000),
2106 								election_send_request, rec);
2108 		talloc_free(mem_ctx);
2109 		/*unban_all_nodes(ctdb);*/
/* We are conceding: cancel any pending rebroadcast of our own bid. */
2114 	talloc_free(rec->send_election_te);
2115 	rec->send_election_te = NULL;
2117 	if (ctdb->tunable.verify_recovery_lock != 0) {
2118 		/* release the recmaster lock */
2119 		if (em->pnn != ctdb->pnn &&
2120 		    ctdb->recovery_lock_fd != -1) {
2121 			close(ctdb->recovery_lock_fd);
2122 			ctdb->recovery_lock_fd = -1;
2123 			unban_all_nodes(ctdb);
2127 	/* ok, let that guy become recmaster then */
2128 	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2130 		DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2131 		talloc_free(mem_ctx);
2135 	talloc_free(mem_ctx);
2141 force the start of the election process
/* Start a recovery-master election: freeze the cluster (recovery mode
 * ACTIVE stops internode traffic), arm the election timeout, broadcast
 * our bid (optimistically claiming recmaster locally), then block in
 * ctdb_wait_election() while responses arrive. */
2143 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2144 			   struct ctdb_node_map *nodemap)
2147 	struct ctdb_context *ctdb = rec->ctdb;
2149 	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2151 	/* set all nodes to recovery mode to stop all internode traffic */
2152 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2154 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (Re)arm the election timeout window. */
2158 	talloc_free(rec->election_timeout);
2159 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2160 						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2161 						ctdb_election_timeout, rec);
/* true: also set ourselves as recmaster on the local node. */
2163 	ret = send_election_request(rec, pnn, true);
2165 		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2169 	/* wait for a few seconds to collect all responses */
2170 	ctdb_wait_election(rec);
2176 handler for when a node changes its flags
/* Message handler: a node's flags changed (ctdb_node_flag_change
 * payload). Updates the locally cached nodemap and, when we are the
 * recmaster in normal mode, schedules a takeover run if the DISABLED
 * flag changed (disconnect/ban cases are handled by recovery itself). */
2178 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2179 			    TDB_DATA data, void *private_data)
2182 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2183 	struct ctdb_node_map *nodemap=NULL;
2184 	TALLOC_CTX *tmp_ctx;
2185 	uint32_t changed_flags;
2187 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2188 	int disabled_flag_changed;
/* Validate payload size before casting. */
2190 	if (data.dsize != sizeof(*c)) {
2191 		DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2195 	tmp_ctx = talloc_new(ctdb);
2196 	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2198 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2200 		DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2201 		talloc_free(tmp_ctx);
/* Locate the entry for the node whose flags changed. */
2206 	for (i=0;i<nodemap->num;i++) {
2207 		if (nodemap->nodes[i].pnn == c->pnn) break;
2210 	if (i == nodemap->num) {
2211 		DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2212 		talloc_free(tmp_ctx);
2216 	changed_flags = c->old_flags ^ c->new_flags;
2218 	if (nodemap->nodes[i].flags != c->new_flags) {
2219 		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* Did the (permanently) DISABLED bit flip relative to our view? */
2222 	disabled_flag_changed =  (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2224 	nodemap->nodes[i].flags = c->new_flags;
/* Refresh our view of recmaster and recmode before deciding. */
2226 	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2227 				     CTDB_CURRENT_NODE, &ctdb->recovery_master);
2230 		ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2231 			     CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2235 	    ctdb->recovery_master == ctdb->pnn &&
2236 	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2237 		/* Only do the takeover run if the perm disabled or unhealthy
2238 		   flags changed since these will cause an ip failover but not
2240 		   If the node became disconnected or banned this will also
2241 		   lead to an ip address failover but that is handled
2244 		if (disabled_flag_changed) {
2245 			rec->need_takeover_run = true;
2249 	talloc_free(tmp_ctx);
2253 handler for when we need to push out flag changes ot all other nodes
/* Message handler: push a node's flag change out to every connected
 * node via CTDB_CONTROL_MODIFY_FLAGS, using the recmaster's nodemap as
 * the authoritative source. */
2255 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2256 			    TDB_DATA data, void *private_data)
2259 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2260 	struct ctdb_node_map *nodemap=NULL;
2261 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2265 	/* find the recovery master */
2266 	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2268 		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2269 		talloc_free(tmp_ctx);
2273 	/* read the node flags from the recmaster */
2274 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2276 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2277 		talloc_free(tmp_ctx);
/* Sanity: the referenced node must exist in the recmaster's map. */
2280 	if (c->pnn >= nodemap->num) {
2281 		DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2282 		talloc_free(tmp_ctx);
2286 	/* send the flags update to all connected nodes */
2287 	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2289 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2290 				      nodes, 0, CONTROL_TIMEOUT(),
2294 		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2296 		talloc_free(tmp_ctx);
2300 	talloc_free(tmp_ctx);
/* Shared state for the async recmode verification: a pending-reply
 * counter (not shown in this excerpt) plus the aggregated result. */
2304 struct verify_recmode_normal_data {
2306 	enum monitor_result status; /* worst result seen across all replies */
/* Async completion callback for one getrecmode reply: downgrade the
 * shared status to FAILED on transport error, or to RECOVERY_NEEDED if
 * the remote node reports it is not in NORMAL mode. */
2311 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2313 	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2314 	/* one more node has responded with recmode data*/
2317 	/* if we failed to get the recmode, then return an error and let
2318 	   the main loop try again.
/* Only overwrite OK - never mask a stronger failure already recorded. */
2320 	if (state->state != CTDB_CONTROL_DONE) {
2321 		if (rmdata->status == MONITOR_OK) {
2322 			rmdata->status = MONITOR_FAILED;
2327 	/* if we got a response, then the recmode will be stored in the
/* state->status carries the remote node's recovery mode. */
2330 	if (state->status != CTDB_RECOVERY_NORMAL) {
2331 		DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2332 		rmdata->status = MONITOR_RECOVERY_NEEDED;
2339 /* verify that all nodes are in normal recovery mode */
/* Verify that every active node is in NORMAL recovery mode.
 * Sends async getrecmode controls to all non-inactive nodes, pumps the
 * event loop until all replies (or timeouts) are in, and returns the
 * aggregated monitor_result. */
2340 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2342 	struct verify_recmode_normal_data *rmdata;
2343 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2344 	struct ctdb_client_control_state *state;
2345 	enum monitor_result status;
2348 	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2349 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2351 	rmdata->status  = MONITOR_OK;
2353 	/* loop over all active nodes and send an async getrecmode call to
2355 	for (j=0; j<nodemap->num; j++) {
2356 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2359 		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2361 					nodemap->nodes[j].pnn);
2362 		if (state == NULL) {
2363 			/* we failed to send the control, treat this as
2364 			   an error and try again next iteration
2366 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2367 			talloc_free(mem_ctx);
2368 			return MONITOR_FAILED;
2371 		/* set up the callback functions */
2372 		state->async.fn = verify_recmode_normal_callback;
2373 		state->async.private_data = rmdata;
2375 		/* one more control to wait for to complete */
2380 	/* now wait for up to the maximum number of seconds allowed
2381 	   or until all nodes we expect a response from has replied
/* Each callback decrements rmdata->count (decrement not shown here). */
2383 	while (rmdata->count > 0) {
2384 		event_loop_once(ctdb->ev);
/* Copy out the result before freeing the context that owns rmdata. */
2387 	status = rmdata->status;
2388 	talloc_free(mem_ctx);
/* Shared state for the async recmaster verification: our expected
 * recmaster pnn, a pending-reply counter (not shown in this excerpt)
 * and the aggregated result. */
2393 struct verify_recmaster_data {
2394 	struct ctdb_recoverd *rec; /* used to record culprits on disagreement */
2397 	enum monitor_result status; /* worst result seen across all replies */
/* Async completion callback for one getrecmaster reply: downgrade the
 * shared status to FAILED on transport error, or to ELECTION_NEEDED
 * (blaming the responder) if it names a different recmaster than us. */
2400 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2402 	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2405 	/* one more node has responded with recmaster data*/
2408 	/* if we failed to get the recmaster, then return an error and let
2409 	   the main loop try again.
/* Only overwrite OK - never mask a stronger failure already recorded. */
2411 	if (state->state != CTDB_CONTROL_DONE) {
2412 		if (rmdata->status == MONITOR_OK) {
2413 			rmdata->status = MONITOR_FAILED;
2418 	/* if we got a response, then the recmaster will be stored in the
/* state->status carries the pnn that node believes is recmaster. */
2421 	if (state->status != rmdata->pnn) {
2422 		DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2423 		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2424 		rmdata->status = MONITOR_ELECTION_NEEDED;
2431 /* verify that all nodes agree that we are the recmaster */
/* Verify that every active node agrees that 'pnn' (us) is recmaster.
 * Mirrors verify_recmode(): async getrecmaster to all non-inactive
 * nodes, pump the event loop until the replies are in, return the
 * aggregated monitor_result (ELECTION_NEEDED on any disagreement). */
2432 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2434 	struct ctdb_context *ctdb = rec->ctdb;
2435 	struct verify_recmaster_data *rmdata;
2436 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2437 	struct ctdb_client_control_state *state;
2438 	enum monitor_result status;
2441 	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2442 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2446 	rmdata->status  = MONITOR_OK;
2448 	/* loop over all active nodes and send an async getrecmaster call to
2450 	for (j=0; j<nodemap->num; j++) {
2451 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2454 		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2456 					nodemap->nodes[j].pnn);
2457 		if (state == NULL) {
2458 			/* we failed to send the control, treat this as
2459 			   an error and try again next iteration
2461 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2462 			talloc_free(mem_ctx);
2463 			return MONITOR_FAILED;
2466 		/* set up the callback functions */
2467 		state->async.fn = verify_recmaster_callback;
2468 		state->async.private_data = rmdata;
2470 		/* one more control to wait for to complete */
2475 	/* now wait for up to the maximum number of seconds allowed
2476 	   or until all nodes we expect a response from has replied
/* Each callback decrements rmdata->count (decrement not shown here). */
2478 	while (rmdata->count > 0) {
2479 		event_loop_once(ctdb->ev);
/* Copy out the result before freeing the context that owns rmdata. */
2482 	status = rmdata->status;
2483 	talloc_free(mem_ctx);
/*
 * Verify that this node's public IP assignment matches what it actually
 * serves: re-reads interfaces and public IPs from the local daemon and, on
 * any inconsistency, asks the recmaster (via CTDB_SRVID_TAKEOVER_RUN) to
 * run an IP takeover.  Uptime is sampled before and after so the check can
 * be skipped if a recovery raced with it.
 *
 * NOTE(review): this extract is lossy — gaps in the embedded original line
 * numbers show that several "if (ret != 0) {" guards, return statements and
 * closing braces are not visible here.  Comments describe only visible code.
 */
2488 /* called to check that the local allocation of public ip addresses is ok.
2490 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn)
2492 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2493 struct ctdb_control_get_ifaces *ifaces = NULL;
2494 struct ctdb_all_public_ips *ips = NULL;
2495 struct ctdb_uptime *uptime1 = NULL;
2496 struct ctdb_uptime *uptime2 = NULL;
2498 bool need_iface_check = false;
2499 bool need_takeover_run = false;
/* first uptime sample: taken before reading the IP list (compared later) */
2501 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2502 CTDB_CURRENT_NODE, &uptime1);
2504 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2505 talloc_free(mem_ctx);
2510 /* read the interfaces from the local node */
2511 ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
2513 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
2514 talloc_free(mem_ctx);
/* an interface change (count or raw content differs from the cached copy
 * in rec->ifaces) forces a takeover run */
2519 need_iface_check = true;
2520 } else if (rec->ifaces->num != ifaces->num) {
2521 need_iface_check = true;
2522 } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
2523 need_iface_check = true;
2526 if (need_iface_check) {
2527 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
2528 "local node %u - force takeover run\n",
2530 need_takeover_run = true;
2533 /* read the ip allocation from the local node */
2534 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2536 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2537 talloc_free(mem_ctx);
/* second uptime sample: taken after reading the IP list */
2541 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2542 CTDB_CURRENT_NODE, &uptime2);
2544 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2545 talloc_free(mem_ctx);
2549 /* skip the check if the startrecovery time has changed */
2550 if (timeval_compare(&uptime1->last_recovery_started,
2551 &uptime2->last_recovery_started) != 0) {
2552 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2553 talloc_free(mem_ctx);
2557 /* skip the check if the endrecovery time has changed */
2558 if (timeval_compare(&uptime1->last_recovery_finished,
2559 &uptime2->last_recovery_finished) != 0) {
2560 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2561 talloc_free(mem_ctx);
2565 /* skip the check if we have started but not finished recovery */
2566 if (timeval_compare(&uptime1->last_recovery_finished,
2567 &uptime1->last_recovery_started) != 1) {
2568 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2569 talloc_free(mem_ctx);
/* cache the freshly-read interface list on rec (old copy freed first) */
2574 talloc_free(rec->ifaces);
2575 rec->ifaces = talloc_steal(rec, ifaces);
2577 /* verify that we have the ip addresses we should have
2578 and we dont have ones we shouldnt have.
2579 if we find an inconsistency we set recmode to
2580 active on the local node and wait for the recmaster
2581 to do a full blown recovery
2583 for (j=0; j<ips->num; j++) {
2584 if (ips->ips[j].pnn == pnn) {
/* IP assigned to us but not present on an interface: inconsistent */
2585 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2586 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2587 ctdb_addr_to_str(&ips->ips[j].addr)));
2588 need_takeover_run = true;
/* IP assigned to another node but still held here: inconsistent */
2591 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2592 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2593 ctdb_addr_to_str(&ips->ips[j].addr)));
2594 need_takeover_run = true;
2599 if (need_takeover_run) {
2600 struct takeover_run_reply rd;
2603 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
/* message the recmaster to perform the takeover run on our behalf */
2607 data.dptr = (uint8_t *)&rd;
2608 data.dsize = sizeof(rd);
2610 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2612 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2615 talloc_free(mem_ctx);
/*
 * Async callback for CTDB_CONTROL_GET_NODEMAP replies: store the nodemap
 * returned by node 'node_pnn' into the remote_nodemaps[] array passed via
 * callback_data.  (extract is lossy: the early-return after the bounds
 * check is not visible here)
 */
2620 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2622 struct ctdb_node_map **remote_nodemaps = callback_data;
/* reject a reply claiming a pnn outside our node array */
2624 if (node_pnn >= ctdb->num_nodes) {
2625 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
/* take talloc ownership of the reply blob so it lives as long as the array */
2629 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Pull the nodemap from every active node in parallel; results land in
 * remote_nodemaps[] via async_getnodemap_callback.  Returns non-zero if
 * any node failed to reply.  (extract is lossy: some argument lines of the
 * async-control call and the return statements are not visible here)
 */
2633 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2634 struct ctdb_node_map *nodemap,
2635 struct ctdb_node_map **remote_nodemaps)
/* broadcast GET_NODEMAP to all active nodes, including this one */
2639 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2640 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2642 CONTROL_TIMEOUT(), false, tdb_null,
2643 async_getnodemap_callback,
2645 remote_nodemaps) != 0) {
2646 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* outcome of the forked recovery-lock checker: still running, verified OK,
 * read failed, or the child hung past the 15s timeout */
2654 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
/*
 * State for one recovery-lock check: the forked child, the pipe back to the
 * parent, and the timeout/fd events watching it.
 * NOTE(review): the extract is missing some field lines (check_recovery_lock
 * references state->fd[] and state->child, declared in the invisible gap).
 */
2655 struct ctdb_check_reclock_state {
2656 struct ctdb_context *ctdb;
/* when the check started; used to report lock latency in the destructor */
2657 struct timeval start_time;
2660 struct timed_event *te;
2661 struct fd_event *fde;
2662 enum reclock_child_status status;
/*
 * talloc destructor for ctdb_check_reclock_state: report how long the lock
 * check took, close both pipe ends, and SIGKILL the child so it cannot
 * outlive the check.  (extract is lossy: the return statement is not
 * visible here)
 */
2665 /* when we free the reclock state we must kill any child process.
2667 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2669 struct ctdb_context *ctdb = state->ctdb;
/* tell the main daemon how long the reclock check took */
2671 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2673 if (state->fd[0] != -1) {
2674 close(state->fd[0]);
2677 if (state->fd[1] != -1) {
2678 close(state->fd[1]);
/* make sure the checker child is gone */
2681 kill(state->child, SIGKILL);
/*
 * Timed-event handler: fires if the check_reclock child has not reported
 * within the timeout (e.g. the cluster filesystem is blocking on I/O).
 * Marks the check as RECLOCK_TIMEOUT so the waiting loop in
 * check_recovery_lock() can proceed.
 */
2686 called if our check_reclock child times out. this would happen if
2687 i/o to the reclock file blocks.
2689 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2690 struct timeval t, void *private_data)
2692 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2693 struct ctdb_check_reclock_state);
2695 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2696 state->status = RECLOCK_TIMEOUT;
/*
 * fd-event handler: the check_reclock child wrote its one-byte verdict to
 * the pipe.  Cancel the timeout, read the byte, and record RECLOCK_OK or
 * RECLOCK_FAILED.  (extract is lossy: variable declarations and a return
 * between the failure and success paths are not visible here)
 */
2699 /* this is called when the child process has completed checking the reclock
2700 file and has written data back to us through the pipe.
2702 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2703 uint16_t flags, void *private_data)
2705 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2706 struct ctdb_check_reclock_state);
2710 /* we got a response from our child process so we can abort the
/* cancel the pending timeout event now that the child has answered */
2713 talloc_free(state->te);
2716 ret = read(state->fd[0], &c, 1);
2717 if (ret != 1 || c != RECLOCK_OK) {
2718 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2719 state->status = RECLOCK_FAILED;
2724 state->status = RECLOCK_OK;
/*
 * Verify that the recovery lock we hold is not stale.  Forks a child that
 * pread()s one byte from the reclock fd (blocking I/O is done in the child
 * so a hung cluster filesystem cannot wedge the recovery daemon), then
 * waits — with a 15s timeout — for the child's verdict over a pipe.
 * On failure the reclock fd is closed so it will be re-taken.
 * NOTE(review): this extract is lossy — several error-path returns,
 * child-exit lines and closing braces are in the invisible gaps.
 */
2728 static int check_recovery_lock(struct ctdb_context *ctdb)
2731 struct ctdb_check_reclock_state *state;
/* remembered so the child can poll for parent death below */
2732 pid_t parent = getpid();
2734 if (ctdb->recovery_lock_fd == -1) {
2735 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2739 state = talloc(ctdb, struct ctdb_check_reclock_state);
2740 CTDB_NO_MEMORY(ctdb, state);
2743 state->start_time = timeval_current();
2744 state->status = RECLOCK_CHECKING;
2748 ret = pipe(state->fd);
2751 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2755 state->child = fork();
2756 if (state->child == (pid_t)-1) {
2757 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2758 close(state->fd[0]);
2760 close(state->fd[1]);
/* child: do the (possibly blocking) read and report one status byte */
2766 if (state->child == 0) {
2767 char cc = RECLOCK_OK;
2768 close(state->fd[0]);
2771 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2772 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2773 cc = RECLOCK_FAILED;
2776 write(state->fd[1], &cc, 1);
2777 /* make sure we die when our parent dies */
2778 while (kill(parent, 0) == 0 || errno != ESRCH) {
2780 write(state->fd[1], &cc, 1);
/* parent: close the write end, keep the read end for the child's reply */
2784 close(state->fd[1]);
2786 set_close_on_exec(state->fd[0]);
2788 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
/* destructor kills the child and closes fds if we bail out early */
2790 talloc_set_destructor(state, check_reclock_destructor);
2792 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2793 ctdb_check_reclock_timeout, state);
2794 if (state->te == NULL) {
2795 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2800 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2801 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2802 reclock_child_handler,
2805 if (state->fde == NULL) {
2806 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
/* pump the event loop until the child replies or the timeout fires */
2811 while (state->status == RECLOCK_CHECKING) {
2812 event_loop_once(ctdb->ev);
2815 if (state->status == RECLOCK_FAILED) {
2816 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
/* drop the stale lock fd; the caller will trigger a recovery */
2817 close(ctdb->recovery_lock_fd);
2818 ctdb->recovery_lock_fd = -1;
/*
 * Re-read the configured reclock file path from the main daemon and bring
 * our cached copy (ctdb->recovery_lock_file / recovery_lock_fd) in sync:
 * handle the file being disabled, set for the first time, unchanged, or
 * changed to a new path.  Any open fd for a stale path is closed.
 * NOTE(review): extract is lossy — the return statements after each
 * talloc_free(tmp_ctx) are in the invisible gaps.
 */
2827 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2829 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2830 const char *reclockfile;
2832 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2833 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2834 talloc_free(tmp_ctx);
/* case 1: reclock has been disabled on the daemon side */
2838 if (reclockfile == NULL) {
2839 if (ctdb->recovery_lock_file != NULL) {
2840 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2841 talloc_free(ctdb->recovery_lock_file);
2842 ctdb->recovery_lock_file = NULL;
2843 if (ctdb->recovery_lock_fd != -1) {
2844 close(ctdb->recovery_lock_fd);
2845 ctdb->recovery_lock_fd = -1;
/* no file means nothing to verify */
2848 ctdb->tunable.verify_recovery_lock = 0;
2849 talloc_free(tmp_ctx);
/* case 2: we had no reclock path cached yet — adopt the new one */
2853 if (ctdb->recovery_lock_file == NULL) {
2854 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2855 if (ctdb->recovery_lock_fd != -1) {
2856 close(ctdb->recovery_lock_fd);
2857 ctdb->recovery_lock_fd = -1;
2859 talloc_free(tmp_ctx);
/* case 3: unchanged path — nothing to do */
2864 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2865 talloc_free(tmp_ctx);
/* case 4: path changed — swap in the new path and drop the old fd */
2869 talloc_free(ctdb->recovery_lock_file);
2870 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2871 ctdb->tunable.verify_recovery_lock = 0;
2872 if (ctdb->recovery_lock_fd != -1) {
2873 close(ctdb->recovery_lock_fd);
2874 ctdb->recovery_lock_fd = -1;
2877 talloc_free(tmp_ctx);
/*
 * One iteration of the recovery daemon's monitoring loop: check that the
 * main ctdbd is alive, refresh tunables / pnn / vnnmap / nodemap, make
 * sure a valid and reachable recmaster exists (forcing an election when
 * not), and — when this node IS the recmaster — verify cluster-wide
 * consistency of nodemaps, flags and vnnmaps, triggering do_recovery()
 * or an IP takeover run on any disagreement.
 *
 * NOTE(review): this extract is lossy — gaps in the embedded original line
 * numbers hide many "if (ret != 0) {" guards, early returns, "continue"s
 * and closing braces.  Comments below describe only visible code.
 */
2881 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2882 TALLOC_CTX *mem_ctx)
2885 struct ctdb_node_map *nodemap=NULL;
2886 struct ctdb_node_map *recmaster_nodemap=NULL;
2887 struct ctdb_node_map **remote_nodemaps=NULL;
2888 struct ctdb_vnn_map *vnnmap=NULL;
2889 struct ctdb_vnn_map *remote_vnnmap=NULL;
2890 int32_t debug_level;
2895 /* verify that the main daemon is still running */
2896 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2897 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2901 /* ping the local daemon to tell it we are alive */
2902 ctdb_ctrl_recd_ping(ctdb);
2904 if (rec->election_timeout) {
2905 /* an election is in progress */
2909 /* read the debug level from the parent and update locally */
2910 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2912 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2915 LogLevel = debug_level;
2918 /* We must check if we need to ban a node here but we want to do this
2919 as early as possible so we dont wait until we have pulled the node
2920 map from the local node. thats why we have the hardcoded value 20
2922 for (i=0; i<ctdb->num_nodes; i++) {
2923 struct ctdb_banning_state *ban_state;
2925 if (ctdb->nodes[i]->ban_state == NULL) {
2928 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* ban a node once it has caused 20 or more recent recoveries */
2929 if (ban_state->count < 20) {
2932 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2933 ctdb->nodes[i]->pnn, ban_state->count,
2934 ctdb->tunable.recovery_ban_period));
2935 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2936 ban_state->count = 0;
2939 /* get relevant tunables */
2940 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2942 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2946 /* get the current recovery lock file from the server */
2947 if (update_recovery_lock_file(ctdb) != 0) {
2948 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2952 /* Make sure that if recovery lock verification becomes disabled when
2955 if (ctdb->tunable.verify_recovery_lock == 0) {
2956 if (ctdb->recovery_lock_fd != -1) {
2957 close(ctdb->recovery_lock_fd);
2958 ctdb->recovery_lock_fd = -1;
2962 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2963 if (pnn == (uint32_t)-1) {
2964 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2968 /* get the vnnmap */
2969 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2971 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2976 /* get number of nodes */
2978 talloc_free(rec->nodemap);
2979 rec->nodemap = NULL;
2982 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2984 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2987 nodemap = rec->nodemap;
2989 /* check which node is the recovery master */
2990 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2992 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2996 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2997 if (rec->recmaster != pnn) {
2998 if (rec->ip_reallocate_ctx != NULL) {
2999 talloc_free(rec->ip_reallocate_ctx);
3000 rec->ip_reallocate_ctx = NULL;
3001 rec->reallocate_callers = NULL;
3004 /* if there are takeovers requested, perform it and notify the waiters */
3005 if (rec->reallocate_callers) {
3006 process_ipreallocate_requests(ctdb, rec);
/* no recmaster known yet — start an election */
3009 if (rec->recmaster == (uint32_t)-1) {
3010 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3011 force_election(rec, pnn, nodemap);
3016 /* if the local daemon is STOPPED, we verify that the databases are
3017 also frozen and thet the recmode is set to active
3019 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
3020 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3022 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3024 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3025 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
3027 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3029 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
3032 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3034 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
3041 /* If the local node is stopped, verify we are not the recmaster
3042 and yield this role if so
3044 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
3045 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
3046 force_election(rec, pnn, nodemap);
3050 /* check that we (recovery daemon) and the local ctdb daemon
3051 agrees on whether we are banned or not
3055 /* remember our own node flags */
3056 rec->node_flags = nodemap->nodes[pnn].flags;
3058 /* count how many active nodes there are */
3059 rec->num_active = 0;
3060 rec->num_connected = 0;
3061 for (i=0; i<nodemap->num; i++) {
3062 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3065 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3066 rec->num_connected++;
3071 /* verify that the recmaster node is still active */
3072 for (j=0; j<nodemap->num; j++) {
3073 if (nodemap->nodes[j].pnn==rec->recmaster) {
/* recmaster pnn not found in the nodemap at all */
3078 if (j == nodemap->num) {
3079 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3080 force_election(rec, pnn, nodemap);
3084 /* if recovery master is disconnected we must elect a new recmaster */
3085 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3086 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3087 force_election(rec, pnn, nodemap);
3091 /* grap the nodemap from the recovery master to check if it is banned */
3092 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3093 mem_ctx, &recmaster_nodemap);
3095 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3096 nodemap->nodes[j].pnn));
3101 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3102 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3103 force_election(rec, pnn, nodemap);
3108 /* verify that we have all ip addresses we should have and we dont
3109 * have addresses we shouldnt have.
3111 if (ctdb->do_checkpublicip) {
/* ip_check_disable_ctx non-NULL means the IP check is temporarily off */
3112 if (rec->ip_check_disable_ctx == NULL) {
3113 if (verify_local_ip_allocation(ctdb, rec, pnn) != 0) {
3114 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3120 /* if we are not the recmaster then we do not need to check
3121 if recovery is needed
3123 if (pnn != rec->recmaster) {
/* ----- everything below runs only on the recmaster ----- */
3128 /* ensure our local copies of flags are right */
3129 ret = update_local_flags(rec, nodemap);
3130 if (ret == MONITOR_ELECTION_NEEDED) {
3131 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3132 force_election(rec, pnn, nodemap);
3135 if (ret != MONITOR_OK) {
3136 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3140 if (ctdb->num_nodes != nodemap->num) {
3141 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3142 reload_nodes_file(ctdb);
3146 /* verify that all active nodes agree that we are the recmaster */
3147 switch (verify_recmaster(rec, nodemap, pnn)) {
3148 case MONITOR_RECOVERY_NEEDED:
3149 /* can not happen */
3151 case MONITOR_ELECTION_NEEDED:
3152 force_election(rec, pnn, nodemap);
3156 case MONITOR_FAILED:
3161 if (rec->need_recovery) {
3162 /* a previous recovery didn't finish */
3163 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3167 /* verify that all active nodes are in normal mode
3168 and not in recovery mode
3170 switch (verify_recmode(ctdb, nodemap)) {
3171 case MONITOR_RECOVERY_NEEDED:
3172 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3174 case MONITOR_FAILED:
3176 case MONITOR_ELECTION_NEEDED:
3177 /* can not happen */
3183 if (ctdb->tunable.verify_recovery_lock != 0) {
3184 /* we should have the reclock - check its not stale */
3185 ret = check_recovery_lock(ctdb);
3187 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3188 ctdb_set_culprit(rec, ctdb->pnn);
3189 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3194 /* get the nodemap for all active remote nodes
3196 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3197 if (remote_nodemaps == NULL) {
3198 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3201 for(i=0; i<nodemap->num; i++) {
3202 remote_nodemaps[i] = NULL;
3204 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3205 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3209 /* verify that all other nodes have the same nodemap as we have
3211 for (j=0; j<nodemap->num; j++) {
3212 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3216 if (remote_nodemaps[j] == NULL) {
3217 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3218 ctdb_set_culprit(rec, j);
3223 /* if the nodes disagree on how many nodes there are
3224 then this is a good reason to try recovery
3226 if (remote_nodemaps[j]->num != nodemap->num) {
3227 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3228 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3229 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3230 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3234 /* if the nodes disagree on which nodes exist and are
3235 active, then that is also a good reason to do recovery
3237 for (i=0;i<nodemap->num;i++) {
3238 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3239 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3240 nodemap->nodes[j].pnn, i,
3241 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3242 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3243 do_recovery(rec, mem_ctx, pnn, nodemap,
3249 /* verify the flags are consistent
3251 for (i=0; i<nodemap->num; i++) {
3252 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3256 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3257 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3258 nodemap->nodes[j].pnn,
3259 nodemap->nodes[i].pnn,
3260 remote_nodemaps[j]->nodes[i].flags,
3261 nodemap->nodes[j].flags));
/* a node is authoritative for its own flags; otherwise we are */
3263 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3264 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3265 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3266 do_recovery(rec, mem_ctx, pnn, nodemap,
3270 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3271 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3272 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3273 do_recovery(rec, mem_ctx, pnn, nodemap,
3282 /* there better be the same number of lmasters in the vnn map
3283 as there are active nodes or we will have to do a recovery
3285 if (vnnmap->size != rec->num_active) {
3286 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3287 vnnmap->size, rec->num_active));
3288 ctdb_set_culprit(rec, ctdb->pnn);
3289 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3293 /* verify that all active nodes in the nodemap also exist in
3296 for (j=0; j<nodemap->num; j++) {
3297 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3300 if (nodemap->nodes[j].pnn == pnn) {
3304 for (i=0; i<vnnmap->size; i++) {
3305 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3309 if (i == vnnmap->size) {
3310 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3311 nodemap->nodes[j].pnn));
3312 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3313 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3319 /* verify that all other nodes have the same vnnmap
3320 and are from the same generation
3322 for (j=0; j<nodemap->num; j++) {
3323 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3326 if (nodemap->nodes[j].pnn == pnn) {
3330 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3331 mem_ctx, &remote_vnnmap);
3333 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3334 nodemap->nodes[j].pnn));
3338 /* verify the vnnmap generation is the same */
3339 if (vnnmap->generation != remote_vnnmap->generation) {
3340 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3341 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3342 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3343 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3347 /* verify the vnnmap size is the same */
3348 if (vnnmap->size != remote_vnnmap->size) {
3349 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3350 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3351 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3352 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3356 /* verify the vnnmap is the same */
3357 for (i=0;i<vnnmap->size;i++) {
3358 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3359 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3360 nodemap->nodes[j].pnn));
3361 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3362 do_recovery(rec, mem_ctx, pnn, nodemap,
3369 /* we might need to change who has what IP assigned */
3370 if (rec->need_takeover_run) {
3371 uint32_t culprit = (uint32_t)-1;
/* clear the flag up-front; any failure below re-triggers via recovery */
3373 rec->need_takeover_run = false;
3375 /* update the list of public ips that a node can handle for
3378 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3380 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3382 ctdb_set_culprit(rec, culprit);
3383 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3387 /* execute the "startrecovery" event script on all nodes */
3388 ret = run_startrecovery_eventscript(rec, nodemap);
3390 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3391 ctdb_set_culprit(rec, ctdb->pnn);
3392 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3396 ret = ctdb_takeover_run(ctdb, nodemap);
3398 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3399 ctdb_set_culprit(rec, ctdb->pnn);
3400 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3404 /* execute the "recovered" event script on all nodes */
3405 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3407 // we cant check whether the event completed successfully
3408 // since this script WILL fail if the node is in recovery mode
3409 // and if that race happens, the code here would just cause a second
3410 // cascading recovery.
3412 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3413 ctdb_set_culprit(rec, ctdb->pnn);
3414 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
/*
 * Entry point of the recovery daemon proper: allocate the ctdb_recoverd
 * state, register handlers for every SRVID message the recoverd listens
 * to, then run main_loop() forever, once per recover_interval, each pass
 * on a fresh throw-away talloc context.
 * NOTE(review): extract is lossy — the enclosing while(1) of the loop at
 * the bottom and some error-exit lines are in the invisible gaps.
 */
3421 the main monitoring loop
3423 static void monitor_cluster(struct ctdb_context *ctdb)
3425 struct ctdb_recoverd *rec;
3427 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3429 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3430 CTDB_NO_MEMORY_FATAL(ctdb, rec);
/* priority_time seeds election priority: earlier-started recoverds win */
3434 rec->priority_time = timeval_current();
3436 /* register a message port for sending memory dumps */
3437 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3439 /* register a message port for recovery elections */
3440 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3442 /* when nodes are disabled/enabled */
3443 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3445 /* when we are asked to puch out a flag change */
3446 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3448 /* register a message port for vacuum fetch */
3449 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3451 /* register a message port for reloadnodes */
3452 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3454 /* register a message port for performing a takeover run */
3455 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3457 /* register a message port for disabling the ip check for a short while */
3458 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3460 /* register a message port for updating the recovery daemons node assignment for an ip */
3461 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
/* per-iteration scratch context: everything main_loop allocates on it
 * is released right after the iteration */
3464 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3466 DEBUG(DEBUG_CRIT,(__location__
3467 " Failed to create temp context\n"));
3471 main_loop(ctdb, rec, mem_ctx);
3472 talloc_free(mem_ctx);
3474 /* we only check for recovery once every second */
3475 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
/*
 * fd-event handler on the pipe to the main ctdbd: readable means the
 * parent closed its end (i.e. died), so the recovery daemon logs and
 * exits.  (extract is lossy: the _exit call is not visible here)
 */
3480 event handler for when the main ctdbd dies
3482 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3483 uint16_t flags, void *private_data)
3485 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
/*
 * Timed event in the MAIN daemon (runs every 30s, re-armed at the bottom):
 * if the recovery daemon process has died, shut the whole node down
 * cleanly — stop monitoring/keepalives, release all public IPs, shut down
 * the transport and run the "shutdown" event.
 */
3490 called regularly to verify that the recovery daemon is still running
3492 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3493 struct timeval yt, void *p)
3495 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* kill(pid, 0) only probes for existence; non-zero means recoverd is gone */
3497 if (kill(ctdb->recoverd_pid, 0) != 0) {
3498 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
3500 ctdb_stop_recoverd(ctdb);
3501 ctdb_stop_keepalive(ctdb);
3502 ctdb_stop_monitoring(ctdb);
3503 ctdb_release_all_ips(ctdb);
3504 if (ctdb->methods != NULL) {
3505 ctdb->methods->shutdown(ctdb);
3507 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
/* re-arm the next 30-second check */
3512 event_add_timed(ctdb->ev, ctdb,
3513 timeval_current_ofs(30, 0),
3514 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap any exited children with
 * non-blocking waitpid so forked helpers (e.g. the reclock checker) do not
 * become zombies.  (extract is lossy: the surrounding reap loop and some
 * declarations are in the invisible gaps)
 */
3517 static void recd_sig_child_handler(struct event_context *ev,
3518 struct signal_event *se, int signum, int count,
3522 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3527 pid = waitpid(-1, &status, WNOHANG);
/* ECHILD just means nothing left to reap; anything else is worth logging */
3529 if (errno != ECHILD) {
3530 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3535 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
/*
 * Fork the recovery daemon off the main ctdbd.  The parent keeps a pipe
 * write-end and schedules the 30s ctdb_check_recd watchdog; the child
 * switches to client mode, watches the pipe read-end to detect parent
 * death, installs a SIGCHLD handler and enters monitor_cluster() — which
 * should never return.
 * NOTE(review): extract is lossy — the error returns and the final exit
 * are in the invisible gaps.
 */
3541 startup the recovery daemon as a child of the main ctdb daemon
3543 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3546 struct signal_event *se;
3548 if (pipe(fd) != 0) {
3552 ctdb->ctdbd_pid = getpid();
3554 ctdb->recoverd_pid = fork();
3555 if (ctdb->recoverd_pid == -1) {
/* parent path: remember the child pid, start the watchdog, return */
3559 if (ctdb->recoverd_pid != 0) {
3561 event_add_timed(ctdb->ev, ctdb,
3562 timeval_current_ofs(30, 0),
3563 ctdb_check_recd, ctdb);
/* child path from here on */
/* de-correlate random sequences between daemon processes */
3569 srandom(getpid() ^ time(NULL));
3571 if (switch_from_server_to_client(ctdb) != 0) {
3572 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3576 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* when the parent exits its end of the pipe closes and this fd fires */
3578 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3579 ctdb_recoverd_parent, &fd[0]);
3581 /* set up a handler to pick up sigchld */
3582 se = event_add_signal(ctdb->ev, ctdb,
3584 recd_sig_child_handler,
3587 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3591 monitor_cluster(ctdb);
/* monitor_cluster loops forever; reaching here is a bug */
3593 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3598 shutdown the recovery daemon
3600 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3602 if (ctdb->recoverd_pid == 0) {
3606 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3607 kill(ctdb->recoverd_pid, SIGTERM);