4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
30 #include "dlinklist.h"
33 /* list of "ctdb ipreallocate" processes to call back when we have
34 finished the takeover run.
/* One pending "ctdb ipreallocate" caller; entries form a singly linked
 * list (rec->reallocate_callers) that is answered once the takeover run
 * has finished. */
36 struct ip_reallocate_list {
/* next caller in the list */
37 struct ip_reallocate_list *next;
/* reply token identifying the requesting process */
38 struct rd_memdump_reply *rd;
/* Per-node misbehaviour ("culprit") accounting used to decide banning.
 * NOTE(review): a `count` field is used by ctdb_set_culprit_count() but
 * its declaration is not visible in this extract. */
41 struct ctdb_banning_state {
/* when this node last earned culprit credits; old transgressions are
 * forgiven after tunable.recovery_grace_period seconds */
43 struct timeval last_reported_time;
47 private state of recovery daemon
/* Private state of the recovery daemon. */
49 struct ctdb_recoverd {
50 struct ctdb_context *ctdb;
53 uint32_t num_connected;
/* pnn of the node most recently blamed via ctdb_set_culprit_count() */
54 uint32_t last_culprit_node;
55 struct ctdb_node_map *nodemap;
56 struct timeval priority_time;
/* set when IP allocation is found inconsistent and a takeover run is due */
57 bool need_takeover_run;
60 struct timed_event *send_election_te;
/* non-NULL while an election is in progress; cleared by
 * ctdb_election_timeout() */
61 struct timed_event *election_timeout;
/* DLIST of in-flight vacuum-fetch operations (see vacuum_fetch_handler) */
62 struct vacuum_info *vacuum_info;
63 TALLOC_CTX *ip_reallocate_ctx;
/* callers waiting for a takeover run (see struct ip_reallocate_list) */
64 struct ip_reallocate_list *reallocate_callers;
/* non-NULL while IP verification is disabled */
65 TALLOC_CTX *ip_check_disable_ctx;
66 struct ctdb_control_get_ifaces *ifaces;
67 TALLOC_CTX *deferred_rebalance_ctx;
/* Timeouts for controls sent by the recovery daemon, driven by the
 * RecoverTimeout / RecoverInterval tunables.  Both expand to an absolute
 * timeval and require a local `ctdb` variable in scope. */
70 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
71 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* forward declaration: timed-event handler that restarts the recovery daemon */
73 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
76 ban a node for a period of time
/* Ban a node for ban_time seconds: validate the pnn, then send a
 * SET_BAN control to that node.  Failure to deliver the control is
 * logged but not otherwise acted on. */
78 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
81 struct ctdb_context *ctdb = rec->ctdb;
82 struct ctdb_ban_time bantime;
84 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
/* refuse to ban a pnn that is not in our nodes file */
86 if (!ctdb_validate_pnn(ctdb, pnn)) {
87 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* NOTE(review): the assignment of bantime.pnn is not visible in this
 * extract — presumably bantime.pnn = pnn; confirm against the original */
92 bantime.time = ban_time;
94 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
96 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
102 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
106 run the "recovered" eventscript on all nodes
/* Run the "recovered" event on all active nodes by broadcasting the
 * END_RECOVERY control.  `caller` is only used to attribute the log
 * message on failure.  Returns 0 on success (return statements are not
 * visible in this extract). */
108 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
113 tmp_ctx = talloc_new(ctdb);
114 CTDB_NO_MEMORY(ctdb, tmp_ctx);
/* include_self=true: the control also goes to this node */
116 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
117 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
119 CONTROL_TIMEOUT(), false, tdb_null,
122 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
124 talloc_free(tmp_ctx);
128 talloc_free(tmp_ctx);
133 remember the trouble maker
/* Charge `count` culprit credits to node `culprit`.  Credits are reset
 * if the node has behaved for longer than the RecoveryGracePeriod
 * tunable; the ban-state structure is lazily allocated per node. */
135 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
137 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
138 struct ctdb_banning_state *ban_state;
/* bounds-check the pnn against the nodes array */
140 if (culprit > ctdb->num_nodes) {
141 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
/* first offence by this node: allocate its banning state */
145 if (ctdb->nodes[culprit]->ban_state == NULL) {
146 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
147 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
151 ban_state = ctdb->nodes[culprit]->ban_state;
152 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
153 /* this was the first time in a long while this node
154 misbehaved so we will forgive any old transgressions. */
156 ban_state->count = 0;
159 ban_state->count += count;
160 ban_state->last_reported_time = timeval_current();
161 rec->last_culprit_node = culprit;
165 remember the trouble maker
/* Convenience wrapper: charge a single culprit credit to `culprit`. */
167 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
169 ctdb_set_culprit_count(rec, culprit, 1);
173 /* this callback is called for every node that failed to execute the
/* Async-control fail callback: invoked for every node that failed the
 * "startrecovery" event; marks that node as a culprit (1 credit). */
176 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
178 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
180 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
182 ctdb_set_culprit(rec, node_pnn);
186 run the "startrecovery" eventscript on all nodes
/* Run the "startrecovery" event on all active nodes via the
 * START_RECOVERY control; nodes that fail are marked culprits through
 * startrecovery_fail_callback(). */
188 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
192 struct ctdb_context *ctdb = rec->ctdb;
194 tmp_ctx = talloc_new(ctdb);
195 CTDB_NO_MEMORY(ctdb, tmp_ctx);
197 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
198 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
200 CONTROL_TIMEOUT(), false, tdb_null,
202 startrecovery_fail_callback,
204 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
205 talloc_free(tmp_ctx);
209 talloc_free(tmp_ctx);
/* Per-node callback for GET_CAPABILITIES: validates the payload size,
 * stores the capability mask on the node entry, and mirrors it into
 * ctdb->capabilities when the reply is for this node. */
213 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
215 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
216 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
219 if (node_pnn < ctdb->num_nodes) {
220 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
223 if (node_pnn == ctdb->pnn) {
224 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
229 update the node capabilities for all connected nodes
/* Refresh the capability masks of all active nodes by broadcasting
 * GET_CAPABILITIES; results are recorded by async_getcap_callback(). */
231 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
236 tmp_ctx = talloc_new(ctdb);
237 CTDB_NO_MEMORY(ctdb, tmp_ctx);
239 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
240 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
244 async_getcap_callback, NULL,
246 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
247 talloc_free(tmp_ctx);
251 talloc_free(tmp_ctx);
/* Fail callback for the FREEZE control: a node that cannot be frozen
 * gets nodemap->num culprit credits (a heavy penalty, one per node). */
255 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
257 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
259 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
260 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Fail callback for starting the recovery transaction: penalize the
 * failing node with nodemap->num culprit credits. */
263 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
265 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
267 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
268 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
272 change recovery mode on all nodes
/* Switch the recovery mode on all active nodes.  When entering
 * CTDB_RECOVERY_ACTIVE the databases are first frozen, one FREEZE
 * broadcast per database priority level; then SET_RECMODE is
 * broadcast with the new mode as payload. */
274 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
280 tmp_ctx = talloc_new(ctdb);
281 CTDB_NO_MEMORY(ctdb, tmp_ctx);
283 /* freeze all nodes */
284 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
285 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
/* priorities are 1-based, hence i=1..NUM_DB_PRIORITIES inclusive */
288 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
289 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
294 set_recmode_fail_callback,
296 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
297 talloc_free(tmp_ctx);
/* payload is the raw uint32_t recovery mode */
304 data.dsize = sizeof(uint32_t);
305 data.dptr = (unsigned char *)&rec_mode;
307 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
313 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
314 talloc_free(tmp_ctx);
318 talloc_free(tmp_ctx);
323 change recovery master on all node
/* Tell all active nodes who the recovery master is by broadcasting
 * SET_RECMASTER with the master's pnn as payload. */
325 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
331 tmp_ctx = talloc_new(ctdb);
332 CTDB_NO_MEMORY(ctdb, tmp_ctx);
334 data.dsize = sizeof(uint32_t);
335 data.dptr = (unsigned char *)&pnn;
337 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
338 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
340 CONTROL_TIMEOUT(), false, data,
343 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
344 talloc_free(tmp_ctx);
348 talloc_free(tmp_ctx);
352 /* update all remote nodes to use the same db priority that we have
353 this can fail if the remove node has not yet been upgraded to
354 support this function, so we always return success and never fail
355 a recovery if this call fails.
/* Push this node's database priorities to all active remote nodes.
 * Per the original header comment, remote nodes may predate this
 * control, so failures are logged but never fail a recovery. */
357 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
358 struct ctdb_node_map *nodemap,
359 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
364 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
366 /* step through all local databases */
367 for (db=0; db<dbmap->num;db++) {
369 struct ctdb_db_priority db_prio;
372 db_prio.db_id = dbmap->dbs[db].dbid;
/* read the priority from the local node, then broadcast it */
373 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
375 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
379 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
381 data.dptr = (uint8_t *)&db_prio;
382 data.dsize = sizeof(db_prio);
384 if (ctdb_client_async_control(ctdb,
385 CTDB_CONTROL_SET_DB_PRIORITY,
387 CONTROL_TIMEOUT(), false, data,
390 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
398 ensure all other nodes have attached to any databases that we have
/* Make sure every other active node is attached to every database we
 * have locally: fetch each remote node's dbmap, and create (attach)
 * any local database that is missing there, preserving the
 * persistent flag. */
400 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
401 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
404 struct ctdb_dbid_map *remote_dbmap;
406 /* verify that all other nodes have all our databases */
407 for (j=0; j<nodemap->num; j++) {
408 /* we dont need to check ourselves */
409 if (nodemap->nodes[j].pnn == pnn) {
412 /* dont check nodes that are unavailable */
413 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
417 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
418 mem_ctx, &remote_dbmap);
420 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
424 /* step through all local databases */
425 for (db=0; db<dbmap->num;db++) {
/* linear search for this dbid in the remote node's dbmap */
429 for (i=0;i<remote_dbmap->num;i++) {
430 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
434 /* the remote node already has this database */
435 if (i!=remote_dbmap->num) {
438 /* ok so we need to create this database */
439 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
442 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
445 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
447 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
449 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
460 ensure we are attached to any databases that anyone else is attached to
/* Mirror of create_missing_remote_databases(): make sure this node is
 * attached to every database any other active node has.  After creating
 * a missing local database, the local dbmap is re-read into *dbmap so
 * the caller sees the updated list. */
462 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
463 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
466 struct ctdb_dbid_map *remote_dbmap;
468 /* verify that we have all database any other node has */
469 for (j=0; j<nodemap->num; j++) {
470 /* we dont need to check ourselves */
471 if (nodemap->nodes[j].pnn == pnn) {
474 /* dont check nodes that are unavailable */
475 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
479 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
480 mem_ctx, &remote_dbmap);
482 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
486 /* step through all databases on the remote node */
487 for (db=0; db<remote_dbmap->num;db++) {
/* linear search for the remote dbid in our local dbmap */
490 for (i=0;i<(*dbmap)->num;i++) {
491 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
495 /* we already have this db locally */
496 if (i!=(*dbmap)->num) {
499 /* ok so we need to create this database and
502 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
503 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
505 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
506 nodemap->nodes[j].pnn));
509 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
510 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
512 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our local dbmap now that a new database exists */
515 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
517 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
528 pull the remote database contents from one node into the recdb
/* Pull the contents of database `dbid` from node `srcnode` into the
 * local recovery database `recdb`, merging record by record: an
 * incoming record replaces an existing one only if it has a higher
 * rsn (or same rsn with a dmaster other than the recovery master). */
530 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
531 struct tdb_wrap *recdb, uint32_t dbid)
535 struct ctdb_marshall_buffer *reply;
536 struct ctdb_rec_data *rec;
538 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
540 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
541 CONTROL_TIMEOUT(), &outdata);
543 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
544 talloc_free(tmp_ctx);
548 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity: the reply must at least contain the marshall header */
550 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
551 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
552 talloc_free(tmp_ctx);
/* walk the marshalled records; each record is `length` bytes long */
556 rec = (struct ctdb_rec_data *)&reply->data[0];
560 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
562 struct ctdb_ltdb_header *hdr;
/* key and data are packed back-to-back after the record header */
565 key.dptr = &rec->data[0];
566 key.dsize = rec->keylen;
567 data.dptr = &rec->data[key.dsize];
568 data.dsize = rec->datalen;
570 hdr = (struct ctdb_ltdb_header *)data.dptr;
572 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
573 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
574 talloc_free(tmp_ctx);
578 /* fetch the existing record, if any */
579 existing = tdb_fetch(recdb->tdb, key);
581 if (existing.dptr != NULL) {
582 struct ctdb_ltdb_header header;
583 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
584 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
585 (unsigned)existing.dsize, srcnode));
587 talloc_free(tmp_ctx);
590 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing record unless the incoming one is newer */
592 if (!(header.rsn < hdr->rsn ||
593 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
598 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
599 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
600 talloc_free(tmp_ctx);
605 talloc_free(tmp_ctx);
611 struct pull_seqnum_cbdata {
/* Per-node GET_DB_SEQNUM callback: validate the 8-byte payload and
 * remember the node holding the highest sequence number seen so far. */
617 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
619 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
/* a previous callback already failed the whole operation: ignore */
622 if (cb_data->failed != 0) {
623 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
628 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
633 if (outdata.dsize != sizeof(uint64_t)) {
634 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
635 cb_data->failed = -1;
639 seqnum = *((uint64_t *)outdata.dptr);
/* track the maximum seqnum and which node holds it */
641 if (seqnum > cb_data->seqnum) {
642 cb_data->seqnum = seqnum;
643 cb_data->pnn = node_pnn;
/* Fail callback for GET_DB_SEQNUM: log the failing node (the shared
 * cb_data is obtained; the `failed` update is not visible here). */
647 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
649 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
651 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
/* For a persistent database, broadcast GET_DB_SEQNUM to all active
 * nodes, pick the node holding the highest sequence number, and pull
 * the whole database from that single node only. */
655 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
656 struct ctdb_recoverd *rec,
657 struct ctdb_node_map *nodemap,
658 struct tdb_wrap *recdb, uint32_t dbid)
660 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
664 struct pull_seqnum_cbdata *cb_data;
666 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
/* request payload: the dbid packed into outdata[] (setup lines for
 * outdata are not visible in this extract) */
671 data.dsize = sizeof(outdata);
672 data.dptr = (uint8_t *)&outdata[0];
674 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
675 if (cb_data == NULL) {
676 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
677 talloc_free(tmp_ctx);
685 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
686 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
688 CONTROL_TIMEOUT(), false, data,
692 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
694 talloc_free(tmp_ctx);
698 if (cb_data->failed != 0) {
699 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
700 talloc_free(tmp_ctx);
/* no node reported a usable (non-zero) seqnum */
704 if (cb_data->seqnum == 0 || cb_data->pnn == -1) {
705 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
706 talloc_free(tmp_ctx);
710 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
712 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
713 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
714 talloc_free(tmp_ctx);
718 talloc_free(tmp_ctx);
724 pull all the remote database contents into the recdb
/* Pull a database from the cluster into recdb.  Persistent databases
 * (with RecoverPDBBySeqNum enabled) are taken wholesale from the node
 * with the highest seqnum; otherwise records from every active node
 * are merged by rsn via pull_one_remote_database().  A node that
 * fails the pull is charged nodemap->num culprit credits. */
726 static int pull_remote_database(struct ctdb_context *ctdb,
727 struct ctdb_recoverd *rec,
728 struct ctdb_node_map *nodemap,
729 struct tdb_wrap *recdb, uint32_t dbid,
734 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
736 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
742 /* pull all records from all other nodes across onto this node
743 (this merges based on rsn) */
745 for (j=0; j<nodemap->num; j++) {
746 /* dont merge from nodes that are unavailable */
747 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
750 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
751 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
752 nodemap->nodes[j].pnn));
753 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
763 update flags on all active nodes
/* Propagate node flags cluster-wide via the MODIFY_FLAGS control
 * (set `flags`, clear the complement). */
765 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
769 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
771 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
779 ensure all nodes have the same vnnmap we do
/* Push our vnnmap to every active node, one SETVNNMAP control at a
 * time.  Note: the error message reports `pnn` (this node), not the
 * node the call was addressed to. */
781 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
782 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
786 /* push the new vnn map out to all the nodes */
787 for (j=0; j<nodemap->num; j++) {
788 /* dont push to nodes that are unavailable */
789 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
793 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
795 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* One in-flight vacuum-fetch operation (the struct header line is not
 * visible in this extract); doubly linked into rec->vacuum_info. */
805 struct vacuum_info *next, *prev;
806 struct ctdb_recoverd *rec;
808 struct ctdb_db_context *ctdb_db;
/* the marshalled records still to be migrated; `r` is the cursor */
809 struct ctdb_marshall_buffer *recs;
810 struct ctdb_rec_data *r;
813 static void vacuum_fetch_next(struct vacuum_info *v);
816 called when a vacuum fetch has completed - just free it and do the next one
/* Called when one vacuum fetch call completes; kicks off the next
 * record in the list. */
818 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
820 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
822 vacuum_fetch_next(v);
827 process the next element from the vacuum list
/* Process the next record of a vacuum-fetch list: skip records that
 * cannot be chain-locked without blocking, are missing, malformed, or
 * already local; otherwise issue an async NULL_FUNC migration call and
 * resume from vacuum_fetch_callback() when it completes. */
829 static void vacuum_fetch_next(struct vacuum_info *v)
831 struct ctdb_call call;
832 struct ctdb_rec_data *r;
834 while (v->recs->count) {
835 struct ctdb_client_call_state *state;
837 struct ctdb_ltdb_header *hdr;
/* NULL_FUNC + IMMEDIATE_MIGRATION moves the record without invoking a
 * real call function */
840 call.call_id = CTDB_NULL_FUNC;
841 call.flags = CTDB_IMMEDIATE_MIGRATION;
842 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
/* advance the cursor past the current marshalled record */
845 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
848 call.key.dptr = &r->data[0];
849 call.key.dsize = r->keylen;
851 /* ensure we don't block this daemon - just skip a record if we can't get
853 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
857 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
858 if (data.dptr == NULL) {
859 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
863 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
865 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
869 hdr = (struct ctdb_ltdb_header *)data.dptr;
870 if (hdr->dmaster == v->rec->ctdb->pnn) {
871 /* its already local */
873 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
/* the chain lock is dropped before the async call is awaited */
879 state = ctdb_call_send(v->ctdb_db, &call);
880 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
882 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
886 state->async.fn = vacuum_fetch_callback;
887 state->async.private_data = v;
896 destroy a vacuum info structure
/* talloc destructor: unlink the vacuum_info from rec->vacuum_info
 * when it is freed. */
898 static int vacuum_info_destructor(struct vacuum_info *v)
900 DLIST_REMOVE(v->rec->vacuum_info, v);
906 handler for vacuum fetch
/* Message handler for vacuum-fetch requests: given a marshalled blob
 * of records from a remote node, resolve the database (and whether it
 * is persistent), attach to it, copy the record list into a new
 * vacuum_info, link it into rec->vacuum_info and start migrating the
 * records with vacuum_fetch_next(). */
908 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
909 TDB_DATA data, void *private_data)
911 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
912 struct ctdb_marshall_buffer *recs;
914 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
916 struct ctdb_dbid_map *dbmap=NULL;
917 bool persistent = false;
918 struct ctdb_db_context *ctdb_db;
919 struct ctdb_rec_data *r;
921 struct vacuum_info *v;
923 recs = (struct ctdb_marshall_buffer *)data.dptr;
924 r = (struct ctdb_rec_data *)&recs->data[0];
/* nothing to do for an empty record list */
926 if (recs->count == 0) {
927 talloc_free(tmp_ctx);
/* drop the request if a fetch for this (srcnode, db) pair is already
 * in flight */
933 for (v=rec->vacuum_info;v;v=v->next) {
934 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
935 /* we're already working on records from this node */
936 talloc_free(tmp_ctx);
941 /* work out if the database is persistent */
942 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
944 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
945 talloc_free(tmp_ctx);
949 for (i=0;i<dbmap->num;i++) {
950 if (dbmap->dbs[i].dbid == recs->db_id) {
951 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
955 if (i == dbmap->num) {
956 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
957 talloc_free(tmp_ctx);
961 /* find the name of this database */
962 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
963 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
964 talloc_free(tmp_ctx);
/* attach (or re-attach) to the database before migrating records */
969 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
970 if (ctdb_db == NULL) {
971 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
972 talloc_free(tmp_ctx);
976 v = talloc_zero(rec, struct vacuum_info);
978 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
979 talloc_free(tmp_ctx);
984 v->srcnode = srcnode;
985 v->ctdb_db = ctdb_db;
/* take our own copy of the record blob; `data` belongs to the caller */
986 v->recs = talloc_memdup(v, recs, data.dsize);
987 if (v->recs == NULL) {
988 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
990 talloc_free(tmp_ctx);
993 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
995 DLIST_ADD(rec->vacuum_info, v);
997 talloc_set_destructor(v, vacuum_info_destructor);
999 vacuum_fetch_next(v);
1000 talloc_free(tmp_ctx);
1005 called when ctdb_wait_timeout should finish
/* Timed-event handler for ctdb_wait_timeout(): flips the caller's
 * timed_out flag (the assignment line is not visible in this extract). */
1007 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1008 struct timeval yt, void *p)
1010 uint32_t *timed_out = (uint32_t *)p;
1015 wait for a given number of seconds
/* Block for `secs` seconds (fractional part converted to usecs) while
 * still pumping the event loop, so other events keep being serviced. */
1017 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1019 uint32_t timed_out = 0;
1020 time_t usecs = (secs - (time_t)secs) * 1000000;
1021 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1022 while (!timed_out) {
1023 event_loop_once(ctdb->ev);
1028 called when an election times out (ends)
/* Timed-event handler fired when an election window ends: clears
 * rec->election_timeout, which lets ctdb_wait_election() return. */
1030 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1031 struct timeval t, void *p)
1033 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1034 rec->election_timeout = NULL;
1037 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
1042 wait for an election to finish. It finished election_timeout seconds after
1043 the last election packet is received
/* Pump the event loop until the election finishes, i.e. until
 * ctdb_election_timeout() has cleared rec->election_timeout. */
1045 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1047 struct ctdb_context *ctdb = rec->ctdb;
1048 while (rec->election_timeout) {
1049 event_loop_once(ctdb->ev);
1054 Update our local flags from all remote connected nodes.
1055 This is only run when we are or we belive we are the recovery master
/* As (believed) recovery master, reconcile node flags: fetch each
 * remote node's view of its own flags, push any discrepancy to the
 * whole cluster via MODIFY_FLAGS, and update our local nodemap copy.
 * Returns MONITOR_FAILED when a remote nodemap cannot be fetched
 * (that node is also marked as a culprit). */
1057 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1060 struct ctdb_context *ctdb = rec->ctdb;
1061 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1063 /* get the nodemap for all active remote nodes and verify
1064 they are the same as for this node */
1066 for (j=0; j<nodemap->num; j++) {
1067 struct ctdb_node_map *remote_nodemap=NULL;
/* skip disconnected nodes and ourselves */
1070 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1073 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1077 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1078 mem_ctx, &remote_nodemap);
1080 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1081 nodemap->nodes[j].pnn));
1082 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1083 talloc_free(mem_ctx);
1084 return MONITOR_FAILED;
/* the remote node's own view of its flags wins */
1086 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1087 /* We should tell our daemon about this so it
1088 updates its flags or else we will log the same
1089 message again in the next iteration of recovery.
1090 Since we are the recovery master we can just as
1091 well update the flags on all nodes. */
1093 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
1095 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1099 /* Update our local copy of the flags in the recovery
1102 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1103 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1104 nodemap->nodes[j].flags));
1105 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1107 talloc_free(remote_nodemap);
1109 talloc_free(mem_ctx);
1114 /* Create a new random generation ip.
1115 The generation id can not be the INVALID_GENERATION id
/* Generate a new random database generation id, retrying until it is
 * not INVALID_GENERATION. */
1118 static uint32_t new_generation(void)
1119 uint32_t generation;
1122 generation = random();
1124 if (generation != INVALID_GENERATION) {
1134 create a temporary working database
/* Create the temporary recovery database (recdb.tdb.<pnn-ish suffix>)
 * in the state directory.  Opened O_CREAT|O_EXCL with TDB_NOLOCK
 * (single-process use) plus TDB_NOMMAP under valgrind and
 * TDB_DISALLOW_NESTING.  Returns NULL on failure. */
1136 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1139 struct tdb_wrap *recdb;
1142 /* open up the temporary recovery database */
1143 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1144 ctdb->db_directory_state,
/* no locking needed: only the recovery daemon touches this tdb */
1151 tdb_flags = TDB_NOLOCK;
1152 if (ctdb->valgrinding) {
1153 tdb_flags |= TDB_NOMMAP;
1155 tdb_flags |= TDB_DISALLOW_NESTING;
1157 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1158 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1159 if (recdb == NULL) {
1160 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1170 a traverse function for pulling all relevent records from recdb
/* Traverse state for traverse_recdb() (the struct header line and the
 * `len`, `failed` and `persistent` fields are not visible in this
 * extract). */
1173 struct ctdb_context *ctdb;
/* marshall buffer being grown record by record during traversal */
1174 struct ctdb_marshall_buffer *recdata;
/* tdb_traverse_read callback: append one recdb record to the marshall
 * buffer being built for PUSH_DB.  Skips header-only (empty) records;
 * for non-persistent databases the dmaster is rewritten to this node
 * and the MIGRATED_WITH_DATA flag set.  Sets params->failed on error. */
1180 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1182 struct recdb_data *params = (struct recdb_data *)p;
1183 struct ctdb_rec_data *rec;
1184 struct ctdb_ltdb_header *hdr;
1186 /* skip empty records */
1187 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1191 /* update the dmaster field to point to us */
1192 hdr = (struct ctdb_ltdb_header *)data.dptr;
1193 if (!params->persistent) {
1194 hdr->dmaster = params->ctdb->pnn;
1195 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1198 /* add the record to the blob ready to send to the nodes */
1199 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1201 params->failed = true;
/* grow the blob and append the marshalled record at the current end */
1204 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1205 if (params->recdata == NULL) {
1206 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1207 rec->length + params->len, params->recdata->count));
1208 params->failed = true;
1211 params->recdata->count++;
1212 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1213 params->len += rec->length;
1220 push the recdb database out to all nodes
/* Marshal the whole recovery database (via traverse_recdb) into one
 * blob and broadcast it to all active nodes with PUSH_DB. */
1222 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1224 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1226 struct recdb_data params;
1227 struct ctdb_marshall_buffer *recdata;
1229 TALLOC_CTX *tmp_ctx;
1232 tmp_ctx = talloc_new(ctdb);
1233 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1235 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1236 CTDB_NO_MEMORY(ctdb, recdata);
1238 recdata->db_id = dbid;
/* len starts at the header size; traverse_recdb appends records */
1241 params.recdata = recdata;
1242 params.len = offsetof(struct ctdb_marshall_buffer, data);
1243 params.failed = false;
1244 params.persistent = persistent;
1246 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1247 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1248 talloc_free(params.recdata);
1249 talloc_free(tmp_ctx);
/* the traverse itself succeeded but a record failed to marshal */
1253 if (params.failed) {
1254 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1255 talloc_free(params.recdata);
1256 talloc_free(tmp_ctx);
/* traverse_recdb may have realloc'd the buffer; re-read the pointer */
1260 recdata = params.recdata;
1262 outdata.dptr = (void *)recdata;
1263 outdata.dsize = params.len;
1265 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1266 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1268 CONTROL_TIMEOUT(), false, outdata,
1271 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1272 talloc_free(recdata);
1273 talloc_free(tmp_ctx);
1277 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1278 dbid, recdata->count));
1280 talloc_free(recdata);
1281 talloc_free(tmp_ctx);
1288 go through a full recovery on one database
/* Full recovery of one database: create a temporary recdb, pull and
 * merge the database from all nodes, wipe it cluster-wide inside the
 * recovery transaction, then push the merged contents back out. */
1290 static int recover_database(struct ctdb_recoverd *rec,
1291 TALLOC_CTX *mem_ctx,
1295 struct ctdb_node_map *nodemap,
1296 uint32_t transaction_id)
1298 struct tdb_wrap *recdb;
1300 struct ctdb_context *ctdb = rec->ctdb;
1302 struct ctdb_control_wipe_database w;
1305 recdb = create_recdb(ctdb, mem_ctx);
1306 if (recdb == NULL) {
1310 /* pull all remote databases onto the recdb */
1311 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1313 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1317 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1319 /* wipe all the remote databases. This is safe as we are in a transaction */
1321 w.transaction_id = transaction_id;
1323 data.dptr = (void *)&w;
1324 data.dsize = sizeof(w);
1326 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1327 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1329 CONTROL_TIMEOUT(), false, data,
1332 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1337 /* push out the correct database. This sets the dmaster and skips
1338 the empty records */
1339 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1345 /* all done with this database */
/*
 * Re-read the nodes file into the ctdb context.
 * NOTE(review): extraction gaps — the braces and any surrounding
 * statements of this function are not visible in this view.
 */
1352 reload the nodes file
1354 static void reload_nodes_file(struct ctdb_context *ctdb)
1357 ctdb_load_nodes_file(ctdb);
/*
 * Refresh the known/available public-IP lists for every node in nodemap.
 *
 * For each non-inactive node this frees any cached lists and fetches two
 * fresh ones from the node: the "known" public IPs and (with
 * CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) the "available" ones.  On any
 * failure *culprit is set to the offending node's pnn so the caller can
 * assign blame.  When do_checkpublicip is on and IP checking has not been
 * administratively disabled, an inconsistent allocation marks
 * rec->need_takeover_run so a takeover run happens later.
 *
 * NOTE(review): lossy extraction — error returns / continue statements
 * between the visible lines are missing from this view.
 */
1360 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1361 struct ctdb_recoverd *rec,
1362 struct ctdb_node_map *nodemap,
/* sanity check: the daemon's node list and the nodemap must agree */
1368 if (ctdb->num_nodes != nodemap->num) {
1369 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1370 ctdb->num_nodes, nodemap->num));
/* blame ourselves for the bad parameters */
1372 *culprit = ctdb->pnn;
1377 for (j=0; j<nodemap->num; j++) {
1378 /* release any existing data */
1379 if (ctdb->nodes[j]->known_public_ips) {
1380 talloc_free(ctdb->nodes[j]->known_public_ips);
1381 ctdb->nodes[j]->known_public_ips = NULL;
1383 if (ctdb->nodes[j]->available_public_ips) {
1384 talloc_free(ctdb->nodes[j]->available_public_ips);
1385 ctdb->nodes[j]->available_public_ips = NULL;
/* inactive (banned/stopped/disconnected) nodes are skipped */
1388 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1392 /* grab a new shiny list of public ips from the node */
1393 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1395 ctdb->nodes[j]->pnn,
1398 &ctdb->nodes[j]->known_public_ips);
1400 DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
1401 ctdb->nodes[j]->pnn));
1403 *culprit = ctdb->nodes[j]->pnn;
/* only verify allocations when checking is enabled and not disabled by admin */
1408 if (ctdb->do_checkpublicip) {
1409 if (rec->ip_check_disable_ctx == NULL) {
1410 if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
1411 DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
1412 rec->need_takeover_run = true;
1417 /* grab a new shiny list of public ips from the node */
1418 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1420 ctdb->nodes[j]->pnn,
1422 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1423 &ctdb->nodes[j]->available_public_ips);
1425 DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
1426 ctdb->nodes[j]->pnn));
1428 *culprit = ctdb->nodes[j]->pnn;
1437 /* when we start a recovery, make sure all nodes use the same reclock file
/*
 * Broadcast this node's recovery_lock_file setting to all active nodes
 * via CTDB_CONTROL_SET_RECLOCK_FILE so the whole cluster agrees on one
 * reclock file.  Visible early-out when no reclock file is configured.
 * NOTE(review): lossy extraction — return statements between the
 * visible lines are missing from this view.
 */
1440 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1442 struct ctdb_context *ctdb = rec->ctdb;
1443 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1447 if (ctdb->recovery_lock_file == NULL) {
/* include the terminating NUL so the receiver gets a proper C string */
1451 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1452 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1455 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1456 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1462 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1463 talloc_free(tmp_ctx);
1467 talloc_free(tmp_ctx);
/*
 * Main recovery routine, run only on the recovery master when recovery
 * is needed.  Visible phases, in order:
 *   1. ban any node whose ban_state->count shows repeated culprit blame;
 *   2. optionally take the recovery lock (verify_recovery_lock tunable);
 *   3. get the db map, create missing local/remote databases, sync db
 *      priorities and the reclock file setting;
 *   4. set recovery mode ACTIVE and run the "startrecovery" event;
 *   5. push our node flags, bump the generation locally only (so an
 *      aborted recovery re-triggers), start a cluster-wide transaction;
 *   6. recover every database via recover_database(), commit;
 *   7. rebuild the vnnmap from active lmaster-capable nodes, push it and
 *      the recmaster setting to all nodes, push flags again;
 *   8. set recovery mode NORMAL, reload remote public IPs, run
 *      ctdb_takeover_run(), fire the "recovered" event, broadcast
 *      CTDB_SRVID_RECONFIGURE, reset all ban counts and enforce the
 *      rerecovery_timeout before allowing another recovery.
 *
 * NOTE(review): this is a lossy extraction — error-handling returns,
 * `continue`s and closing braces between the visible lines are missing,
 * so the control flow shown here is incomplete.
 */
1473 we are the recmaster, and recovery is needed - start a recovery run
1475 static int do_recovery(struct ctdb_recoverd *rec,
1476 TALLOC_CTX *mem_ctx, uint32_t pnn,
1477 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1479 struct ctdb_context *ctdb = rec->ctdb;
1481 uint32_t generation;
1482 struct ctdb_dbid_map *dbmap;
1485 struct timeval start_time;
1486 uint32_t culprit = (uint32_t)-1;
1488 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1490 /* if recovery fails, force it again */
1491 rec->need_recovery = true;
/* ban any node blamed for more than 2*num_nodes recent recoveries */
1493 for (i=0; i<ctdb->num_nodes; i++) {
1494 struct ctdb_banning_state *ban_state;
1496 if (ctdb->nodes[i]->ban_state == NULL) {
1499 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1500 if (ban_state->count < 2*ctdb->num_nodes) {
1503 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1504 ctdb->nodes[i]->pnn, ban_state->count,
1505 ctdb->tunable.recovery_ban_period));
1506 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1507 ban_state->count = 0;
/* grab the recovery lock; failing to get it bans ourselves */
1511 if (ctdb->tunable.verify_recovery_lock != 0) {
1512 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1513 start_time = timeval_current();
1514 if (!ctdb_recovery_lock(ctdb, true)) {
1515 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1516 "and ban ourself for %u seconds\n",
1517 ctdb->tunable.recovery_ban_period));
1518 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1521 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1522 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1525 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1527 /* get a list of all databases */
1528 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1530 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1534 /* we do the db creation before we set the recovery mode, so the freeze happens
1535 on all databases we will be dealing with. */
1537 /* verify that we have all the databases any other node has */
1538 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1540 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1544 /* verify that all other nodes have all our databases */
1545 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1547 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1550 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1552 /* update the database priority for all remote databases */
1553 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1555 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1557 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1560 /* update all other nodes to use the same setting for reclock files
1561 as the local recovery master.
/* result deliberately ignored here — reclock sync failure is non-fatal */
1563 sync_recovery_lock_file_across_cluster(rec);
1565 /* set recovery mode to active on all nodes */
1566 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1568 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1572 /* execute the "startrecovery" event script on all nodes */
1573 ret = run_startrecovery_eventscript(rec, nodemap);
1575 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1580 update all nodes to have the same flags that we have
1582 for (i=0;i<nodemap->num;i++) {
1583 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1587 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1589 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1594 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1596 /* pick a new generation number */
1597 generation = new_generation();
1599 /* change the vnnmap on this node to use the new generation
1600 number but not on any other nodes.
1601 this guarantees that if we abort the recovery prematurely
1602 for some reason (a node stops responding?)
1603 that we can just return immediately and we will reenter
1604 recovery shortly again.
1605 I.e. we deliberately leave the cluster with an inconsistent
1606 generation id to allow us to abort recovery at any stage and
1607 just restart it from scratch.
1609 vnnmap->generation = generation;
1610 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1612 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* the new generation id is the payload for the transaction controls */
1616 data.dptr = (void *)&generation;
1617 data.dsize = sizeof(uint32_t);
1619 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1620 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1622 CONTROL_TIMEOUT(), false, data,
1624 transaction_start_fail_callback,
1626 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
/* best-effort cancel of any transactions that did start */
1627 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1629 CONTROL_TIMEOUT(), false, tdb_null,
1633 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1638 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* recover every database inside the cluster-wide transaction */
1640 for (i=0;i<dbmap->num;i++) {
1641 ret = recover_database(rec, mem_ctx,
1643 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1644 pnn, nodemap, generation);
1646 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1651 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1653 /* commit all the changes */
1654 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1656 CONTROL_TIMEOUT(), false, data,
1659 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1663 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1666 /* update the capabilities for all nodes */
1667 ret = update_capabilities(ctdb, nodemap);
1669 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1673 /* build a new vnn map with all the currently active and
/* new generation again so the committed map is distinct from the interim one */
1675 generation = new_generation();
1676 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1677 CTDB_NO_MEMORY(ctdb, vnnmap);
1678 vnnmap->generation = generation;
1680 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1681 CTDB_NO_MEMORY(ctdb, vnnmap->map);
/* j counts accepted lmaster-capable, active nodes */
1682 for (i=j=0;i<nodemap->num;i++) {
1683 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1686 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1687 /* this node can not be an lmaster */
1688 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1693 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1694 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1695 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* degenerate case: no lmaster-capable node — fall back to ourselves */
1698 if (vnnmap->size == 0) {
1699 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1701 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1702 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1703 vnnmap->map[0] = pnn;
1706 /* update to the new vnnmap on all nodes */
1707 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1709 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1713 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1715 /* update recmaster to point to us for all nodes */
1716 ret = set_recovery_master(ctdb, nodemap, pnn);
1718 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1722 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1725 update all nodes to have the same flags that we have
1727 for (i=0;i<nodemap->num;i++) {
1728 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1732 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1734 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1739 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1741 /* disable recovery mode */
1742 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1744 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1748 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1751 tell nodes to takeover their public IPs
1753 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1755 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1757 rec->need_takeover_run = true;
/* clear the flag first; a failed run below re-sets it */
1760 rec->need_takeover_run = false;
1761 ret = ctdb_takeover_run(ctdb, nodemap);
1763 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
1764 rec->need_takeover_run = true;
1767 /* execute the "recovered" event script on all nodes */
1768 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1770 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1774 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1776 /* send a message to all clients telling them that the cluster
1777 has been reconfigured */
1778 ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1780 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1782 rec->need_recovery = false;
1784 /* we managed to complete a full recovery, make sure to forgive
1785 any past sins by the nodes that could now participate in the
1788 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1789 for (i=0;i<nodemap->num;i++) {
1790 struct ctdb_banning_state *ban_state;
1792 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1796 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1797 if (ban_state == NULL) {
1801 ban_state->count = 0;
1805 /* We just finished a recovery successfully.
1806 We now wait for rerecovery_timeout before we allow
1807 another recovery to take place.
1809 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1810 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1811 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
/*
 * Election payload exchanged between recovery daemons.  Winner is decided
 * (see ctdb_election_win) by, in order: most connected nodes, then
 * earliest priority_time (longest-running daemon), then lowest pnn.
 */
1818 elections are won by first checking the number of connected nodes, then
1819 the priority time, then the pnn
1821 struct election_message {
1822 uint32_t num_connected;
1823 struct timeval priority_time;
1825 uint32_t node_flags;
/*
 * Fill *em with this node's election credentials: pnn, daemon start time
 * (priority_time), current node flags and the count of connected nodes.
 * A node without the RECMASTER capability zeroes num_connected and resets
 * priority_time to "now" so it effectively forfeits the election.
 * NOTE(review): lossy extraction — some statements/braces between the
 * visible lines are missing from this view.
 */
1829 form this nodes election data
1831 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1834 struct ctdb_node_map *nodemap;
1835 struct ctdb_context *ctdb = rec->ctdb;
1839 em->pnn = rec->ctdb->pnn;
1840 em->priority_time = rec->priority_time;
1842 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1844 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
/* cache our own flags; ctdb_election_win consults rec->node_flags */
1848 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1849 em->node_flags = rec->node_flags;
1851 for (i=0;i<nodemap->num;i++) {
1852 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1853 em->num_connected++;
1857 /* we shouldn't try to win this election if we can't be a recmaster */
1858 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1859 em->num_connected = 0;
1860 em->priority_time = timeval_current();
1863 talloc_free(nodemap);
/*
 * Decide whether WE beat the sender of election message *em.
 * We lose outright if we lack CTDB_CAP_RECMASTER or are banned/stopped;
 * we win outright if the sender is banned or stopped.  Otherwise compare:
 * more connected nodes wins, then earlier priority_time, then lower pnn.
 * NOTE(review): lossy extraction — the return statements between the
 * visible comparisons are missing from this view.
 */
1869 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1871 struct election_message myem;
1874 ctdb_election_data(rec, &myem);
1876 /* we can't win if we don't have the recmaster capability */
1877 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1881 /* we can't win if we are banned */
1882 if (rec->node_flags & NODE_FLAGS_BANNED) {
1886 /* we can't win if we are stopped */
1887 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1891 /* we will automatically win if the other node is banned */
1892 if (em->node_flags & NODE_FLAGS_BANNED) {
1896 /* we will automatically win if the other node is stopped */
1897 if (em->node_flags & NODE_FLAGS_STOPPED) {
1901 /* try to use the most connected node */
1903 cmp = (int)myem.num_connected - (int)em->num_connected;
1906 /* then the longest running node */
1908 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break: lower pnn wins */
1912 cmp = (int)myem.pnn - (int)em->pnn;
/*
 * Broadcast our election credentials to all nodes on the
 * CTDB_SRVID_RECOVERY channel.  When update_recmaster is true we
 * optimistically set ourselves as recmaster on the local node straight
 * away (we assume we will win).
 * NOTE(review): lossy extraction — some statements between the visible
 * lines are missing from this view.
 */
1919 send out an election request
1921 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1924 TDB_DATA election_data;
1925 struct election_message emsg;
1927 struct ctdb_context *ctdb = rec->ctdb;
1929 srvid = CTDB_SRVID_RECOVERY;
1931 ctdb_election_data(rec, &emsg);
1933 election_data.dsize = sizeof(struct election_message);
1934 election_data.dptr = (unsigned char *)&emsg;
1937 /* send an election message to all active nodes */
1938 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1939 ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1942 /* A new node that is already frozen has entered the cluster.
1943 The existing nodes are not frozen and dont need to be frozen
1944 until the election has ended and we start the actual recovery
1946 if (update_recmaster == true) {
1947 /* first we assume we will win the election and set
1948 recoverymaster to be ourself on the current node
1950 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1952 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
/*
 * Clear the BANNED flag on every node that is connected but banned,
 * by sending a MODIFY_FLAGS control to each such node.
 */
1962 this function will unban all nodes in the cluster
1964 static void unban_all_nodes(struct ctdb_context *ctdb)
1967 struct ctdb_node_map *nodemap;
1968 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1970 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1972 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1976 for (i=0;i<nodemap->num;i++) {
1977 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1978 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
/* set-mask 0, clear-mask BANNED: remove the ban on that node */
1979 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1983 talloc_free(tmp_ctx);
/*
 * Timed-event callback: we believe we are winning the election, so
 * broadcast another election request (without touching the recmaster
 * setting) and drop the one-shot timer that scheduled us.
 */
1988 we think we are winning the election - send a broadcast election request
1990 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1992 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1995 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1997 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
/* this was a one-shot timer; free it so it can be re-armed later */
2000 talloc_free(rec->send_election_te);
2001 rec->send_election_te = NULL;
/*
 * Message handler: a client asked the recovery master for a memory dump.
 * data carries a struct rd_memdump_reply telling us which pnn/srvid to
 * send the talloc report back to.  All temporaries hang off tmp_ctx and
 * are freed on every visible exit path.
 * NOTE(review): lossy extraction — returns between the visible lines are
 * missing from this view.
 */
2005 handler for memory dumps
2007 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2008 TDB_DATA data, void *private_data)
2010 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2013 struct rd_memdump_reply *rd;
/* validate payload size before casting */
2015 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2016 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2017 talloc_free(tmp_ctx);
2020 rd = (struct rd_memdump_reply *)data.dptr;
2022 dump = talloc_zero(tmp_ctx, TDB_DATA);
2024 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2025 talloc_free(tmp_ctx);
2028 ret = ctdb_dump_memory(ctdb, dump);
2030 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2031 talloc_free(tmp_ctx);
2035 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* reply directly to the requester's registered srvid */
2037 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2039 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2040 talloc_free(tmp_ctx);
2044 talloc_free(tmp_ctx);
/*
 * Message handler: re-read the nodes file on request
 * (delegates to reload_nodes_file()).
 */
2048 handler for reload_nodes
2050 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2051 TDB_DATA data, void *private_data)
2053 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2055 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2057 reload_nodes_file(rec->ctdb);
/*
 * Timed-event callback: the "disable IP check" window has expired.
 * Freeing ip_check_disable_ctx re-enables IP verification (see
 * ctdb_reload_remote_public_ips, which checks it for NULL).
 */
2061 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
2062 struct timeval yt, void *p)
2064 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2066 talloc_free(rec->ip_check_disable_ctx);
2067 rec->ip_check_disable_ctx = NULL;
/*
 * Timed-event callback: the deferred-rebalance delay has elapsed, so run
 * a takeover run now to redistribute public IPs.  On failure the
 * need_takeover_run flag is set so the monitor loop retries.  The
 * deferral context is freed either way.
 */
2071 static void ctdb_rebalance_timeout(struct event_context *ev, struct timed_event *te,
2072 struct timeval t, void *p)
2074 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2075 struct ctdb_context *ctdb = rec->ctdb;
2078 DEBUG(DEBUG_NOTICE,("Rebalance all nodes that have had ip assignment changes.\n"));
2080 ret = ctdb_takeover_run(ctdb, rec->nodemap);
2082 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
2083 rec->need_takeover_run = true;
2086 talloc_free(rec->deferred_rebalance_ctx);
2087 rec->deferred_rebalance_ctx = NULL;
/*
 * Message handler: a node (pnn in the payload) wants IP rebalancing.
 * Forces LCP2 rebalancing for that node, then (re)arms a deferred timer
 * of deferred_rebalance_on_node_add seconds that triggers
 * ctdb_rebalance_timeout; re-arming replaces any pending timer.  A
 * tunable value of 0 disables the deferral entirely.
 */
2091 static void recd_node_rebalance_handler(struct ctdb_context *ctdb, uint64_t srvid,
2092 TDB_DATA data, void *private_data)
2095 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2097 if (data.dsize != sizeof(uint32_t)) {
2098 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2102 if (ctdb->tunable.deferred_rebalance_on_node_add == 0) {
2106 pnn = *(uint32_t *)&data.dptr[0];
2108 lcp2_forcerebalance(ctdb, pnn);
2109 DEBUG(DEBUG_NOTICE,("Received message to perform node rebalancing for node %d\n", pnn));
/* freeing the old context cancels any timer parented on it */
2111 if (rec->deferred_rebalance_ctx != NULL) {
2112 talloc_free(rec->deferred_rebalance_ctx);
2114 rec->deferred_rebalance_ctx = talloc_new(rec);
2115 event_add_timed(ctdb->ev, rec->deferred_rebalance_ctx,
2116 timeval_current_ofs(ctdb->tunable.deferred_rebalance_on_node_add, 0),
2117 ctdb_rebalance_timeout, rec);
/*
 * Message handler: a node reports an updated public-IP assignment.
 * Only the recovery master acts on it; it records the assignment in
 * the IP assignment tree.  Payload must be a struct ctdb_public_ip.
 */
2122 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2123 TDB_DATA data, void *private_data)
2125 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2126 struct ctdb_public_ip *ip;
2128 if (rec->recmaster != rec->ctdb->pnn) {
2129 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2133 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2134 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2138 ip = (struct ctdb_public_ip *)data.dptr;
2140 update_ip_assignment_tree(rec->ctdb, ip);
/*
 * Message handler: administratively disable public-IP verification for
 * `timeout` seconds (uint32_t payload).  Any previous disable window is
 * cancelled first; a fresh context is allocated and a timer parented on
 * it re-enables the check via reenable_ip_check when it fires.
 * NOTE(review): lossy extraction — returns after the error DEBUGs are
 * missing from this view.  The misspellings in the log strings
 * ("expexting", "recaived") are runtime strings and left untouched here.
 */
2144 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2145 TDB_DATA data, void *private_data)
2147 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* cancel any existing disable window before starting a new one */
2150 if (rec->ip_check_disable_ctx != NULL) {
2151 talloc_free(rec->ip_check_disable_ctx);
2152 rec->ip_check_disable_ctx = NULL;
2155 if (data.dsize != sizeof(uint32_t)) {
2156 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2157 "expexting %lu\n", (long unsigned)data.dsize,
2158 (long unsigned)sizeof(uint32_t)));
2161 if (data.dptr == NULL) {
2162 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
2166 timeout = *((uint32_t *)data.dptr);
2167 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
2169 rec->ip_check_disable_ctx = talloc_new(rec);
2170 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
2172 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
/*
 * Message handler for "ctdb ipreallocate" requests.  Rather than running
 * a takeover immediately (which could recurse with other callers of
 * takeover_run()), the requester's reply address (rd_memdump_reply) is
 * queued on rec->reallocate_callers; the monitor_cluster loop processes
 * the queue in process_ipreallocate_requests().
 */
2177 handler for ip reallocate, just add it to the list of callers and
2178 handle this later in the monitor_cluster loop so we do not recurse
2179 with other callers to takeover_run()
2181 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2182 TDB_DATA data, void *private_data)
2184 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2185 struct ip_reallocate_list *caller;
2187 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2188 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
/* lazily create the context that owns all queued callers */
2192 if (rec->ip_reallocate_ctx == NULL) {
2193 rec->ip_reallocate_ctx = talloc_new(rec);
2194 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
2197 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
2198 CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* take ownership of the payload and push the caller onto the list head */
2200 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
2201 caller->next = rec->reallocate_callers;
2202 rec->reallocate_callers = caller;
/*
 * Drain the queue built by ip_reallocate_handler: refresh the remote
 * public-IP lists, run a takeover run, then send the int32 result back
 * to every queued caller (callers with srvid==0 opted out of a reply).
 * Finally free the queue context and reset the list.
 * NOTE(review): lossy extraction — `continue`/closing braces between the
 * visible lines are missing from this view.
 */
2207 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
2209 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2212 struct ip_reallocate_list *callers;
2215 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2217 /* update the list of public ips that a node can handle for
2220 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2222 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2224 rec->need_takeover_run = true;
2227 ret = ctdb_takeover_run(ctdb, rec->nodemap);
2229 DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
2230 rec->need_takeover_run = true;
/* the takeover-run result is the reply payload for every caller */
2234 result.dsize = sizeof(int32_t);
2235 result.dptr = (uint8_t *)&ret;
2237 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2239 /* Someone that sent srvid==0 does not want a reply */
2240 if (callers->rd->srvid == 0) {
2243 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2244 "%u:%llu\n", (unsigned)callers->rd->pnn,
2245 (unsigned long long)callers->rd->srvid));
2246 ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2248 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2249 "message to %u:%llu\n",
2250 (unsigned)callers->rd->pnn,
2251 (unsigned long long)callers->rd->srvid));
2255 talloc_free(tmp_ctx);
/* freeing the context frees every queued caller in one go */
2256 talloc_free(rec->ip_reallocate_ctx);
2257 rec->ip_reallocate_ctx = NULL;
2258 rec->reallocate_callers = NULL;
/*
 * Message handler for incoming election packets.  Resets the election
 * timeout, then compares credentials with ctdb_election_win():
 *  - if we win, (re)arm a 0.5s timer that rebroadcasts our own request
 *    (election_send_request) and stop here;
 *  - if we lose, cancel any pending rebroadcast, release the recovery
 *    lock if we hold it (and unban everyone, since the old recmaster is
 *    stepping down), and set the sender as recmaster on the local node.
 * NOTE(review): lossy extraction — several statements between the
 * visible lines are missing from this view.
 */
2264 handler for recovery master elections
2266 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2267 TDB_DATA data, void *private_data)
2269 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2271 struct election_message *em = (struct election_message *)data.dptr;
2272 TALLOC_CTX *mem_ctx;
2274 /* we got an election packet - update the timeout for the election */
2275 talloc_free(rec->election_timeout);
2276 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2278 timeval_current_ofs(0, 500000) :
2279 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2280 ctdb_election_timeout, rec);
2282 mem_ctx = talloc_new(ctdb);
2284 /* someone called an election. check their election data
2285 and if we disagree and we would rather be the elected node,
2286 send a new election message to all other nodes
2288 if (ctdb_election_win(rec, em)) {
2289 if (!rec->send_election_te) {
2290 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2291 timeval_current_ofs(0, 500000),
2292 election_send_request, rec);
2294 talloc_free(mem_ctx);
2295 /*unban_all_nodes(ctdb);*/
/* we lost: stop rebroadcasting our own candidacy */
2300 talloc_free(rec->send_election_te);
2301 rec->send_election_te = NULL;
2303 if (ctdb->tunable.verify_recovery_lock != 0) {
2304 /* release the recmaster lock */
2305 if (em->pnn != ctdb->pnn &&
2306 ctdb->recovery_lock_fd != -1) {
2307 close(ctdb->recovery_lock_fd);
2308 ctdb->recovery_lock_fd = -1;
2309 unban_all_nodes(ctdb);
2313 /* ok, let that guy become recmaster then */
2314 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2316 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2317 talloc_free(mem_ctx);
2321 talloc_free(mem_ctx);
/*
 * Kick off a recmaster election: freeze the cluster by setting recovery
 * mode ACTIVE everywhere, arm the election timeout, broadcast our
 * election request (optimistically claiming recmaster locally), then
 * block in ctdb_wait_election() to collect responses.
 */
2327 force the start of the election process
2329 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2330 struct ctdb_node_map *nodemap)
2333 struct ctdb_context *ctdb = rec->ctdb;
2335 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2337 /* set all nodes to recovery mode to stop all internode traffic */
2338 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2340 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (re)arm the election timeout before soliciting votes */
2344 talloc_free(rec->election_timeout);
2345 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2347 timeval_current_ofs(0, 500000) :
2348 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2349 ctdb_election_timeout, rec);
2351 ret = send_election_request(rec, pnn, true);
2353 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2357 /* wait for a few seconds to collect all responses */
2358 ctdb_wait_election(rec);
/*
 * Message handler: a node changed its flags (ctdb_node_flag_change
 * payload).  Looks the node up in a fresh nodemap, records the new
 * flags, and — if we are the recmaster and the cluster is in NORMAL
 * mode — schedules a takeover run when the DISABLED bit changed (a flag
 * change that shifts IPs without triggering a full recovery).
 * NOTE(review): lossy extraction — some statements between the visible
 * lines are missing from this view.
 */
2364 handler for when a node changes its flags
2366 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2367 TDB_DATA data, void *private_data)
2370 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2371 struct ctdb_node_map *nodemap=NULL;
2372 TALLOC_CTX *tmp_ctx;
2374 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2375 int disabled_flag_changed;
2377 if (data.dsize != sizeof(*c)) {
2378 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2382 tmp_ctx = talloc_new(ctdb);
2383 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2385 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2387 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2388 talloc_free(tmp_ctx);
/* find the changed node's slot in the nodemap */
2393 for (i=0;i<nodemap->num;i++) {
2394 if (nodemap->nodes[i].pnn == c->pnn) break;
2397 if (i == nodemap->num) {
2398 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2399 talloc_free(tmp_ctx);
2403 if (nodemap->nodes[i].flags != c->new_flags) {
2404 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* XOR isolates bits that actually changed; mask keeps only DISABLED */
2407 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2409 nodemap->nodes[i].flags = c->new_flags;
2411 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2412 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2415 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2416 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2420 ctdb->recovery_master == ctdb->pnn &&
2421 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2422 /* Only do the takeover run if the perm disabled or unhealthy
2423 flags changed since these will cause an ip failover but not
2425 If the node became disconnected or banned this will also
2426 lead to an ip address failover but that is handled
2429 if (disabled_flag_changed) {
2430 rec->need_takeover_run = true;
2434 talloc_free(tmp_ctx);
/*
 * Message handler: push a node's flag change out to every connected
 * node.  Reads the authoritative nodemap from the current recmaster,
 * validates the target pnn, then broadcasts CTDB_CONTROL_MODIFY_FLAGS.
 * NOTE(review): lossy extraction — returns between the visible lines
 * are missing from this view.
 */
2438 handler for when we need to push out flag changes ot all other nodes
2440 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2441 TDB_DATA data, void *private_data)
2444 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2445 struct ctdb_node_map *nodemap=NULL;
2446 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2450 /* find the recovery master */
2451 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2453 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2454 talloc_free(tmp_ctx);
2458 /* read the node flags from the recmaster */
2459 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2461 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2462 talloc_free(tmp_ctx);
2465 if (c->pnn >= nodemap->num) {
2466 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2467 talloc_free(tmp_ctx);
2471 /* send the flags update to all connected nodes */
2472 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2474 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2475 nodes, 0, CONTROL_TIMEOUT(),
2479 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2481 talloc_free(tmp_ctx);
2485 talloc_free(tmp_ctx);
/* shared state for the async recmode verification fan-out:
 * aggregated result consumed once all replies are in */
2489 struct verify_recmode_normal_data {
2491 enum monitor_result status;
/*
 * Async completion callback for one node's GET_RECMODE reply.
 * A transport failure downgrades status OK -> FAILED (so the main loop
 * retries); a node reporting anything other than RECOVERY_NORMAL sets
 * MONITOR_RECOVERY_NEEDED.  NOTE(review): the counter decrement that the
 * "one more node has responded" comment refers to is among the lines
 * missing from this lossy extraction.
 */
2494 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2496 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2499 /* one more node has responded with recmode data*/
2502 /* if we failed to get the recmode, then return an error and let
2503 the main loop try again.
2505 if (state->state != CTDB_CONTROL_DONE) {
2506 if (rmdata->status == MONITOR_OK) {
2507 rmdata->status = MONITOR_FAILED;
2512 /* if we got a response, then the recmode will be stored in the
2515 if (state->status != CTDB_RECOVERY_NORMAL) {
2516 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2517 rmdata->status = MONITOR_RECOVERY_NEEDED;
2524 /* verify that all nodes are in normal recovery mode */
/* Sends an async getrecmode control to every active node, then pumps the
   event loop until all replies (or timeouts) have been collected.
   Returns MONITOR_OK, MONITOR_FAILED or MONITOR_RECOVERY_NEEDED.
   All per-call allocations hang off mem_ctx and are freed on every path. */
2525 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2527 	struct verify_recmode_normal_data *rmdata;
2528 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2529 	struct ctdb_client_control_state *state;
2530 	enum monitor_result status;
2533 	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2534 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2536 	rmdata->status = MONITOR_OK;
2538 	/* loop over all active nodes and send an async getrecmode call to
	/* inactive (banned/stopped/disconnected) nodes are not queried */
2540 	for (j=0; j<nodemap->num; j++) {
2541 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2544 		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2546 					nodemap->nodes[j].pnn);
2547 		if (state == NULL) {
2548 			/* we failed to send the control, treat this as
2549 			   an error and try again next iteration
2551 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2552 			talloc_free(mem_ctx);
2553 			return MONITOR_FAILED;
2556 		/* set up the callback functions */
2557 		state->async.fn = verify_recmode_normal_callback;
2558 		state->async.private_data = rmdata;
2560 		/* one more control to wait for to complete */
2565 	/* now wait for up to the maximum number of seconds allowed
2566 	   or until all nodes we expect a response from has replied
	/* rmdata->count is decremented by the callbacks; the control
	   timeout bounds how long this loop can spin */
2568 	while (rmdata->count > 0) {
2569 		event_loop_once(ctdb->ev);
2572 	status = rmdata->status;
2573 	talloc_free(mem_ctx);
/* Shared state for the async getrecmaster controls sent by
   verify_recmaster(); "pnn" is the recmaster pnn every node is expected
   to report (referenced from the callback; declaration elided here). */
2578 struct verify_recmaster_data {
2579 	struct ctdb_recoverd *rec;
2582 	enum monitor_result status;
/* Completion callback for one getrecmaster control sent by
   verify_recmaster().  Flags MONITOR_FAILED on transport failure, or
   MONITOR_ELECTION_NEEDED (and marks the node as culprit) if the remote
   node disagrees about who the recmaster is. */
2585 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2587 	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2590 	/* one more node has responded with recmaster data*/
2593 	/* if we failed to get the recmaster, then return an error and let
2594 	   the main loop try again.
	/* only overwrite MONITOR_OK so a more specific status survives */
2596 	if (state->state != CTDB_CONTROL_DONE) {
2597 		if (rmdata->status == MONITOR_OK) {
2598 			rmdata->status = MONITOR_FAILED;
2603 	/* if we got a response, then the recmaster will be stored in the
	/* a node reporting a different recmaster triggers a new election */
2606 	if (state->status != rmdata->pnn) {
2607 		DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2608 		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2609 		rmdata->status = MONITOR_ELECTION_NEEDED;
2616 /* verify that all nodes agree that we are the recmaster */
/* Sends an async getrecmaster control to every active node and waits for
   all replies.  Returns MONITOR_OK, MONITOR_FAILED or
   MONITOR_ELECTION_NEEDED.  Mirrors verify_recmode() structurally. */
2617 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2619 	struct ctdb_context *ctdb = rec->ctdb;
2620 	struct verify_recmaster_data *rmdata;
2621 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2622 	struct ctdb_client_control_state *state;
2623 	enum monitor_result status;
2626 	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2627 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2631 	rmdata->status = MONITOR_OK;
2633 	/* loop over all active nodes and send an async getrecmaster call to
	/* skip inactive (banned/stopped/disconnected) nodes */
2635 	for (j=0; j<nodemap->num; j++) {
2636 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2639 		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2641 					nodemap->nodes[j].pnn);
2642 		if (state == NULL) {
2643 			/* we failed to send the control, treat this as
2644 			   an error and try again next iteration
2646 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2647 			talloc_free(mem_ctx);
2648 			return MONITOR_FAILED;
2651 		/* set up the callback functions */
2652 		state->async.fn = verify_recmaster_callback;
2653 		state->async.private_data = rmdata;
2655 		/* one more control to wait for to complete */
2660 	/* now wait for up to the maximum number of seconds allowed
2661 	   or until all nodes we expect a response from has replied
	/* callbacks decrement rmdata->count; controls time out eventually */
2663 	while (rmdata->count > 0) {
2664 		event_loop_once(ctdb->ev);
2667 	status = rmdata->status;
2668 	talloc_free(mem_ctx);
2673 /* called to check that the local allocation of public ip addresses is ok.
/* Compares the local node's actual interface/public-IP state against what
   the cluster thinks it should be.  If they diverge (interfaces changed,
   an unassigned IP we could host, a missing or extra hosted IP), asks the
   recmaster for a takeover run via CTDB_SRVID_TAKEOVER_RUN.
   Uptime is sampled before and after the IP query so the check can be
   skipped if a recovery started/finished while we were reading. */
2675 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
2677 	TALLOC_CTX *mem_ctx = talloc_new(NULL);
2678 	struct ctdb_control_get_ifaces *ifaces = NULL;
2679 	struct ctdb_all_public_ips *ips = NULL;
2680 	struct ctdb_uptime *uptime1 = NULL;
2681 	struct ctdb_uptime *uptime2 = NULL;
2683 	bool need_iface_check = false;
2684 	bool need_takeover_run = false;
	/* first uptime sample - taken before the IP/interface queries */
2686 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2687 			       CTDB_CURRENT_NODE, &uptime1);
2689 		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2690 		talloc_free(mem_ctx);
2695 	/* read the interfaces from the local node */
2696 	ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
2698 		DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
2699 		talloc_free(mem_ctx);
	/* any change to the cached interface list forces a takeover run;
	   the memcmp relies on the talloc'd blob being directly comparable */
2704 		need_iface_check = true;
2705 	} else if (rec->ifaces->num != ifaces->num) {
2706 		need_iface_check = true;
2707 	} else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
2708 		need_iface_check = true;
2711 	if (need_iface_check) {
2712 		DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
2713 				     "local node %u - force takeover run\n",
2715 		need_takeover_run = true;
2718 	/* read the ip allocation from the local node */
2719 	ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2721 		DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2722 		talloc_free(mem_ctx);
	/* second uptime sample - taken after the IP query */
2726 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2727 			       CTDB_CURRENT_NODE, &uptime2);
2729 		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2730 		talloc_free(mem_ctx);
2734 	/* skip the check if the startrecovery time has changed */
2735 	if (timeval_compare(&uptime1->last_recovery_started,
2736 			    &uptime2->last_recovery_started) != 0) {
2737 		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2738 		talloc_free(mem_ctx);
2742 	/* skip the check if the endrecovery time has changed */
2743 	if (timeval_compare(&uptime1->last_recovery_finished,
2744 			    &uptime2->last_recovery_finished) != 0) {
2745 		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2746 		talloc_free(mem_ctx);
2750 	/* skip the check if we have started but not finished recovery */
2751 	if (timeval_compare(&uptime1->last_recovery_finished,
2752 			    &uptime1->last_recovery_started) != 1) {
2753 		DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2754 		talloc_free(mem_ctx);
	/* update the cached interface list (owned by rec from here on) */
2759 	talloc_free(rec->ifaces);
2760 	rec->ifaces = talloc_steal(rec, ifaces);
2762 	/* verify that we have the ip addresses we should have
2763 	   and we dont have ones we shouldnt have.
2764 	   if we find an inconsistency we set recmode to
2765 	   active on the local node and wait for the recmaster
2766 	   to do a full blown recovery.
2767 	   also if the pnn is -1 and we are healthy and can host the ip
2768 	   we also request a ip reallocation.
2770 	if (ctdb->tunable.disable_ip_failover == 0) {
2771 		for (j=0; j<ips->num; j++) {
			/* flags == 0 means this node is fully healthy */
2772 			if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
2773 				DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
2774 					ctdb_addr_to_str(&ips->ips[j].addr)));
2775 				need_takeover_run = true;
2776 			} else if (ips->ips[j].pnn == pnn) {
2777 				if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
2778 					DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2779 						ctdb_addr_to_str(&ips->ips[j].addr)));
2780 					need_takeover_run = true;
2783 				if (ctdb->do_checkpublicip && ctdb_sys_have_ip(&ips->ips[j].addr)) {
2784 					DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2785 						ctdb_addr_to_str(&ips->ips[j].addr)));
2786 					need_takeover_run = true;
	/* ask the recmaster to perform a takeover run on our behalf */
2792 	if (need_takeover_run) {
2793 		struct takeover_run_reply rd;
2796 		DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
2800 		data.dptr = (uint8_t *)&rd;
2801 		data.dsize = sizeof(rd);
2803 		ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2805 			DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2808 	talloc_free(mem_ctx);
/* Async callback for CTDB_CONTROL_GET_NODEMAP: stores the nodemap
   returned by node "node_pnn" into the remote_nodemaps[] array passed
   as callback_data, taking talloc ownership of the reply buffer. */
2813 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2815 	struct ctdb_node_map **remote_nodemaps = callback_data;
	/* bounds-check the pnn before using it as an array index */
2817 	if (node_pnn >= ctdb->num_nodes) {
2818 		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2822 	remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Fetches the nodemap from every active node in parallel; results land in
   remote_nodemaps[] (indexed by pnn) via async_getnodemap_callback().
   Returns 0 on success, -1 if any node failed to reply. */
2826 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2827 	struct ctdb_node_map *nodemap,
2828 	struct ctdb_node_map **remote_nodemaps)
2832 	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2833 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2835 					CONTROL_TIMEOUT(), false, tdb_null,
2836 					async_getnodemap_callback,
2838 					remote_nodemaps) != 0) {
2839 		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
2847 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
/* Per-check state for the forked recovery-lock verifier: the child pid and
   pipe fds (declarations elided here), the timeout and fd events watching
   the pipe, and the overall status driven by the event handlers. */
2848 struct ctdb_check_reclock_state {
2849 	struct ctdb_context *ctdb;
2850 	struct timeval start_time;
2853 	struct timed_event *te;
2854 	struct fd_event *fde;
2855 	enum reclock_child_status status;
2858 /* when we free the reclock state we must kill any child process.
/* talloc destructor: reports the lock-check latency to the main daemon,
   closes both pipe ends and kills the child so it cannot outlive us. */
2860 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2862 	struct ctdb_context *ctdb = state->ctdb;
2864 	ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2866 	if (state->fd[0] != -1) {
2867 		close(state->fd[0]);
2870 	if (state->fd[1] != -1) {
2871 		close(state->fd[1]);
2874 	kill(state->child, SIGKILL);
2879   called if our check_reclock child times out. this would happen if
2880   i/o to the reclock file blocks.
/* Timed-event handler: marks the check as RECLOCK_TIMEOUT so the waiting
   loop in check_recovery_lock() can stop spinning. */
2882 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2883 				       struct timeval t, void *private_data)
2885 	struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2886 					   struct ctdb_check_reclock_state);
2888 	DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2889 	state->status = RECLOCK_TIMEOUT;
2892 /* this is called when the child process has completed checking the reclock
2893    file and has written data back to us through the pipe.
/* fd-event handler on the read end of the pipe: cancels the timeout event,
   reads the one status byte from the child and sets RECLOCK_OK or
   RECLOCK_FAILED accordingly. */
2895 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2896 			     uint16_t flags, void *private_data)
2898 	struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2899 					     struct ctdb_check_reclock_state);
2903 	/* we got a response from our child process so we can abort the
	/* freeing the timed event cancels the pending timeout */
2906 	talloc_free(state->te);
2909 	ret = read(state->fd[0], &c, 1);
2910 	if (ret != 1 || c != RECLOCK_OK) {
2911 		DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2912 		state->status = RECLOCK_FAILED;
2917 	state->status = RECLOCK_OK;
/* Verifies that the recovery lock we hold is still valid by forking a
   child that does a blocking pread() on the lock fd (CFS I/O may hang, so
   it must not run in the daemon itself).  The child reports one status
   byte back through a pipe; a 15-second timed event guards against a hung
   child.  Returns 0 if the lock checks out; on failure the lock fd is
   closed so a subsequent recovery re-takes the lock. */
2921 static int check_recovery_lock(struct ctdb_context *ctdb)
2924 	struct ctdb_check_reclock_state *state;
2925 	pid_t parent = getpid();
2927 	if (ctdb->recovery_lock_fd == -1) {
2928 		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2932 	state = talloc(ctdb, struct ctdb_check_reclock_state);
2933 	CTDB_NO_MEMORY(ctdb, state);
2936 	state->start_time = timeval_current();
2937 	state->status = RECLOCK_CHECKING;
2941 	ret = pipe(state->fd);
2944 		DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2948 	state->child = ctdb_fork(ctdb);
2949 	if (state->child == (pid_t)-1) {
2950 		DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2951 		close(state->fd[0]);
2953 		close(state->fd[1]);
	/* child: try to read one byte from the lock file and report back */
2959 	if (state->child == 0) {
2960 		char cc = RECLOCK_OK;
2961 		close(state->fd[0]);
2964 		debug_extra = talloc_asprintf(NULL, "recovery-lock:");
2965 		if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2966 			DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2967 			cc = RECLOCK_FAILED;
2970 		write(state->fd[1], &cc, 1);
2971 		/* make sure we die when our parent dies */
2972 		while (kill(parent, 0) == 0 || errno != ESRCH) {
2974 			write(state->fd[1], &cc, 1);
	/* parent: close the write end, watch the read end */
2978 	close(state->fd[1]);
2980 	set_close_on_exec(state->fd[0]);
2982 	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
	/* destructor kills the child and closes the pipe on any exit path */
2984 	talloc_set_destructor(state, check_reclock_destructor);
2986 	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2987 				    ctdb_check_reclock_timeout, state);
2988 	if (state->te == NULL) {
2989 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2994 	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2996 				reclock_child_handler,
2999 	if (state->fde == NULL) {
3000 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
3004 	tevent_fd_set_auto_close(state->fde);
	/* pump events until the child replies or the timeout fires */
3006 	while (state->status == RECLOCK_CHECKING) {
3007 		event_loop_once(ctdb->ev);
3010 	if (state->status == RECLOCK_FAILED) {
3011 		DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
3012 		close(ctdb->recovery_lock_fd);
3013 		ctdb->recovery_lock_fd = -1;
/* Re-reads the configured recovery-lock file path from the main daemon and
   reconciles the recovery daemon's cached copy: handles the lock being
   disabled, being set for the first time, or the path changing.  Any open
   lock fd tied to a stale path is closed.  Returns 0 on success. */
3022 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3024 	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3025 	const char *reclockfile;
3027 	if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3028 		DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3029 		talloc_free(tmp_ctx);
	/* case 1: reclock has been disabled in the daemon */
3033 	if (reclockfile == NULL) {
3034 		if (ctdb->recovery_lock_file != NULL) {
3035 			DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3036 			talloc_free(ctdb->recovery_lock_file);
3037 			ctdb->recovery_lock_file = NULL;
3038 			if (ctdb->recovery_lock_fd != -1) {
3039 				close(ctdb->recovery_lock_fd);
3040 				ctdb->recovery_lock_fd = -1;
3043 		ctdb->tunable.verify_recovery_lock = 0;
3044 		talloc_free(tmp_ctx);
	/* case 2: we had no reclock path cached yet - adopt the new one */
3048 	if (ctdb->recovery_lock_file == NULL) {
3049 		ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3050 		if (ctdb->recovery_lock_fd != -1) {
3051 			close(ctdb->recovery_lock_fd);
3052 			ctdb->recovery_lock_fd = -1;
3054 		talloc_free(tmp_ctx);
	/* case 3: path unchanged - nothing to do */
3059 	if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3060 		talloc_free(tmp_ctx);
	/* case 4: path changed - replace the cached copy and drop the fd */
3064 	talloc_free(ctdb->recovery_lock_file);
3065 	ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3066 	ctdb->tunable.verify_recovery_lock = 0;
3067 	if (ctdb->recovery_lock_fd != -1) {
3068 		close(ctdb->recovery_lock_fd);
3069 		ctdb->recovery_lock_fd = -1;
3072 	talloc_free(tmp_ctx);
/* One iteration of the recovery daemon's monitoring loop, called from
   monitor_cluster() roughly once per recover_interval.  Verifies daemon
   liveness, tunables, node/vnn maps, recmaster agreement, IP allocation
   and flag consistency, triggering elections or do_recovery() as needed.
   All transient allocations hang off the caller-supplied mem_ctx. */
3076 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3077 		      TALLOC_CTX *mem_ctx)
3080 	struct ctdb_node_map *nodemap=NULL;
3081 	struct ctdb_node_map *recmaster_nodemap=NULL;
3082 	struct ctdb_node_map **remote_nodemaps=NULL;
3083 	struct ctdb_vnn_map *vnnmap=NULL;
3084 	struct ctdb_vnn_map *remote_vnnmap=NULL;
3085 	int32_t debug_level;
3090 	/* verify that the main daemon is still running */
3091 	if (kill(ctdb->ctdbd_pid, 0) != 0) {
3092 		DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3096 	/* ping the local daemon to tell it we are alive */
3097 	ctdb_ctrl_recd_ping(ctdb);
3099 	if (rec->election_timeout) {
3100 		/* an election is in progress */
3104 	/* read the debug level from the parent and update locally */
3105 	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3107 		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3110 	LogLevel = debug_level;
3113 	/* We must check if we need to ban a node here but we want to do this
3114 	   as early as possible so we dont wait until we have pulled the node
3115 	   map from the local node. thats why we have the hardcoded value 20
3117 	for (i=0; i<ctdb->num_nodes; i++) {
3118 		struct ctdb_banning_state *ban_state;
3120 		if (ctdb->nodes[i]->ban_state == NULL) {
3123 		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
3124 		if (ban_state->count < 20) {
3127 		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
3128 			ctdb->nodes[i]->pnn, ban_state->count,
3129 			ctdb->tunable.recovery_ban_period));
3130 		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
3131 		ban_state->count = 0;
3134 	/* get relevant tunables */
3135 	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3137 		DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3141 	/* get the current recovery lock file from the server */
3142 	if (update_recovery_lock_file(ctdb) != 0) {
3143 		DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3147 	/* Make sure that if recovery lock verification becomes disabled when
3150 	if (ctdb->tunable.verify_recovery_lock == 0) {
3151 		if (ctdb->recovery_lock_fd != -1) {
3152 			close(ctdb->recovery_lock_fd);
3153 			ctdb->recovery_lock_fd = -1;
3157 	pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3158 	if (pnn == (uint32_t)-1) {
3159 		DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
3163 	/* get the vnnmap */
3164 	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3166 		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3171 	/* get number of nodes */
	/* the nodemap is cached on rec so other handlers can use it */
3173 		talloc_free(rec->nodemap);
3174 		rec->nodemap = NULL;
3177 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3179 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3182 	nodemap = rec->nodemap;
3184 	/* update the capabilities for all nodes */
3185 	ret = update_capabilities(ctdb, nodemap);
3187 		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3191 	/* check which node is the recovery master */
3192 	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3194 		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3198 	/* if we are not the recmaster we can safely ignore any ip reallocate requests */
3199 	if (rec->recmaster != pnn) {
3200 		if (rec->ip_reallocate_ctx != NULL) {
3201 			talloc_free(rec->ip_reallocate_ctx);
3202 			rec->ip_reallocate_ctx = NULL;
3203 			rec->reallocate_callers = NULL;
	/* no recmaster known yet - force an initial election */
3207 	if (rec->recmaster == (uint32_t)-1) {
3208 		DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3209 		force_election(rec, pnn, nodemap);
3213 	/* if the local daemon is STOPPED, we verify that the databases are
3214 	   also frozen and that the recmode is set to active
3216 	if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
3217 		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3219 			DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3221 		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3222 			DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
3224 			ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3226 				DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
3229 			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3231 				DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
3238 	/* If the local node is stopped, verify we are not the recmaster
3239 	   and yield this role if so
3241 	if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
3242 		DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
3243 		force_election(rec, pnn, nodemap);
3248 	 * if the current recmaster do not have CTDB_CAP_RECMASTER,
3249 	 * but we have force an election and try to become the new
3252 	if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3253 	    (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3254 	     !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3255 		DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3256 				  " but we (node %u) have - force an election\n",
3257 				  rec->recmaster, pnn));
3258 		force_election(rec, pnn, nodemap);
3262 	/* check that we (recovery daemon) and the local ctdb daemon
3263 	   agrees on whether we are banned or not
3267 	/* remember our own node flags */
3268 	rec->node_flags = nodemap->nodes[pnn].flags;
3270 	/* count how many active nodes there are */
3271 	rec->num_active    = 0;
3272 	rec->num_connected = 0;
3273 	for (i=0; i<nodemap->num; i++) {
3274 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3277 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3278 			rec->num_connected++;
3283 	/* verify that the recmaster node is still active */
3284 	for (j=0; j<nodemap->num; j++) {
3285 		if (nodemap->nodes[j].pnn==rec->recmaster) {
	/* j == nodemap->num means the recmaster was not found at all */
3290 	if (j == nodemap->num) {
3291 		DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3292 		force_election(rec, pnn, nodemap);
3296 	/* if recovery master is disconnected we must elect a new recmaster */
3297 	if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3298 		DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3299 		force_election(rec, pnn, nodemap);
3303 	/* grap the nodemap from the recovery master to check if it is banned */
3304 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3305 				   mem_ctx, &recmaster_nodemap);
3307 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3308 			  nodemap->nodes[j].pnn));
3313 	if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3314 		DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3315 		force_election(rec, pnn, nodemap);
3320 	/* verify that we have all ip addresses we should have and we dont
3321 	 * have addresses we shouldnt have.
	/* ip_check_disable_ctx non-NULL means the check is temporarily off */
3323 	if (ctdb->tunable.disable_ip_failover == 0) {
3324 		if (rec->ip_check_disable_ctx == NULL) {
3325 			if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3326 				DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3332 	/* if we are not the recmaster then we do not need to check
3333 	   if recovery is needed
	/* everything below this point is recmaster-only work */
3335 	if (pnn != rec->recmaster) {
3340 	/* ensure our local copies of flags are right */
3341 	ret = update_local_flags(rec, nodemap);
3342 	if (ret == MONITOR_ELECTION_NEEDED) {
3343 		DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3344 		force_election(rec, pnn, nodemap);
3347 	if (ret != MONITOR_OK) {
3348 		DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3352 	if (ctdb->num_nodes != nodemap->num) {
3353 		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3354 		reload_nodes_file(ctdb);
3358 	/* verify that all active nodes agree that we are the recmaster */
3359 	switch (verify_recmaster(rec, nodemap, pnn)) {
3360 	case MONITOR_RECOVERY_NEEDED:
3361 		/* can not happen */
3363 	case MONITOR_ELECTION_NEEDED:
3364 		force_election(rec, pnn, nodemap);
3368 	case MONITOR_FAILED:
3373 	if (rec->need_recovery) {
3374 		/* a previous recovery didn't finish */
3375 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3379 	/* verify that all active nodes are in normal mode
3380 	   and not in recovery mode
3382 	switch (verify_recmode(ctdb, nodemap)) {
3383 	case MONITOR_RECOVERY_NEEDED:
3384 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3386 	case MONITOR_FAILED:
3388 	case MONITOR_ELECTION_NEEDED:
3389 		/* can not happen */
3395 	if (ctdb->tunable.verify_recovery_lock != 0) {
3396 		/* we should have the reclock - check its not stale */
3397 		ret = check_recovery_lock(ctdb);
3399 			DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3400 			ctdb_set_culprit(rec, ctdb->pnn);
3401 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3406 	/* if there are takeovers requested, perform it and notify the waiters */
3407 	if (rec->reallocate_callers) {
3408 		process_ipreallocate_requests(ctdb, rec);
3411 	/* get the nodemap for all active remote nodes
3413 	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3414 	if (remote_nodemaps == NULL) {
3415 		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3418 	for(i=0; i<nodemap->num; i++) {
3419 		remote_nodemaps[i] = NULL;
3421 	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3422 		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3426 	/* verify that all other nodes have the same nodemap as we have
3428 	for (j=0; j<nodemap->num; j++) {
3429 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3433 		if (remote_nodemaps[j] == NULL) {
3434 			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3435 			ctdb_set_culprit(rec, j);
3440 		/* if the nodes disagree on how many nodes there are
3441 		   then this is a good reason to try recovery
3443 		if (remote_nodemaps[j]->num != nodemap->num) {
3444 			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3445 				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3446 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3447 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3451 		/* if the nodes disagree on which nodes exist and are
3452 		   active, then that is also a good reason to do recovery
3454 		for (i=0;i<nodemap->num;i++) {
3455 			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3456 				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3457 					  nodemap->nodes[j].pnn, i,
3458 					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3459 				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3460 				do_recovery(rec, mem_ctx, pnn, nodemap,
3466 		/* verify the flags are consistent
3468 		for (i=0; i<nodemap->num; i++) {
3469 			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3473 			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3474 				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3475 				  nodemap->nodes[j].pnn,
3476 				  nodemap->nodes[i].pnn,
3477 				  remote_nodemaps[j]->nodes[i].flags,
3478 				  nodemap->nodes[j].flags));
				/* a node's own view of its flags is authoritative
				   for itself; our view wins for other nodes */
3480 					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3481 					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3482 					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3483 					do_recovery(rec, mem_ctx, pnn, nodemap,
3487 					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3488 					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3489 					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3490 					do_recovery(rec, mem_ctx, pnn, nodemap,
3499 	/* there better be the same number of lmasters in the vnn map
3500 	   as there are active nodes or we will have to do a recovery
3502 	if (vnnmap->size != rec->num_active) {
3503 		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3504 			  vnnmap->size, rec->num_active));
3505 		ctdb_set_culprit(rec, ctdb->pnn);
3506 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3510 	/* verify that all active nodes in the nodemap also exist in
3513 	for (j=0; j<nodemap->num; j++) {
3514 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3517 		if (nodemap->nodes[j].pnn == pnn) {
3521 		for (i=0; i<vnnmap->size; i++) {
3522 			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3526 		if (i == vnnmap->size) {
3527 			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3528 				  nodemap->nodes[j].pnn));
3529 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3530 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3536 	/* verify that all other nodes have the same vnnmap
3537 	   and are from the same generation
3539 	for (j=0; j<nodemap->num; j++) {
3540 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3543 		if (nodemap->nodes[j].pnn == pnn) {
3547 		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3548 					  mem_ctx, &remote_vnnmap);
3550 			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3551 				  nodemap->nodes[j].pnn));
3555 		/* verify the vnnmap generation is the same */
3556 		if (vnnmap->generation != remote_vnnmap->generation) {
3557 			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3558 				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3559 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3560 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3564 		/* verify the vnnmap size is the same */
3565 		if (vnnmap->size != remote_vnnmap->size) {
3566 			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3567 				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3568 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3569 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3573 		/* verify the vnnmap is the same */
3574 		for (i=0;i<vnnmap->size;i++) {
3575 			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3576 				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3577 					  nodemap->nodes[j].pnn));
3578 				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3579 				do_recovery(rec, mem_ctx, pnn, nodemap,
3586 	/* we might need to change who has what IP assigned */
3587 	if (rec->need_takeover_run) {
3588 		uint32_t culprit = (uint32_t)-1;
3590 		rec->need_takeover_run = false;
3592 		/* update the list of public ips that a node can handle for
3595 		ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3597 			DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
			/* retry the takeover run on the next iteration */
3599 			rec->need_takeover_run = true;
3603 		/* execute the "startrecovery" event script on all nodes */
3604 		ret = run_startrecovery_eventscript(rec, nodemap);
3606 			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3607 			ctdb_set_culprit(rec, ctdb->pnn);
3608 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3612 		ret = ctdb_takeover_run(ctdb, nodemap);
3614 			DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Try again later\n"));
3618 		/* execute the "recovered" event script on all nodes */
3619 		ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3621 // we cant check whether the event completed successfully
3622 // since this script WILL fail if the node is in recovery mode
3623 // and if that race happens, the code here would just cause a second
3624 // cascading recovery.
3626 			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3627 			ctdb_set_culprit(rec, ctdb->pnn);
3628 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3635   the main monitoring loop
/* Sets up the recovery daemon: allocates the ctdb_recoverd state,
   registers all SRVID message handlers, then loops forever calling
   main_loop() once per recover_interval (never returns normally). */
3637 static void monitor_cluster(struct ctdb_context *ctdb)
3639 	struct ctdb_recoverd *rec;
3641 	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3643 	rec = talloc_zero(ctdb, struct ctdb_recoverd);
3644 	CTDB_NO_MEMORY_FATAL(ctdb, rec);
3648 	rec->priority_time = timeval_current();
3650 	/* register a message port for sending memory dumps */
3651 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3653 	/* register a message port for recovery elections */
3654 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3656 	/* when nodes are disabled/enabled */
3657 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3659 	/* when we are asked to push out a flag change */
3660 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3662 	/* register a message port for vacuum fetch */
3663 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3665 	/* register a message port for reloadnodes  */
3666 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3668 	/* register a message port for performing a takeover run */
3669 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3671 	/* register a message port for disabling the ip check for a short while */
3672 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3674 	/* register a message port for updating the recovery daemons node assignment for an ip */
3675 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
3677 	/* register a message port for forcing a rebalance of a node next
3679 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
	/* the monitoring loop proper: each iteration gets its own talloc
	   context so everything main_loop() allocates is reclaimed */
3682 		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3683 		struct timeval start;
3687 			DEBUG(DEBUG_CRIT,(__location__
3688 					  " Failed to create temp context\n"));
3692 		start = timeval_current();
3693 		main_loop(ctdb, rec, mem_ctx);
3694 		talloc_free(mem_ctx);
3696 		/* we only check for recovery once every second */
3697 		elapsed = timeval_elapsed(&start);
3698 		if (elapsed < ctdb->tunable.recover_interval) {
3699 			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3706 event handler for when the main ctdbd dies
3708 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3709 uint16_t flags, void *private_data)
3711 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3716 called regularly to verify that the recovery daemon is still running
3718 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3719 struct timeval yt, void *p)
3721 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3723 if (kill(ctdb->recoverd_pid, 0) != 0) {
3724 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3726 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
3727 ctdb_restart_recd, ctdb);
3732 event_add_timed(ctdb->ev, ctdb,
3733 timeval_current_ofs(30, 0),
3734 ctdb_check_recd, ctdb);
3737 static void recd_sig_child_handler(struct event_context *ev,
3738 struct signal_event *se, int signum, int count,
3742 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3747 pid = waitpid(-1, &status, WNOHANG);
3749 if (errno != ECHILD) {
3750 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3755 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3761 startup the recovery daemon as a child of the main ctdb daemon
3763 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3766 struct signal_event *se;
3767 struct tevent_fd *fde;
3769 if (pipe(fd) != 0) {
3773 ctdb->ctdbd_pid = getpid();
3775 ctdb->recoverd_pid = fork();
3776 if (ctdb->recoverd_pid == -1) {
3780 if (ctdb->recoverd_pid != 0) {
3782 event_add_timed(ctdb->ev, ctdb,
3783 timeval_current_ofs(30, 0),
3784 ctdb_check_recd, ctdb);
3790 srandom(getpid() ^ time(NULL));
3792 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
3793 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3797 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3799 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
3800 ctdb_recoverd_parent, &fd[0]);
3801 tevent_fd_set_auto_close(fde);
3803 /* set up a handler to pick up sigchld */
3804 se = event_add_signal(ctdb->ev, ctdb,
3806 recd_sig_child_handler,
3809 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3813 monitor_cluster(ctdb);
3815 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3820 shutdown the recovery daemon
3822 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3824 if (ctdb->recoverd_pid == 0) {
3828 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3829 kill(ctdb->recoverd_pid, SIGTERM);
3832 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
3833 struct timeval t, void *private_data)
3835 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3837 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3838 ctdb_stop_recoverd(ctdb);
3839 ctdb_start_recoverd(ctdb);