4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/util_process.h"
37 #include "ctdb_private.h"
38 #include "ctdb_client.h"
40 #include "common/system.h"
41 #include "common/cmdline.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "ctdb_cluster_mutex.h"
47 /* List of SRVID requests that need to be processed */
/* One queued SRVID request, doubly linked via next/prev for DLIST use */
49 struct srvid_list *next, *prev;
50 struct ctdb_srvid_message *request;
/* Container holding the whole queue of pending SRVID requests */
53 struct srvid_requests {
54 struct srvid_list *requests;
/*
 * Send a reply message for a single SRVID request back to the node that
 * queued it.  A sender that used srvid == 0 does not want a reply, so it
 * is skipped.  The outcome of the send is logged at INFO on success and
 * ERR on failure.
 */
57 static void srvid_request_reply(struct ctdb_context *ctdb,
58 struct ctdb_srvid_message *request,
61 /* Someone that sent srvid==0 does not want a reply */
62 if (request->srvid == 0) {
67 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
69 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
70 (unsigned)request->pnn,
71 (unsigned long long)request->srvid));
73 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
74 (unsigned)request->pnn,
75 (unsigned long long)request->srvid));
/*
 * Reply to every queued request in *requests with the given result, then
 * free the whole queue.  TALLOC_FREE releases the container (and the list
 * nodes parented on it) and resets the caller's pointer to NULL.
 */
81 static void srvid_requests_reply(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
87 for (r = (*requests)->requests; r != NULL; r = r->next) {
88 srvid_request_reply(ctdb, r->request, result);
91 /* Free the list structure... */
92 TALLOC_FREE(*requests);
/*
 * Queue a SRVID request for a reply to be sent later.  The container is
 * allocated lazily on first use; the request is talloc_steal()ed onto the
 * new list node, so ownership transfers to the queue.  If any allocation
 * fails, an immediate failure reply is sent to the requester instead
 * (NOTE(review): the value assigned to "ret" for the failure reply is not
 * visible in this fragment - presumably a negative error code; confirm
 * against the full source).
 */
95 static void srvid_request_add(struct ctdb_context *ctdb,
96 struct srvid_requests **requests,
97 struct ctdb_srvid_message *request)
103 if (*requests == NULL) {
104 *requests = talloc_zero(ctdb, struct srvid_requests);
105 if (*requests == NULL) {
110 t = talloc_zero(*requests, struct srvid_list);
112 /* If *requests was just allocated above then free it */
113 if ((*requests)->requests == NULL) {
114 TALLOC_FREE(*requests);
119 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
120 DLIST_ADD((*requests)->requests, t);
125 /* Failed to add the request to the list. Send a fail. */
126 DEBUG(DEBUG_ERR, (__location__
127 " Out of memory, failed to queue SRVID request\n"));
129 result.dsize = sizeof(ret);
130 result.dptr = (uint8_t *)&ret;
131 srvid_request_reply(ctdb, request, result);
134 /* An abstraction to allow an operation (takeover runs, recoveries,
135 * ...) to be disabled for a given timeout */
136 struct ctdb_op_state {
/* Non-NULL while the operation is disabled; fires to re-enable it */
137 struct tevent_timer *timer;
/*
 * Allocate a zeroed op-state tracker named "name", initially enabled and
 * not in progress.
 */
142 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
144 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
147 state->in_progress = false;
/* An operation is "disabled" exactly while its re-enable timer exists */
154 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
156 return state->timer != NULL;
/* Mark the operation as started; refused (logged) while it is disabled */
159 static bool ctdb_op_begin(struct ctdb_op_state *state)
161 if (ctdb_op_is_disabled(state)) {
163 ("Unable to begin - %s are disabled\n", state->name));
167 state->in_progress = true;
/* Mark the operation as finished.
 * NOTE(review): the assignment inside the return means this always
 * returns false - confirm callers rely on that (or ignore the value). */
171 static bool ctdb_op_end(struct ctdb_op_state *state)
173 return state->in_progress = false;
/* Query whether the operation is currently running */
176 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
178 return state->in_progress;
/* Re-enable the operation by destroying its disable timer (if any) */
181 static void ctdb_op_enable(struct ctdb_op_state *state)
183 TALLOC_FREE(state->timer);
/*
 * tevent timer callback: the disable period has expired, so re-enable
 * the operation.  "p" is the ctdb_op_state armed in ctdb_op_disable().
 */
186 static void ctdb_op_timeout_handler(struct tevent_context *ev,
187 struct tevent_timer *te,
188 struct timeval yt, void *p)
190 struct ctdb_op_state *state =
191 talloc_get_type(p, struct ctdb_op_state)
193 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
194 ctdb_op_enable(state);
/*
 * Disable the operation for "timeout" seconds by arming a tevent timer
 * that re-enables it when it fires.  The early "Reenabling" path
 * presumably handles a timeout of 0 (the guarding condition is not
 * visible in this fragment - confirm against the full source).
 * Disabling is refused while the operation is in progress.  Any previous
 * disable timer is discarded before the new one is armed.
 */
197 static int ctdb_op_disable(struct ctdb_op_state *state,
198 struct tevent_context *ev,
202 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
203 ctdb_op_enable(state);
207 if (state->in_progress) {
209 ("Unable to disable %s - in progress\n", state->name));
213 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
214 state->name, timeout));
216 /* Clear any old timers */
217 talloc_free(state->timer);
219 /* Arrange for the timeout to occur */
220 state->timer = tevent_add_timer(ev, state,
221 timeval_current_ofs(timeout, 0),
222 ctdb_op_timeout_handler, state);
223 if (state->timer == NULL) {
224 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
/* Per-node record of accumulated misbehaviour ("culprit credits") used
 * when deciding whether to ban a node */
231 struct ctdb_banning_state {
233 struct timeval last_reported_time;
237 private state of recovery daemon
239 struct ctdb_recoverd {
240 struct ctdb_context *ctdb;
242 uint32_t last_culprit_node;
243 struct ctdb_node_map_old *nodemap;
244 struct timeval priority_time;
245 bool need_takeover_run;
248 struct tevent_timer *send_election_te;
249 struct tevent_timer *election_timeout;
250 struct srvid_requests *reallocate_requests;
251 struct ctdb_op_state *takeover_run;
252 struct ctdb_op_state *recovery;
253 struct ctdb_iface_list_old *ifaces;
254 uint32_t *force_rebalance_nodes;
255 struct ctdb_node_capabilities *caps;
/* Timeouts driven by tunables: per-control timeout and the interval of
 * the monitoring loop */
258 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
259 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration: timer handler that restarts the recovery daemon */
261 static void ctdb_restart_recd(struct tevent_context *ev,
262 struct tevent_timer *te, struct timeval t,
266 ban a node for a period of time
/*
 * Ban the given pnn for ban_time seconds via a SET_BAN control, after
 * validating the pnn.  A failure to set the ban is logged but otherwise
 * not propagated from this fragment.
 */
268 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
271 struct ctdb_context *ctdb = rec->ctdb;
272 struct ctdb_ban_state bantime;
274 if (!ctdb_validate_pnn(ctdb, pnn)) {
275 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
279 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
282 bantime.time = ban_time;
284 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
286 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Possible outcomes of a monitoring pass */
292 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
296 remember the trouble maker
/*
 * Charge "count" culprit credits to the given node.  Credits decay: if
 * the node last misbehaved longer than recovery_grace_period seconds
 * ago, the old count is forgiven (reset to zero) before adding.  A
 * locally INACTIVE node never assigns blame to others.  The per-node
 * ban_state is allocated lazily.
 */
298 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
300 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
301 struct ctdb_banning_state *ban_state;
303 if (culprit > ctdb->num_nodes) {
304 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
308 /* If we are banned or stopped, do not set other nodes as culprits */
309 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
310 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
314 if (ctdb->nodes[culprit]->ban_state == NULL) {
315 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
316 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
320 ban_state = ctdb->nodes[culprit]->ban_state;
321 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
322 /* this was the first time in a long while this node
323 misbehaved so we will forgive any old transgressions.
325 ban_state->count = 0;
328 ban_state->count += count;
329 ban_state->last_reported_time = timeval_current();
330 rec->last_culprit_node = culprit;
334 remember the trouble maker
/* Convenience wrapper: blame the culprit with a single credit */
336 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
338 ctdb_set_culprit_count(rec, culprit, 1);
342 /* this callback is called for every node that failed to execute the
/* Async-control failure callback for the "recovered" event: blame the
 * failing node with one culprit credit */
345 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
347 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
349 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
351 ctdb_set_culprit(rec, node_pnn);
355 run the "recovered" eventscript on all nodes
/*
 * Broadcast CTDB_CONTROL_END_RECOVERY to all active nodes, which runs
 * the "recovered" event script on each.  Nodes that fail are blamed via
 * recovered_fail_callback; "caller" is only used for error logging.
 */
357 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, const char *caller)
361 struct ctdb_context *ctdb = rec->ctdb;
363 tmp_ctx = talloc_new(ctdb);
364 CTDB_NO_MEMORY(ctdb, tmp_ctx);
366 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
367 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
369 CONTROL_TIMEOUT(), false, tdb_null,
370 NULL, recovered_fail_callback,
372 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
374 talloc_free(tmp_ctx);
378 talloc_free(tmp_ctx);
382 /* this callback is called for every node that failed to execute the
/* Async-control failure callback for the "startrecovery" event: blame
 * the failing node with one culprit credit */
385 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
387 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
389 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
391 ctdb_set_culprit(rec, node_pnn);
395 run the "startrecovery" eventscript on all nodes
/*
 * Broadcast CTDB_CONTROL_START_RECOVERY to all active nodes, running the
 * "startrecovery" event script on each.  Any failure aborts with an
 * error after blaming the failing node(s).
 */
397 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
401 struct ctdb_context *ctdb = rec->ctdb;
403 tmp_ctx = talloc_new(ctdb);
404 CTDB_NO_MEMORY(ctdb, tmp_ctx);
406 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
407 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
409 CONTROL_TIMEOUT(), false, tdb_null,
411 startrecovery_fail_callback,
413 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
414 talloc_free(tmp_ctx);
418 talloc_free(tmp_ctx);
423 Retrieve capabilities from all connected nodes
/*
 * Fetch the capabilities of every node in nodemap, refresh our own
 * ctdb->capabilities from this node's entry, and cache the full set in
 * rec->caps (replacing any previously cached copy via TALLOC_FREE +
 * talloc_steal).
 */
425 static int update_capabilities(struct ctdb_recoverd *rec,
426 struct ctdb_node_map_old *nodemap)
430 struct ctdb_node_capabilities *caps;
431 struct ctdb_context *ctdb = rec->ctdb;
433 tmp_ctx = talloc_new(rec);
434 CTDB_NO_MEMORY(ctdb, tmp_ctx);
436 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
437 CONTROL_TIMEOUT(), nodemap);
441 (__location__ " Failed to get node capabilities\n"));
442 talloc_free(tmp_ctx);
446 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
450 " Capabilities don't include current node.\n"));
451 talloc_free(tmp_ctx);
454 ctdb->capabilities = *capp;
456 TALLOC_FREE(rec->caps);
457 rec->caps = talloc_steal(rec, caps);
459 talloc_free(tmp_ctx);
/* Failure callback while freezing a node during recovery: blame it
 * heavily - one culprit credit per node in the current node map */
463 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
465 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
467 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
468 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Failure callback for starting the recovery transaction on a node:
 * same heavy blame as a freeze failure */
471 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
473 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
475 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
476 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
480 change recovery mode on all nodes
/*
 * Push rec_mode to all active nodes via CTDB_CONTROL_SET_RECMODE.  When
 * entering CTDB_RECOVERY_ACTIVE with freeze requested, additionally send
 * CTDB_CONTROL_FREEZE for each database priority (1..NUM_DB_PRIORITIES);
 * freeze failures blame the node via set_recmode_fail_callback.
 */
482 static int set_recovery_mode(struct ctdb_context *ctdb,
483 struct ctdb_recoverd *rec,
484 struct ctdb_node_map_old *nodemap,
485 uint32_t rec_mode, bool freeze)
491 tmp_ctx = talloc_new(ctdb);
492 CTDB_NO_MEMORY(ctdb, tmp_ctx);
494 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
496 data.dsize = sizeof(uint32_t);
497 data.dptr = (unsigned char *)&rec_mode;
499 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
505 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
506 talloc_free(tmp_ctx);
510 /* freeze all nodes */
511 if (freeze && rec_mode == CTDB_RECOVERY_ACTIVE) {
514 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
515 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
520 set_recmode_fail_callback,
522 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
523 talloc_free(tmp_ctx);
529 talloc_free(tmp_ctx);
533 /* update all remote nodes to use the same db priority that we have
534 this can fail if the remove node has not yet been upgraded to
535 support this function, so we always return success and never fail
536 a recovery if this call fails.
/*
 * For every local database, read its priority from the local node and
 * push the same priority back out.  Per the note above, failures are
 * logged but deliberately never fail the recovery.
 */
538 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
539 struct ctdb_node_map_old *nodemap,
540 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
544 /* step through all local databases */
545 for (db=0; db<dbmap->num;db++) {
546 struct ctdb_db_priority db_prio;
549 db_prio.db_id = dbmap->dbs[db].db_id;
550 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].db_id, &db_prio.priority);
552 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].db_id));
556 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].db_id, db_prio.priority));
558 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
559 CTDB_CURRENT_NODE, &db_prio);
561 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
570 ensure all other nodes have attached to any databases that we have
/*
 * For every other active node, fetch its db map and create (attach) any
 * database that exists locally but is missing remotely, preserving the
 * persistent flag.  Our own node and unavailable nodes are skipped.
 */
572 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
573 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
576 struct ctdb_dbid_map_old *remote_dbmap;
578 /* verify that all other nodes have all our databases */
579 for (j=0; j<nodemap->num; j++) {
580 /* we don't need to check ourselves */
581 if (nodemap->nodes[j].pnn == pnn) {
584 /* don't check nodes that are unavailable */
585 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
589 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
590 mem_ctx, &remote_dbmap);
592 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
596 /* step through all local databases */
597 for (db=0; db<dbmap->num;db++) {
601 for (i=0;i<remote_dbmap->num;i++) {
602 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
606 /* the remote node already have this database */
607 if (i!=remote_dbmap->num) {
610 /* ok so we need to create this database */
611 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
612 dbmap->dbs[db].db_id, mem_ctx,
615 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
618 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
619 nodemap->nodes[j].pnn,
621 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
623 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
634 ensure we are attached to any databases that anyone else is attached to
/*
 * Mirror of create_missing_remote_databases: for every other active
 * node, fetch its db map and attach locally to any database we are
 * missing, then re-read our own db map (*dbmap) so it reflects the new
 * attachments.
 */
636 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
637 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
640 struct ctdb_dbid_map_old *remote_dbmap;
642 /* verify that we have all database any other node has */
643 for (j=0; j<nodemap->num; j++) {
644 /* we don't need to check ourselves */
645 if (nodemap->nodes[j].pnn == pnn) {
648 /* don't check nodes that are unavailable */
649 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
653 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
654 mem_ctx, &remote_dbmap);
656 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
660 /* step through all databases on the remote node */
661 for (db=0; db<remote_dbmap->num;db++) {
664 for (i=0;i<(*dbmap)->num;i++) {
665 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
669 /* we already have this db locally */
670 if (i!=(*dbmap)->num) {
673 /* ok so we need to create this database and
676 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
677 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
679 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
680 nodemap->nodes[j].pnn));
683 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
684 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
686 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
689 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
691 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
702 pull the remote database contents from one node into the recdb
/*
 * Pull all records of one database from srcnode (via PULL_DB) and merge
 * them into the temporary recovery tdb.  The marshalled reply is walked
 * record by record; an incoming record only replaces an existing one
 * when its RSN is higher, or when the RSNs are equal and the existing
 * copy's dmaster is not this node (see the negated condition below).
 * Returns non-zero on malformed data or tdb errors.
 */
704 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
705 struct tdb_wrap *recdb, uint32_t dbid)
709 struct ctdb_marshall_buffer *reply;
710 struct ctdb_rec_data_old *recdata;
712 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
714 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
715 CONTROL_TIMEOUT(), &outdata);
717 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
718 talloc_free(tmp_ctx);
722 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
724 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
725 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
726 talloc_free(tmp_ctx);
730 recdata = (struct ctdb_rec_data_old *)&reply->data[0];
/* advance through the marshalled buffer: each record is its own length */
734 recdata = (struct ctdb_rec_data_old *)(recdata->length + (uint8_t *)recdata), i++) {
736 struct ctdb_ltdb_header *hdr;
739 key.dptr = &recdata->data[0];
740 key.dsize = recdata->keylen;
741 data.dptr = &recdata->data[key.dsize];
742 data.dsize = recdata->datalen;
744 hdr = (struct ctdb_ltdb_header *)data.dptr;
746 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
747 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
748 talloc_free(tmp_ctx);
752 /* fetch the existing record, if any */
753 existing = tdb_fetch(recdb->tdb, key);
755 if (existing.dptr != NULL) {
756 struct ctdb_ltdb_header header;
757 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
758 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
759 (unsigned)existing.dsize, srcnode));
761 talloc_free(tmp_ctx);
764 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing copy unless the incoming record wins the RSN race */
766 if (!(header.rsn < hdr->rsn ||
767 (header.dmaster != ctdb_get_pnn(ctdb) &&
768 header.rsn == hdr->rsn))) {
773 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
774 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
775 talloc_free(tmp_ctx);
780 talloc_free(tmp_ctx);
/* Shared accumulator for the async GET_DB_SEQNUM scan: tracks the best
 * (highest) seqnum seen so far, which node reported it, and whether any
 * node failed */
786 struct pull_seqnum_cbdata {
/*
 * Per-node success callback for GET_DB_SEQNUM: validate the 8-byte
 * reply and remember this node if its seqnum is the highest so far
 * (or if it is the first node seen and reports seqnum 0).
 */
792 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
794 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
797 if (cb_data->failed != 0) {
798 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
803 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
808 if (outdata.dsize != sizeof(uint64_t)) {
809 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
810 cb_data->failed = -1;
814 seqnum = *((uint64_t *)outdata.dptr);
816 if (seqnum > cb_data->seqnum ||
817 (cb_data->pnn == -1 && seqnum == 0)) {
818 cb_data->seqnum = seqnum;
819 cb_data->pnn = node_pnn;
/* Per-node failure callback: log it (the failure flag is presumably set
 * in the lines not visible here - confirm against the full source) */
823 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
825 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
827 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
/*
 * For a persistent database: ask every active node for its sequence
 * number, pick the node with the highest one, and pull the whole
 * database from only that node into the recovery tdb.  Fails if any
 * node errored or no winner could be determined.
 */
831 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
832 struct ctdb_recoverd *rec,
833 struct ctdb_node_map_old *nodemap,
834 struct tdb_wrap *recdb, uint32_t dbid)
836 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
840 struct pull_seqnum_cbdata *cb_data;
842 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
847 data.dsize = sizeof(outdata);
848 data.dptr = (uint8_t *)&outdata[0];
850 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
851 if (cb_data == NULL) {
852 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
853 talloc_free(tmp_ctx);
861 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
862 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
864 CONTROL_TIMEOUT(), false, data,
868 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
870 talloc_free(tmp_ctx);
874 if (cb_data->failed != 0) {
875 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
876 talloc_free(tmp_ctx);
880 if (cb_data->pnn == -1) {
881 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
882 talloc_free(tmp_ctx);
886 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
888 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
889 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
890 talloc_free(tmp_ctx);
894 talloc_free(tmp_ctx);
900 pull all the remote database contents into the recdb
/*
 * Pull one database into the recovery tdb.  Persistent databases may
 * use the by-seqnum strategy (single best node) when the
 * recover_pdb_by_seqnum tunable is set; otherwise records are merged
 * from every active node by RSN.  A failing node is blamed with one
 * credit per node in the map.
 */
902 static int pull_remote_database(struct ctdb_context *ctdb,
903 struct ctdb_recoverd *rec,
904 struct ctdb_node_map_old *nodemap,
905 struct tdb_wrap *recdb, uint32_t dbid,
910 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
912 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
918 /* pull all records from all other nodes across onto this node
919 (this merges based on rsn)
921 for (j=0; j<nodemap->num; j++) {
922 /* don't merge from nodes that are unavailable */
923 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
926 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
927 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
928 nodemap->nodes[j].pnn));
929 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
939 update flags on all active nodes
/* Broadcast the given node's flags to all nodes via MODIFY_FLAGS */
941 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
945 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
947 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
955 ensure all nodes have the same vnnmap we do
/*
 * Push our vnn map to every active node with SETVNNMAP, skipping nodes
 * that are unavailable.
 */
957 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
958 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
962 /* push the new vnn map out to all the nodes */
963 for (j=0; j<nodemap->num; j++) {
964 /* don't push to nodes that are unavailable */
965 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
969 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
971 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
981 called when a vacuum fetch has completed - just free it and do the next one
983 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
990 * Process one elements of the vacuum fetch list:
991 * Migrate it over to us with the special flag
992 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
/*
 * Migrate one record to this node as part of vacuuming.  The record's
 * chain is locked non-blockingly - if the lock, the fetch, or the header
 * check fails, or the record is already local (dmaster == pnn), the
 * record is simply skipped to avoid blocking the daemon.
 */
994 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
996 struct ctdb_rec_data_old *r)
998 struct ctdb_client_call_state *state;
1000 struct ctdb_ltdb_header *hdr;
1001 struct ctdb_call call;
1004 call.call_id = CTDB_NULL_FUNC;
1005 call.flags = CTDB_IMMEDIATE_MIGRATION;
1006 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
1008 call.key.dptr = &r->data[0];
1009 call.key.dsize = r->keylen;
1011 /* ensure we don't block this daemon - just skip a record if we can't get
1013 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
1017 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
1018 if (data.dptr == NULL) {
1019 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1023 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1025 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1029 hdr = (struct ctdb_ltdb_header *)data.dptr;
1030 if (hdr->dmaster == pnn) {
1031 /* its already local */
1033 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
/* fire-and-forget migration call; completion just frees the state */
1039 state = ctdb_call_send(ctdb_db, &call);
1040 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1041 if (state == NULL) {
1042 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
1045 state->async.fn = vacuum_fetch_callback;
1046 state->async.private_data = NULL;
1053 handler for vacuum fetch
/*
 * SRVID message handler: receive a marshalled list of records to
 * vacuum-migrate to this node.  Works out whether the database is
 * persistent from the local db map, attaches to it by name, then walks
 * the record list calling vacuum_fetch_process_one() for each entry.
 */
1055 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
1058 struct ctdb_recoverd *rec = talloc_get_type(
1059 private_data, struct ctdb_recoverd);
1060 struct ctdb_context *ctdb = rec->ctdb;
1061 struct ctdb_marshall_buffer *recs;
1063 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1065 struct ctdb_dbid_map_old *dbmap=NULL;
1066 bool persistent = false;
1067 struct ctdb_db_context *ctdb_db;
1068 struct ctdb_rec_data_old *r;
1070 recs = (struct ctdb_marshall_buffer *)data.dptr;
1072 if (recs->count == 0) {
1076 /* work out if the database is persistent */
1077 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1079 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1083 for (i=0;i<dbmap->num;i++) {
1084 if (dbmap->dbs[i].db_id == recs->db_id) {
1085 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1089 if (i == dbmap->num) {
1090 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1094 /* find the name of this database */
1095 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1096 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1101 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1102 if (ctdb_db == NULL) {
1103 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
/* walk the marshalled record list, one variable-length record at a time */
1107 r = (struct ctdb_rec_data_old *)&recs->data[0];
1108 while (recs->count) {
1111 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
1116 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
1121 talloc_free(tmp_ctx);
1126 * handler for database detach
/*
 * SRVID message handler: detach from a database.  The payload is a
 * single uint32_t db_id; a malformed size or an unknown/unattached
 * database is silently ignored.  Detaching removes the db context from
 * the daemon's list and frees it.
 */
1128 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
1131 struct ctdb_recoverd *rec = talloc_get_type(
1132 private_data, struct ctdb_recoverd);
1133 struct ctdb_context *ctdb = rec->ctdb;
1135 struct ctdb_db_context *ctdb_db;
1137 if (data.dsize != sizeof(db_id)) {
1140 db_id = *(uint32_t *)data.dptr;
1142 ctdb_db = find_ctdb_db(ctdb, db_id);
1143 if (ctdb_db == NULL) {
1144 /* database is not attached */
1148 DLIST_REMOVE(ctdb->db_list, ctdb_db);
1150 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1152 talloc_free(ctdb_db);
1156 called when ctdb_wait_timeout should finish
/* Timer callback: flips the caller's timed_out flag so the wait loop
 * below can exit */
1158 static void ctdb_wait_handler(struct tevent_context *ev,
1159 struct tevent_timer *te,
1160 struct timeval yt, void *p)
1162 uint32_t *timed_out = (uint32_t *)p;
1167 wait for a given number of seconds
/*
 * Block for "secs" (fractional) seconds while still servicing tevent
 * events, by spinning tevent_loop_once() until the timer above fires.
 */
1169 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1171 uint32_t timed_out = 0;
1172 time_t usecs = (secs - (time_t)secs) * 1000000;
1173 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
1174 ctdb_wait_handler, &timed_out);
1175 while (!timed_out) {
1176 tevent_loop_once(ctdb->ev);
1181 called when an election times out (ends)
/* Timer callback: the election window closed - clear the timer pointer
 * so ctdb_wait_election() can return */
1183 static void ctdb_election_timeout(struct tevent_context *ev,
1184 struct tevent_timer *te,
1185 struct timeval t, void *p)
1187 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1188 rec->election_timeout = NULL;
1191 DEBUG(DEBUG_WARNING,("Election period ended\n"));
1196 wait for an election to finish. It finished election_timeout seconds after
1197 the last election packet is received
/* Pump the event loop until the election timeout timer has fired */
1199 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1201 struct ctdb_context *ctdb = rec->ctdb;
1202 while (rec->election_timeout) {
1203 tevent_loop_once(ctdb->ev);
1208 Update our local flags from all remote connected nodes.
1209 This is only run when we are or we believe we are the recovery master
/*
 * Compare each remote node's view of its own flags against our nodemap.
 * On mismatch, push the remote node's flags out to all nodes (we are
 * recmaster, so we may) and update our local copy.  A node we cannot
 * query is blamed as a culprit and the function aborts.
 */
1211 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
1214 struct ctdb_context *ctdb = rec->ctdb;
1215 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1217 /* get the nodemap for all active remote nodes and verify
1218 they are the same as for this node
1220 for (j=0; j<nodemap->num; j++) {
1221 struct ctdb_node_map_old *remote_nodemap=NULL;
1224 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1227 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1231 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1232 mem_ctx, &remote_nodemap);
1234 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1235 nodemap->nodes[j].pnn));
1236 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1237 talloc_free(mem_ctx);
1240 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1241 /* We should tell our daemon about this so it
1242 updates its flags or else we will log the same
1243 message again in the next iteration of recovery.
1244 Since we are the recovery master we can just as
1245 well update the flags on all nodes.
1247 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1249 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1253 /* Update our local copy of the flags in the recovery
1256 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1257 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1258 nodemap->nodes[j].flags));
1259 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1261 talloc_free(remote_nodemap);
1263 talloc_free(mem_ctx);
1268 /* Create a new random generation id.
1269 The generation id can not be the INVALID_GENERATION id
/* Draw random() values until one differs from INVALID_GENERATION */
1271 static uint32_t new_generation(void)
1273 uint32_t generation;
1276 generation = random();
1278 if (generation != INVALID_GENERATION) {
1288 create a temporary working database
/*
 * Open a fresh temporary recovery tdb under the state directory
 * (O_EXCL, mode 0600).  Locking is disabled (TDB_NOLOCK) since only the
 * recovery daemon touches it; mmap is disabled under valgrind.
 */
1290 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1293 struct tdb_wrap *recdb;
1296 /* open up the temporary recovery database */
1297 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1298 ctdb->db_directory_state,
1305 tdb_flags = TDB_NOLOCK;
1306 if (ctdb->valgrinding) {
1307 tdb_flags |= TDB_NOMMAP;
1309 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1311 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1312 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1313 if (recdb == NULL) {
1314 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1324 a traverse function for pulling all relevant records from recdb
/* Accumulator passed through tdb_traverse: the marshall buffer being
 * built, its running length, preallocated capacity, and a failure flag */
1327 struct ctdb_context *ctdb;
1328 struct ctdb_marshall_buffer *recdata;
1330 uint32_t allocated_len;
/*
 * tdb_traverse callback: append one recdb record to the marshall
 * buffer.  Empty records are skipped for non-persistent databases only
 * (see the data-corruption rationale below); for non-persistent
 * databases the dmaster is rewritten to this node.  Grows the buffer in
 * pulldb_preallocation_size increments; sets params->failed on error.
 */
1335 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1337 struct recdb_data *params = (struct recdb_data *)p;
1338 struct ctdb_rec_data_old *recdata;
1339 struct ctdb_ltdb_header *hdr;
1342 * skip empty records - but NOT for persistent databases:
1344 * The record-by-record mode of recovery deletes empty records.
1345 * For persistent databases, this can lead to data corruption
1346 * by deleting records that should be there:
1348 * - Assume the cluster has been running for a while.
1350 * - A record R in a persistent database has been created and
1351 * deleted a couple of times, the last operation being deletion,
1352 * leaving an empty record with a high RSN, say 10.
1354 * - Now a node N is turned off.
1356 * - This leaves the local database copy of D on N with the empty
1357 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1358 * the copy of record R.
1360 * - Now the record is created again while node N is turned off.
1361 * This creates R with RSN = 1 on all nodes except for N.
1363 * - Now node N is turned on again. The following recovery will chose
1364 * the older empty copy of R due to RSN 10 > RSN 1.
1366 * ==> Hence the record is gone after the recovery.
1368 * On databases like Samba's registry, this can damage the higher-level
1369 * data structures built from the various tdb-level records.
1371 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1375 /* update the dmaster field to point to us */
1376 hdr = (struct ctdb_ltdb_header *)data.dptr;
1377 if (!params->persistent) {
1378 hdr->dmaster = params->ctdb->pnn;
1379 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1382 /* add the record to the blob ready to send to the nodes */
1383 recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1384 if (recdata == NULL) {
1385 params->failed = true;
1388 if (params->len + recdata->length >= params->allocated_len) {
1389 params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1390 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1392 if (params->recdata == NULL) {
1393 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1394 recdata->length + params->len));
1395 params->failed = true;
1398 params->recdata->count++;
1399 memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
1400 params->len += recdata->length;
1401 talloc_free(recdata);
1407 push the recdb database out to all nodes
/*
 * Marshal the whole temporary recovery database (see traverse_recdb)
 * and push it to every active node via CTDB_CONTROL_PUSH_DB.
 * Returns 0 on success; error paths free the marshall buffer and the
 * temporary talloc context before returning.
 */
1409 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1411 struct tdb_wrap *recdb, struct ctdb_node_map_old *nodemap)
1413 struct recdb_data params;
1414 struct ctdb_marshall_buffer *recdata;
1416 TALLOC_CTX *tmp_ctx;
1419 tmp_ctx = talloc_new(ctdb);
1420 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1422 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1423 CTDB_NO_MEMORY(ctdb, recdata);
1425 recdata->db_id = dbid;
1428 params.recdata = recdata;
/* start length at the header size; records are appended after it */
1429 params.len = offsetof(struct ctdb_marshall_buffer, data);
1430 params.allocated_len = params.len;
1431 params.failed = false;
1432 params.persistent = persistent;
/* NOTE(review): '¶ms' below looks like a mis-encoded '&params' —
 * confirm against the upstream source before building */
1434 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1435 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1436 talloc_free(params.recdata);
1437 talloc_free(tmp_ctx);
1441 if (params.failed) {
1442 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1443 talloc_free(params.recdata);
1444 talloc_free(tmp_ctx);
/* traverse may have realloc'd the buffer; pick up the current pointer */
1448 recdata = params.recdata;
1450 outdata.dptr = (void *)recdata;
1451 outdata.dsize = params.len;
1453 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1454 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1456 CONTROL_TIMEOUT(), false, outdata,
1459 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1460 talloc_free(recdata);
1461 talloc_free(tmp_ctx);
1465 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1466 dbid, recdata->count));
1468 talloc_free(recdata);
1469 talloc_free(tmp_ctx);
1476 go through a full recovery on one database
/*
 * Recover a single database: pull all remote copies into a scratch
 * recdb, wipe the database on every active node (inside the already
 * open transaction identified by transaction_id), then push the merged
 * copy back out via push_recdb_database().
 */
1478 static int recover_database(struct ctdb_recoverd *rec,
1479 TALLOC_CTX *mem_ctx,
1483 struct ctdb_node_map_old *nodemap,
1484 uint32_t transaction_id)
1486 struct tdb_wrap *recdb;
1488 struct ctdb_context *ctdb = rec->ctdb;
1490 struct ctdb_transdb w;
1493 recdb = create_recdb(ctdb, mem_ctx);
1494 if (recdb == NULL) {
1498 /* pull all remote databases onto the recdb */
1499 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1501 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1505 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1507 /* wipe all the remote databases. This is safe as we are in a transaction */
1509 w.tid = transaction_id;
1511 data.dptr = (void *)&w;
1512 data.dsize = sizeof(w);
1514 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1515 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1517 CONTROL_TIMEOUT(), false, data,
1520 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1525 /* push out the correct database. This sets the dmaster and skips
1526 the empty records */
1527 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1533 /* all done with this database */
/* True iff this daemon currently holds the recovery lock (a cluster
 * mutex handle is stashed on the ctdb context while held). */
1539 static bool ctdb_recovery_have_lock(struct ctdb_context *ctdb)
1541 return (ctdb->recovery_lock_handle != NULL);
1544 struct hold_reclock_state {
/*
 * Completion callback for the recovery-lock cluster mutex: on success
 * stores the mutex handle on the ctdb context (marking the lock as
 * held, see ctdb_recovery_have_lock) and reports lock latency to the
 * main daemon; otherwise logs contention / error.
 */
1549 static void hold_reclock_handler(struct ctdb_context *ctdb,
1552 struct ctdb_cluster_mutex_handle *h,
1555 struct hold_reclock_state *s =
1556 (struct hold_reclock_state *) private_data;
1560 ctdb->recovery_lock_handle = h;
1561 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(),
1567 ("Unable to take recovery lock - contention\n"));
1572 DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
/*
 * Synchronously attempt to take the recovery lock: start the cluster
 * mutex helper on recovery_lock_file and pump the event loop until the
 * handler reports completion. Returns true on success.
 */
1580 static bool ctdb_recovery_lock(struct ctdb_context *ctdb)
1582 struct ctdb_cluster_mutex_handle *h;
1583 struct hold_reclock_state s = {
1588 h = ctdb_cluster_mutex(ctdb, ctdb->recovery_lock_file, 0);
1593 ctdb_cluster_mutex_set_handler(h, hold_reclock_handler, &s);
1596 tevent_loop_once(ctdb->ev);
1599 /* Ensure no attempts to access to s after function return */
1600 ctdb_cluster_mutex_set_handler(h, hold_reclock_handler, NULL);
/* the mutex helper protocol reports success as the character '0' */
1602 return (s.status == '0');
/* Release the recovery lock if held; freeing the mutex handle drops
 * the lock (talloc destructor on the handle). No-op when not held. */
1605 static void ctdb_recovery_unlock(struct ctdb_context *ctdb)
1607 if (ctdb->recovery_lock_handle != NULL) {
1608 DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
1609 TALLOC_FREE(ctdb->recovery_lock_handle);
1613 /* when we start a recovery, make sure all nodes use the same reclock file
/*
 * Broadcast this node's recovery_lock_file setting to all active nodes
 * (CTDB_CONTROL_SET_RECLOCK_FILE) so the whole cluster contends on the
 * same lock file. Returns non-zero if the async control fails anywhere.
 */
1616 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1618 struct ctdb_context *ctdb = rec->ctdb;
1619 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1623 if (ctdb->recovery_lock_file == NULL) {
/* +1 so the terminating NUL travels with the string */
1627 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1628 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1631 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1632 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1638 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1639 talloc_free(tmp_ctx);
1643 talloc_free(tmp_ctx);
1649 * this callback is called for every node that failed to execute ctdb_takeover_run()
1650 * and set flag to re-run takeover run.
/* Per-node failure callback for the takeover run: logs the failure and,
 * when callback_data carries the recoverd context, marks the node as a
 * recovery culprit (accumulating banning credits). */
1652 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1654 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1656 if (callback_data != NULL) {
1657 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1659 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1661 ctdb_set_culprit(rec, node_pnn);
/*
 * Scan all nodes' banning credits and ban any node that has reached
 * 2 * num_nodes credits, for recovery_ban_period seconds, resetting its
 * count afterwards. *self_ban is set when the banned node is this one
 * (caller aborts the recovery in that case).
 */
1666 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1668 struct ctdb_context *ctdb = rec->ctdb;
1670 struct ctdb_banning_state *ban_state;
1673 for (i=0; i<ctdb->num_nodes; i++) {
1674 if (ctdb->nodes[i]->ban_state == NULL) {
1677 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* threshold scales with cluster size: 2 credits per node */
1678 if (ban_state->count < 2*ctdb->num_nodes) {
1682 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1683 ctdb->nodes[i]->pnn, ban_state->count,
1684 ctdb->tunable.recovery_ban_period));
1685 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1686 ban_state->count = 0;
1688 /* Banning ourself? */
1689 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/*
 * Run a full public-IP takeover: guard against re-entry via the
 * takeover_run op state, temporarily disable takeover runs on all other
 * connected nodes, run ctdb_takeover_run(), then re-enable. Returns
 * true on success; on failure need_takeover_run is left set so the
 * monitor loop retries.
 */
1695 static bool do_takeover_run(struct ctdb_recoverd *rec,
1696 struct ctdb_node_map_old *nodemap,
1697 bool banning_credits_on_fail)
1699 uint32_t *nodes = NULL;
1700 struct ctdb_disable_message dtr;
/* remember the rebalance list so we only clear it if unchanged below */
1703 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1707 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1709 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1710 DEBUG(DEBUG_ERR, (__location__
1711 " takeover run already in progress \n"));
1716 if (!ctdb_op_begin(rec->takeover_run)) {
1721 /* Disable IP checks (takeover runs, really) on other nodes
1722 * while doing this takeover run. This will stop those other
1723 * nodes from triggering takeover runs when think they should
1724 * be hosting an IP but it isn't yet on an interface. Don't
1725 * wait for replies since a failure here might cause some
1726 * noise in the logs but will not actually cause a problem.
1729 dtr.srvid = 0; /* No reply */
1732 data.dptr = (uint8_t*)&dtr;
1733 data.dsize = sizeof(dtr);
1735 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1737 /* Disable for 60 seconds. This can be a tunable later if
1741 for (i = 0; i < talloc_array_length(nodes); i++) {
1742 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1743 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1745 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1749 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1750 rec->force_rebalance_nodes,
1751 takeover_fail_callback,
1752 banning_credits_on_fail ? rec : NULL);
1754 /* Reenable takeover runs and IP checks on other nodes */
1756 for (i = 0; i < talloc_array_length(nodes); i++) {
1757 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1758 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1760 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1765 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1771 /* Takeover run was successful so clear force rebalance targets */
1772 if (rebalance_nodes == rec->force_rebalance_nodes) {
1773 TALLOC_FREE(rec->force_rebalance_nodes);
1775 DEBUG(DEBUG_WARNING,
1776 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1779 rec->need_takeover_run = !ok;
1781 ctdb_op_end(rec->takeover_run);
1783 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1787 struct recovery_helper_state {
/*
 * tevent fd handler for the recovery-helper status pipe: read the
 * helper's int result; a short read means the helper died without
 * reporting, so treat it as EPIPE.
 */
1794 static void ctdb_recovery_handler(struct tevent_context *ev,
1795 struct tevent_fd *fde,
1796 uint16_t flags, void *private_data)
1798 struct recovery_helper_state *state = talloc_get_type_abort(
1799 private_data, struct recovery_helper_state);
1802 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1803 if (ret != sizeof(state->result)) {
1804 state->result = EPIPE;
/*
 * Run database recovery by spawning the external ctdb_recovery_helper
 * (parallel recovery). Sets up a status pipe, forks the helper with
 * logging, pumps the event loop until the helper reports, and kills the
 * helper on failure. Returns 0 on success.
 */
1811 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
/* resolved once and cached across calls; overridable via env var */
1813 static char prog[PATH_MAX+1] = "";
1815 struct recovery_helper_state *state;
1816 struct tevent_fd *fde;
1819 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1820 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1821 "ctdb_recovery_helper")) {
1822 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1825 state = talloc_zero(mem_ctx, struct recovery_helper_state);
1826 if (state == NULL) {
1827 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1833 ret = pipe(state->fd);
1836 ("Failed to create pipe for recovery helper\n"));
1840 set_close_on_exec(state->fd[0]);
1843 args = talloc_array(state, const char *, nargs);
1845 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* argv: write-end fd, daemon socket, new generation id */
1849 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1850 args[1] = rec->ctdb->daemon.name;
1851 args[2] = talloc_asprintf(args, "%u", new_generation());
1854 if (args[0] == NULL || args[2] == NULL) {
1855 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1859 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1861 if (!ctdb_vfork_with_logging(state, rec->ctdb, "recovery", prog, nargs,
1862 args, NULL, NULL, &state->pid)) {
1864 ("Failed to create child for recovery helper\n"));
/* parent keeps only the read end; child owns the write end */
1868 close(state->fd[1]);
1871 state->done = false;
1873 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1874 TEVENT_FD_READ, ctdb_recovery_handler, state);
1878 tevent_fd_set_auto_close(fde);
1880 while (!state->done) {
1881 tevent_loop_once(rec->ctdb->ev);
1884 close(state->fd[0]);
1887 if (state->result != 0) {
1891 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/* error path: close whichever fds are still open, reap the child */
1896 if (state->fd[0] != -1) {
1897 close(state->fd[0]);
1899 if (state->fd[1] != -1) {
1900 close(state->fd[1]);
1902 if (state->pid != -1) {
1903 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/*
 * In-process (serial) database recovery, used when some node lacks
 * CTDB_CAP_PARALLEL_RECOVERY. Sequence: set recovery mode ACTIVE, run
 * "startrecovery" event, start a cluster-wide transaction under a fresh
 * generation id, recover each database via recover_database(), commit,
 * build and distribute a new vnnmap, then return to NORMAL mode.
 */
1909 static int db_recovery_serial(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1910 uint32_t pnn, struct ctdb_node_map_old *nodemap,
1911 struct ctdb_vnn_map *vnnmap,
1912 struct ctdb_dbid_map_old *dbmap)
1914 struct ctdb_context *ctdb = rec->ctdb;
1915 uint32_t generation;
1920 /* set recovery mode to active on all nodes */
1921 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, true);
1923 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1927 /* execute the "startrecovery" event script on all nodes */
1928 ret = run_startrecovery_eventscript(rec, nodemap);
1930 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1934 /* pick a new generation number */
1935 generation = new_generation();
1937 /* change the vnnmap on this node to use the new generation
1938 number but not on any other nodes.
1939 this guarantees that if we abort the recovery prematurely
1940 for some reason (a node stops responding?)
1941 that we can just return immediately and we will reenter
1942 recovery shortly again.
1943 I.e. we deliberately leave the cluster with an inconsistent
1944 generation id to allow us to abort recovery at any stage and
1945 just restart it from scratch.
1947 vnnmap->generation = generation;
1948 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1950 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1954 /* Database generations are updated when the transaction is commited to
1955 * the databases. So make sure to use the final generation as the
1958 generation = new_generation();
1960 data.dptr = (void *)&generation;
1961 data.dsize = sizeof(uint32_t);
1963 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1964 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1966 CONTROL_TIMEOUT(), false, data,
1968 transaction_start_fail_callback,
1970 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
/* best-effort rollback of any transactions that did start */
1971 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1973 CONTROL_TIMEOUT(), false, tdb_null,
1977 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1982 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1984 for (i=0;i<dbmap->num;i++) {
1985 ret = recover_database(rec, mem_ctx,
1986 dbmap->dbs[i].db_id,
1987 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1988 pnn, nodemap, generation);
1990 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].db_id));
1995 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1997 /* commit all the changes */
1998 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
2000 CONTROL_TIMEOUT(), false, data,
2003 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
2007 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
2009 /* build a new vnn map with all the currently active and
2011 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
2012 CTDB_NO_MEMORY(ctdb, vnnmap);
2013 vnnmap->generation = generation;
2015 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
2016 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2017 for (i=j=0;i<nodemap->num;i++) {
2018 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
/* only nodes advertising the LMASTER capability may appear in the map */
2021 if (!ctdb_node_has_capabilities(rec->caps,
2022 ctdb->nodes[i]->pnn,
2023 CTDB_CAP_LMASTER)) {
2024 /* this node can not be an lmaster */
2025 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
2030 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2031 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2032 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2035 if (vnnmap->size == 0) {
2036 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2038 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2039 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2040 vnnmap->map[0] = pnn;
2043 /* update to the new vnnmap on all nodes */
2044 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2046 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2050 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2052 /* disable recovery mode */
2053 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL, false);
2055 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2059 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2065 we are the recmaster, and recovery is needed - start a recovery run
/*
 * Top-level recovery driver, run only on the recovery master. Guards:
 * still recmaster, no election in progress, recovery op not already
 * running, misbehaving nodes banned (abort on self-ban), recovery lock
 * taken. Then: sync db lists across nodes, sync reclock setting,
 * refresh capabilities and flags, run parallel or serial db recovery
 * depending on cluster capabilities, do a takeover run, fire the
 * "recovered" event, broadcast RECONFIGURE, reset ban counts, and
 * suppress re-recovery for rerecovery_timeout seconds.
 */
2067 static int do_recovery(struct ctdb_recoverd *rec,
2068 TALLOC_CTX *mem_ctx, uint32_t pnn,
2069 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
2071 struct ctdb_context *ctdb = rec->ctdb;
2073 struct ctdb_dbid_map_old *dbmap;
2077 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
2079 /* Check if the current node is still the recmaster. It's possible that
2080 * re-election has changed the recmaster.
2082 if (pnn != rec->recmaster) {
2084 ("Recovery master changed to %u, aborting recovery\n",
2089 /* if recovery fails, force it again */
2090 rec->need_recovery = true;
2092 if (!ctdb_op_begin(rec->recovery)) {
2096 if (rec->election_timeout) {
2097 /* an election is in progress */
2098 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
2102 ban_misbehaving_nodes(rec, &self_ban);
2104 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
2108 if (ctdb->recovery_lock_file != NULL) {
2109 if (ctdb_recovery_have_lock(ctdb)) {
2110 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
2112 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
2113 ctdb->recovery_lock_file));
2114 if (!ctdb_recovery_lock(ctdb)) {
2115 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
2116 /* If ctdb is trying first recovery, it's
2117 * possible that current node does not know
2118 * yet who the recmaster is.
2120 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
2121 " - retrying recovery\n"));
2125 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
2126 "and ban ourself for %u seconds\n",
2127 ctdb->tunable.recovery_ban_period));
2128 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2132 ("Recovery lock taken successfully by recovery daemon\n"));
2136 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
2138 /* get a list of all databases */
2139 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
2141 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
2145 /* we do the db creation before we set the recovery mode, so the freeze happens
2146 on all databases we will be dealing with. */
2148 /* verify that we have all the databases any other node has */
2149 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
2151 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
2155 /* verify that all other nodes have all our databases */
2156 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
2158 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
2161 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
2163 /* update the database priority for all remote databases */
2164 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
2166 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
2168 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
2171 /* update all other nodes to use the same setting for reclock files
2172 as the local recovery master.
/* NOTE(review): return value deliberately ignored — failure here is
 * non-fatal to the recovery, but confirm against upstream intent */
2174 sync_recovery_lock_file_across_cluster(rec);
2176 /* Retrieve capabilities from all connected nodes */
2177 ret = update_capabilities(rec, nodemap);
2179 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2184 update all nodes to have the same flags that we have
2186 for (i=0;i<nodemap->num;i++) {
2187 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2191 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
2193 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2194 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
2196 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2202 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2204 /* Check if all participating nodes have parallel recovery capability */
2205 par_recovery = true;
2206 for (i=0; i<nodemap->num; i++) {
2207 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2211 if (!(rec->caps[i].capabilities &
2212 CTDB_CAP_PARALLEL_RECOVERY)) {
2213 par_recovery = false;
/* helper-based recovery when every active node supports it,
 * otherwise fall back to the in-process serial path */
2219 ret = db_recovery_parallel(rec, mem_ctx);
2221 ret = db_recovery_serial(rec, mem_ctx, pnn, nodemap, vnnmap,
2229 do_takeover_run(rec, nodemap, false);
2231 /* execute the "recovered" event script on all nodes */
2232 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2234 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2238 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2240 /* send a message to all clients telling them that the cluster
2241 has been reconfigured */
2242 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2243 CTDB_SRVID_RECONFIGURE, tdb_null);
2245 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2249 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2251 rec->need_recovery = false;
2252 ctdb_op_end(rec->recovery);
2254 /* we managed to complete a full recovery, make sure to forgive
2255 any past sins by the nodes that could now participate in the
2258 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2259 for (i=0;i<nodemap->num;i++) {
2260 struct ctdb_banning_state *ban_state;
2262 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2266 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2267 if (ban_state == NULL) {
2271 ban_state->count = 0;
2274 /* We just finished a recovery successfully.
2275 We now wait for rerecovery_timeout before we allow
2276 another recovery to take place.
2278 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2279 ctdb_op_disable(rec->recovery, ctdb->ev,
2280 ctdb->tunable.rerecovery_timeout);
2284 ctdb_op_end(rec->recovery);
2290 elections are won by first checking the number of connected nodes, then
2291 the priority time, then the pnn
/* Payload broadcast on CTDB_SRVID_ELECTION; compared field by field in
 * ctdb_election_win() (connectivity, then uptime, then pnn). */
2293 struct election_message {
2294 uint32_t num_connected;
2295 struct timeval priority_time;
2297 uint32_t node_flags;
2301 form this nodes election data
/*
 * Fill *em with this node's election credentials: pnn, recoverd start
 * time, current node flags and count of connected nodes. A node without
 * the RECMASTER capability zeroes its connectivity and resets its
 * priority time so it loses to any capable node.
 */
2303 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2306 struct ctdb_node_map_old *nodemap;
2307 struct ctdb_context *ctdb = rec->ctdb;
2311 em->pnn = rec->ctdb->pnn;
2312 em->priority_time = rec->priority_time;
2314 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2316 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
/* cache our own flags; ctdb_election_win consults rec->node_flags */
2320 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2321 em->node_flags = rec->node_flags;
2323 for (i=0;i<nodemap->num;i++) {
2324 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2325 em->num_connected++;
2329 /* we shouldnt try to win this election if we cant be a recmaster */
2330 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2331 em->num_connected = 0;
2332 em->priority_time = timeval_current();
2335 talloc_free(nodemap);
2339 see if the given election data wins
/*
 * Decide whether the remote election message *em beats this node's own
 * credentials. Disqualifiers first (no RECMASTER cap, banned, stopped
 * — on either side), then tie-break on earliest priority_time and
 * finally lowest pnn.
 */
2341 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2343 struct election_message myem;
2346 ctdb_election_data(rec, &myem);
2348 /* we cant win if we don't have the recmaster capability */
2349 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2353 /* we cant win if we are banned */
2354 if (rec->node_flags & NODE_FLAGS_BANNED) {
2358 /* we cant win if we are stopped */
2359 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2363 /* we will automatically win if the other node is banned */
2364 if (em->node_flags & NODE_FLAGS_BANNED) {
/* same rule for a stopped peer */
2368 /* we will automatically win if the other node is banned */
2369 if (em->node_flags & NODE_FLAGS_STOPPED) {
2373 /* then the longest running node */
2375 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break: lower pnn wins */
2379 cmp = (int)myem.pnn - (int)em->pnn;
2386 send out an election request
/*
 * Broadcast this node's election message to all nodes, first
 * optimistically installing ourself as recmaster locally so that the
 * local daemon and rec->recmaster agree while votes come in.
 */
2388 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2391 TDB_DATA election_data;
2392 struct election_message emsg;
2394 struct ctdb_context *ctdb = rec->ctdb;
2396 srvid = CTDB_SRVID_ELECTION;
2398 ctdb_election_data(rec, &emsg);
2400 election_data.dsize = sizeof(struct election_message);
2401 election_data.dptr = (unsigned char *)&emsg;
2404 /* first we assume we will win the election and set
2405 recoverymaster to be ourself on the current node
2407 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2408 CTDB_CURRENT_NODE, pnn);
2410 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
2413 rec->recmaster = pnn;
2415 /* send an election message to all active nodes */
2416 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2417 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2421 we think we are winning the election - send a broadcast election request
/*
 * tevent timer callback: broadcast our election request and clear the
 * pending-send timer handle so a new one can be scheduled.
 */
2423 static void election_send_request(struct tevent_context *ev,
2424 struct tevent_timer *te,
2425 struct timeval t, void *p)
2427 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2430 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2432 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2435 TALLOC_FREE(rec->send_election_te);
2439 handler for memory dumps
/*
 * SRVID handler: dump this recovery daemon's talloc memory usage and
 * send it back to the requester identified by the ctdb_srvid_message in
 * 'data'. Validates the request size before use; all temporaries hang
 * off tmp_ctx and are freed on every exit path.
 */
2441 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2443 struct ctdb_recoverd *rec = talloc_get_type(
2444 private_data, struct ctdb_recoverd);
2445 struct ctdb_context *ctdb = rec->ctdb;
2446 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2449 struct ctdb_srvid_message *rd;
2451 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2452 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2453 talloc_free(tmp_ctx);
2456 rd = (struct ctdb_srvid_message *)data.dptr;
2458 dump = talloc_zero(tmp_ctx, TDB_DATA);
2460 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2461 talloc_free(tmp_ctx);
2464 ret = ctdb_dump_memory(ctdb, dump);
2466 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2467 talloc_free(tmp_ctx);
2471 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2473 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2475 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2476 talloc_free(tmp_ctx);
2480 talloc_free(tmp_ctx);
2484 handler for reload_nodes
/* SRVID handler: re-read the nodes file into the ctdb context. The
 * message payload is unused. */
2486 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
2489 struct ctdb_recoverd *rec = talloc_get_type(
2490 private_data, struct ctdb_recoverd);
2492 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2494 ctdb_load_nodes_file(rec->ctdb);
/*
 * SRVID handler (recmaster only): queue a PNN for forced IP rebalancing
 * on the next takeover run. The payload is a single uint32_t PNN; the
 * existing target list is copied into a freshly allocated array so that
 * freeing the old array also cancels its timeout event.
 */
2498 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
2501 struct ctdb_recoverd *rec = talloc_get_type(
2502 private_data, struct ctdb_recoverd);
2503 struct ctdb_context *ctdb = rec->ctdb;
2508 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2512 if (data.dsize != sizeof(uint32_t)) {
2513 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2517 pnn = *(uint32_t *)&data.dptr[0];
2519 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2521 /* Copy any existing list of nodes. There's probably some
2522 * sort of realloc variant that will do this but we need to
2523 * make sure that freeing the old array also cancels the timer
2524 * event for the timeout... not sure if realloc will do that.
2526 len = (rec->force_rebalance_nodes != NULL) ?
2527 talloc_array_length(rec->force_rebalance_nodes) :
2530 /* This allows duplicates to be added but they don't cause
2531 * harm. A call to add a duplicate PNN arguably means that
2532 * the timeout should be reset, so this is the simplest
2535 t = talloc_zero_array(rec, uint32_t, len+1);
2536 CTDB_NO_MEMORY_VOID(ctdb, t);
2538 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2542 talloc_free(rec->force_rebalance_nodes);
2544 rec->force_rebalance_nodes = t;
/*
 * SRVID handler (recmaster only): record an externally reported public
 * IP assignment in the IP assignment tree. Payload must be exactly one
 * struct ctdb_public_ip.
 */
2549 static void recd_update_ip_handler(uint64_t srvid, TDB_DATA data,
2552 struct ctdb_recoverd *rec = talloc_get_type(
2553 private_data, struct ctdb_recoverd);
2554 struct ctdb_public_ip *ip;
2556 if (rec->recmaster != rec->ctdb->pnn) {
2557 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2561 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2562 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2566 ip = (struct ctdb_public_ip *)data.dptr;
2568 update_ip_assignment_tree(rec->ctdb, ip);
/*
 * Common helper for "disable X for N seconds" SRVID requests: validate
 * the ctdb_disable_message payload, disable the given op state for the
 * requested timeout, and reply to the sender with this node's PNN as an
 * int32 to signal success (see srvid_request_reply).
 */
2571 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2573 struct ctdb_op_state *op_state)
2575 struct ctdb_disable_message *r;
2580 /* Validate input data */
2581 if (data.dsize != sizeof(struct ctdb_disable_message)) {
2582 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2583 "expecting %lu\n", (long unsigned)data.dsize,
2584 (long unsigned)sizeof(struct ctdb_srvid_message)));
2587 if (data.dptr == NULL) {
2588 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2592 r = (struct ctdb_disable_message *)data.dptr;
2593 timeout = r->timeout;
2595 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2600 /* Returning our PNN tells the caller that we succeeded */
2601 ret = ctdb_get_pnn(ctdb);
2603 result.dsize = sizeof(int32_t);
2604 result.dptr = (uint8_t *)&ret;
2605 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
/* SRVID handler: disable takeover runs for the requested period and
 * acknowledge the sender (delegates to srvid_disable_and_reply). */
2608 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
2611 struct ctdb_recoverd *rec = talloc_get_type(
2612 private_data, struct ctdb_recoverd);
2614 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
2617 /* Backward compatibility for this SRVID */
/*
 * Legacy SRVID handler: older clients send a bare uint32_t timeout
 * (no reply address), so just disable takeover runs without replying.
 */
2618 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
2621 struct ctdb_recoverd *rec = talloc_get_type(
2622 private_data, struct ctdb_recoverd);
2625 if (data.dsize != sizeof(uint32_t)) {
2626 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2627 "expecting %lu\n", (long unsigned)data.dsize,
2628 (long unsigned)sizeof(uint32_t)));
2631 if (data.dptr == NULL) {
2632 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2636 timeout = *((uint32_t *)data.dptr);
2638 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
/* SRVID handler: disable recoveries for the requested period and
 * acknowledge the sender (delegates to srvid_disable_and_reply). */
2641 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
2644 struct ctdb_recoverd *rec = talloc_get_type(
2645 private_data, struct ctdb_recoverd);
2647 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
2651 handler for ip reallocate, just add it to the list of requests and
2652 handle this later in the monitor_cluster loop so we do not recurse
2653 with other requests to takeover_run()
/*
 * SRVID handler: queue an IP reallocation request.  The queued requests
 * are drained by process_ipreallocate_requests() from the main monitor
 * loop, which avoids re-entering takeover_run() from inside a handler.
 */
2655 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
2658 struct ctdb_srvid_message *request;
2659 struct ctdb_recoverd *rec = talloc_get_type(
2660 private_data, struct ctdb_recoverd);
/* The payload must be the sender's return address (pnn + srvid) */
2662 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2663 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2667 request = (struct ctdb_srvid_message *)data.dptr;
2669 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
2672 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2673 struct ctdb_recoverd *rec)
2677 struct srvid_requests *current;
2679 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2681 /* Only process requests that are currently pending. More
2682 * might come in while the takeover run is in progress and
2683 * they will need to be processed later since they might
2684 * be in response flag changes.
2686 current = rec->reallocate_requests;
2687 rec->reallocate_requests = NULL;
2689 if (do_takeover_run(rec, rec->nodemap, false)) {
2690 ret = ctdb_get_pnn(ctdb);
2695 result.dsize = sizeof(int32_t);
2696 result.dptr = (uint8_t *)&ret;
2698 srvid_requests_reply(ctdb, ¤t, result);
2702 * handler for assigning banning credits
/*
 * SRVID handler: assign banning credits ("culprit count") to a node.
 * Only acted upon by the recovery master; payload is the PNN to blame.
 */
2704 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2706 struct ctdb_recoverd *rec = talloc_get_type(
2707 private_data, struct ctdb_recoverd);
2710 /* Ignore if we are not recmaster */
2711 if (rec->ctdb->pnn != rec->recmaster) {
/* Validate: payload must be a single uint32_t PNN */
2715 if (data.dsize != sizeof(uint32_t)) {
2716 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
2721 ban_pnn = *(uint32_t *)data.dptr;
/* Credit the node with nodemap->num culprit points in one go */
2723 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
2727 handler for recovery master elections
/*
 * SRVID handler: process an election packet from another node.
 * If we would win the election we schedule our own election request;
 * otherwise we concede: cancel our pending request, drop the recovery
 * lock, clear local IP-assignment state and record the sender as the
 * new recovery master.
 */
2729 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2731 struct ctdb_recoverd *rec = talloc_get_type(
2732 private_data, struct ctdb_recoverd);
2733 struct ctdb_context *ctdb = rec->ctdb;
/* NOTE(review): data.dsize is not validated before this cast — the
 * elided lines may do so; confirm against the full source. */
2735 struct election_message *em = (struct election_message *)data.dptr;
2737 /* Ignore election packets from ourself */
2738 if (ctdb->pnn == em->pnn) {
2742 /* we got an election packet - update the timeout for the election */
2743 talloc_free(rec->election_timeout);
2744 rec->election_timeout = tevent_add_timer(
2747 timeval_current_ofs(0, 500000) :
2748 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2749 ctdb_election_timeout, rec);
2751 /* someone called an election. check their election data
2752 and if we disagree and we would rather be the elected node,
2753 send a new election message to all other nodes
2755 if (ctdb_election_win(rec, em)) {
/* Delay our counter-request slightly to let the election settle */
2756 if (!rec->send_election_te) {
2757 rec->send_election_te = tevent_add_timer(
2759 timeval_current_ofs(0, 500000),
2760 election_send_request, rec);
/* We lose: cancel any pending election request of our own */
2766 TALLOC_FREE(rec->send_election_te);
2768 /* Release the recovery lock file */
2769 if (ctdb_recovery_have_lock(ctdb)) {
2770 ctdb_recovery_unlock(ctdb);
/* Forget local IP assignment state; the new recmaster owns it now */
2773 clear_ip_assignment_tree(ctdb);
2775 /* ok, let that guy become recmaster then */
2776 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2777 CTDB_CURRENT_NODE, em->pnn);
/* NOTE(review): this DEBUG message lacks a trailing \n */
2779 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
2782 rec->recmaster = em->pnn;
2789 force the start of the election process
/*
 * Start a recovery-master election: put the cluster into recovery mode
 * (stopping internode traffic), arm the election timeout, broadcast our
 * election request and then wait a few seconds for responses.
 *
 * pnn     : our own node number, used in the election request
 * nodemap : current view of the cluster used to set recovery mode
 */
2791 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2792 struct ctdb_node_map_old *nodemap)
2795 struct ctdb_context *ctdb = rec->ctdb;
2797 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2799 /* set all nodes to recovery mode to stop all internode traffic */
2800 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, false);
2802 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (Re)arm the election timeout; replaces any existing timer */
2806 talloc_free(rec->election_timeout);
2807 rec->election_timeout = tevent_add_timer(
2810 timeval_current_ofs(0, 500000) :
2811 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2812 ctdb_election_timeout, rec);
2814 ret = send_election_request(rec, pnn);
/* NOTE(review): this DEBUG message lacks a trailing \n */
2816 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2820 /* wait for a few seconds to collect all responses */
2821 ctdb_wait_election(rec);
2827 handler for when a node changes its flags
/*
 * SRVID handler: a node's flags changed.  Refresh the local nodemap,
 * record the new flags and — when we are the recmaster and the cluster
 * is in normal mode — schedule a takeover run if the DISABLED/UNHEALTHY
 * flags changed (those affect IP placement but do not trigger recovery).
 */
2829 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2831 struct ctdb_recoverd *rec = talloc_get_type(
2832 private_data, struct ctdb_recoverd);
2833 struct ctdb_context *ctdb = rec->ctdb;
2835 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2836 struct ctdb_node_map_old *nodemap=NULL;
2837 TALLOC_CTX *tmp_ctx;
2839 int disabled_flag_changed;
/* Validate the payload before using c */
2841 if (data.dsize != sizeof(*c)) {
2842 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2846 tmp_ctx = talloc_new(ctdb);
2847 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2849 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2851 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2852 talloc_free(tmp_ctx);
/* Find the nodemap slot for the node whose flags changed */
2857 for (i=0;i<nodemap->num;i++) {
2858 if (nodemap->nodes[i].pnn == c->pnn) break;
2861 if (i == nodemap->num) {
2862 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2863 talloc_free(tmp_ctx);
2867 if (c->old_flags != c->new_flags) {
2868 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* Did the (permanently) disabled bits change relative to our view? */
2871 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2873 nodemap->nodes[i].flags = c->new_flags;
2875 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2876 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2879 rec->recmaster == ctdb->pnn &&
2880 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2881 /* Only do the takeover run if the perm disabled or unhealthy
2882 flags changed since these will cause an ip failover but not
2884 If the node became disconnected or banned this will also
2885 lead to an ip address failover but that is handled
2888 if (disabled_flag_changed) {
2889 rec->need_takeover_run = true;
2893 talloc_free(tmp_ctx);
2897 handler for when we need to push out flag changes to all other nodes
/*
 * SRVID handler: fetch the authoritative node flags from the recmaster
 * and push them to every connected node via CTDB_CONTROL_MODIFY_FLAGS.
 *
 * NOTE(review): data.dsize is not validated before c is dereferenced in
 * this view — the elided lines may do so; confirm against full source.
 */
2899 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2902 struct ctdb_recoverd *rec = talloc_get_type(
2903 private_data, struct ctdb_recoverd);
2904 struct ctdb_context *ctdb = rec->ctdb;
2906 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2907 struct ctdb_node_map_old *nodemap=NULL;
2908 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2911 /* read the node flags from the recmaster */
2912 ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2915 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2916 talloc_free(tmp_ctx);
/* Guard against indexing past the recmaster's nodemap */
2919 if (c->pnn >= nodemap->num) {
2920 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2921 talloc_free(tmp_ctx);
2925 /* send the flags update to all connected nodes */
2926 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2928 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2929 nodes, 0, CONTROL_TIMEOUT(),
2933 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2935 talloc_free(tmp_ctx);
2939 talloc_free(tmp_ctx);
/* Shared state for the async recmode verification: aggregated result
 * across all per-node getrecmode callbacks (a count field is elided
 * from this view). */
2943 struct verify_recmode_normal_data {
2945 enum monitor_result status;
/*
 * Async completion callback for one node's getrecmode control.
 * Downgrades the aggregate status to MONITOR_FAILED on control failure,
 * or to MONITOR_RECOVERY_NEEDED if any node reports it is in recovery.
 */
2948 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2950 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2953 /* one more node has responded with recmode data*/
2956 /* if we failed to get the recmode, then return an error and let
2957 the main loop try again.
2959 if (state->state != CTDB_CONTROL_DONE) {
/* Only downgrade from OK; don't overwrite a stronger verdict */
2960 if (rmdata->status == MONITOR_OK) {
2961 rmdata->status = MONITOR_FAILED;
2966 /* if we got a response, then the recmode will be stored in the
2969 if (state->status != CTDB_RECOVERY_NORMAL) {
2970 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2971 rmdata->status = MONITOR_RECOVERY_NEEDED;
2978 /* verify that all nodes are in normal recovery mode */
/*
 * Send an async getrecmode control to every active node and pump the
 * event loop until all replies arrive.  Returns the aggregated
 * monitor_result (MONITOR_OK, MONITOR_FAILED or
 * MONITOR_RECOVERY_NEEDED) computed by the callback above.
 */
2979 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2981 struct verify_recmode_normal_data *rmdata;
2982 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2983 struct ctdb_client_control_state *state;
2984 enum monitor_result status;
2987 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2988 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2990 rmdata->status = MONITOR_OK;
2992 /* loop over all active nodes and send an async getrecmode call to
2994 for (j=0; j<nodemap->num; j++) {
/* Skip inactive (banned/stopped/disconnected/deleted) nodes */
2995 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2998 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
3000 nodemap->nodes[j].pnn);
3001 if (state == NULL) {
3002 /* we failed to send the control, treat this as
3003 an error and try again next iteration
3005 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
3006 talloc_free(mem_ctx);
3007 return MONITOR_FAILED;
3010 /* set up the callback functions */
3011 state->async.fn = verify_recmode_normal_callback;
3012 state->async.private_data = rmdata;
3014 /* one more control to wait for to complete */
3019 /* now wait for up to the maximum number of seconds allowed
3020 or until all nodes we expect a response from has replied
3022 while (rmdata->count > 0) {
3023 tevent_loop_once(ctdb->ev);
/* Copy the verdict before freeing the context that owns rmdata */
3026 status = rmdata->status;
3027 talloc_free(mem_ctx);
/* Shared state for the async recmaster verification: the recoverd
 * context, the expected recmaster pnn and the aggregated status
 * (count/pnn fields are partly elided from this view). */
3032 struct verify_recmaster_data {
3033 struct ctdb_recoverd *rec;
3036 enum monitor_result status;
/*
 * Async completion callback for one node's getrecmaster control.
 * Downgrades the aggregate status to MONITOR_FAILED on control failure;
 * if a node disagrees about who the recmaster is, blames that node and
 * requests a new election (MONITOR_ELECTION_NEEDED).
 */
3039 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
3041 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
3044 /* one more node has responded with recmaster data*/
3047 /* if we failed to get the recmaster, then return an error and let
3048 the main loop try again.
3050 if (state->state != CTDB_CONTROL_DONE) {
/* Only downgrade from OK; don't overwrite a stronger verdict */
3051 if (rmdata->status == MONITOR_OK) {
3052 rmdata->status = MONITOR_FAILED;
3057 /* if we got a response, then the recmaster will be stored in the
3060 if (state->status != rmdata->pnn) {
3061 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
3062 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3063 rmdata->status = MONITOR_ELECTION_NEEDED;
3070 /* verify that all nodes agree that we are the recmaster */
/*
 * Send an async getrecmaster control to every active node (except the
 * recmaster itself) and pump the event loop until all replies arrive.
 * Returns the aggregated monitor_result; MONITOR_ELECTION_NEEDED means
 * some node disagrees about the recmaster and an election is required.
 */
3071 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
3073 struct ctdb_context *ctdb = rec->ctdb;
3074 struct verify_recmaster_data *rmdata;
3075 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3076 struct ctdb_client_control_state *state;
3077 enum monitor_result status;
3080 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3081 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3085 rmdata->status = MONITOR_OK;
3087 /* loop over all active nodes and send an async getrecmaster call to
3089 for (j=0; j<nodemap->num; j++) {
/* No point asking the recmaster itself */
3090 if (nodemap->nodes[j].pnn == rec->recmaster) {
3093 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3096 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3098 nodemap->nodes[j].pnn);
3099 if (state == NULL) {
3100 /* we failed to send the control, treat this as
3101 an error and try again next iteration
3103 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3104 talloc_free(mem_ctx);
3105 return MONITOR_FAILED;
3108 /* set up the callback functions */
3109 state->async.fn = verify_recmaster_callback;
3110 state->async.private_data = rmdata;
3112 /* one more control to wait for to complete */
3117 /* now wait for up to the maximum number of seconds allowed
3118 or until all nodes we expect a response from has replied
3120 while (rmdata->count > 0) {
3121 tevent_loop_once(ctdb->ev);
/* Copy the verdict before freeing the context that owns rmdata */
3124 status = rmdata->status;
3125 talloc_free(mem_ctx);
/*
 * Compare the local node's current interface list against the cached
 * one in rec->ifaces.  Returns true on first call, on fetch failure
 * (erring on the side of "changed"), or when the interface count,
 * names or link states differ.  Always refreshes the cache.
 */
3129 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3130 struct ctdb_recoverd *rec)
3132 struct ctdb_iface_list_old *ifaces = NULL;
3133 TALLOC_CTX *mem_ctx;
3136 mem_ctx = talloc_new(NULL);
3138 /* Read the interfaces from the local node */
3139 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3140 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3141 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3142 /* We could return an error. However, this will be
3143 * rare so we'll decide that the interfaces have
3144 * actually changed, just in case.
3146 talloc_free(mem_ctx);
3151 /* We haven't been here before so things have changed */
3152 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3154 } else if (rec->ifaces->num != ifaces->num) {
3155 /* Number of interfaces has changed */
3156 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3157 rec->ifaces->num, ifaces->num));
3160 /* See if interface names or link states have changed */
/* Compared slot-by-slot: a pure reordering also counts as a change */
3162 for (i = 0; i < rec->ifaces->num; i++) {
3163 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
3164 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3166 ("Interface in slot %d changed: %s => %s\n",
3167 i, iface->name, ifaces->ifaces[i].name));
3171 if (iface->link_state != ifaces->ifaces[i].link_state) {
3173 ("Interface %s changed state: %d => %d\n",
3174 iface->name, iface->link_state,
3175 ifaces->ifaces[i].link_state));
/* Replace the cached snapshot with the fresh one */
3182 talloc_free(rec->ifaces);
3183 rec->ifaces = talloc_steal(rec, ifaces);
3185 talloc_free(mem_ctx);
3189 /* Check that the local allocation of public IP addresses is correct
3190 * and do some house-keeping */
/*
 * Housekeeping and sanity checks for this node's public IP allocation:
 *  - non-recmaster nodes drop queued reallocate requests and rebalance
 *    state;
 *  - detect unhosted-but-hostable IPs and interface changes;
 *  - (when do_checkpublicip) reconcile CTDB's view with the IPs
 *    actually configured on interfaces, releasing extraneous ones;
 *  - if anything is off, ask the recmaster for a takeover run via
 *    CTDB_SRVID_TAKEOVER_RUN.
 */
3191 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
3192 struct ctdb_recoverd *rec,
3194 struct ctdb_node_map_old *nodemap)
3196 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3198 bool need_takeover_run = false;
3199 struct ctdb_public_ip_list_old *ips = NULL;
3201 /* If we are not the recmaster then do some housekeeping */
3202 if (rec->recmaster != pnn) {
3203 /* Ignore any IP reallocate requests - only recmaster
3206 TALLOC_FREE(rec->reallocate_requests);
3207 /* Clear any nodes that should be force rebalanced in
3208 * the next takeover run. If the recovery master role
3209 * has moved then we don't want to process these some
3210 * time in the future.
3212 TALLOC_FREE(rec->force_rebalance_nodes);
3215 /* Return early if disabled... */
3216 if (ctdb->tunable.disable_ip_failover != 0 ||
3217 ctdb_op_is_disabled(rec->takeover_run)) {
3221 if (interfaces_have_changed(ctdb, rec)) {
3222 need_takeover_run = true;
3225 /* If there are unhosted IPs but this node can host them then
3226 * trigger an IP reallocation */
3228 /* Read *available* IPs from local node */
3229 ret = ctdb_ctrl_get_public_ips_flags(
3230 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
3231 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3233 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
3234 talloc_free(mem_ctx);
3238 for (j=0; j<ips->num; j++) {
/* NOTE(review): pnn == -1 likely relies on unsigned wrap-around
 * matching an "unassigned" sentinel, and nodemap->nodes[pnn]
 * assumes pnn is a valid array index — confirm both against the
 * struct definitions. flags == 0 means fully healthy. */
3239 if (ips->ips[j].pnn == -1 &&
3240 nodemap->nodes[pnn].flags == 0) {
3241 DEBUG(DEBUG_WARNING,
3242 ("Unassigned IP %s can be served by this node\n",
3243 ctdb_addr_to_str(&ips->ips[j].addr)));
3244 need_takeover_run = true;
/* Skip the interface-level reconciliation when disabled */
3250 if (!ctdb->do_checkpublicip) {
3254 /* Validate the IP addresses that this node has on network
3255 * interfaces. If there is an inconsistency between reality
3256 * and the state expected by CTDB then try to fix it by
3257 * triggering an IP reallocation or releasing extraneous IP
3260 /* Read *known* IPs from local node */
3261 ret = ctdb_ctrl_get_public_ips_flags(
3262 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3264 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
3265 talloc_free(mem_ctx);
3269 for (j=0; j<ips->num; j++) {
3270 if (ips->ips[j].pnn == pnn) {
/* CTDB thinks we host this IP: verify it is really configured */
3271 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
3273 ("Assigned IP %s not on an interface\n",
3274 ctdb_addr_to_str(&ips->ips[j].addr)));
3275 need_takeover_run = true;
/* Not ours, but present on an interface: release it */
3278 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
3280 ("IP %s incorrectly on an interface - releasing\n",
3281 ctdb_addr_to_str(&ips->ips[j].addr)));
3282 ret = ctdb_ctrl_release_ip(ctdb,
3288 ("Failed to release IP address\n"));
3295 if (need_takeover_run) {
3296 struct ctdb_srvid_message rd;
3299 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
/* Ask the recmaster to run a takeover; rd carries our return
 * address (fields set in elided lines) */
3304 data.dptr = (uint8_t *)&rd;
3305 data.dsize = sizeof(rd);
3307 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3310 ("Failed to send takeover run request\n"));
3313 talloc_free(mem_ctx);
/*
 * Async callback for CTDB_CONTROL_GET_NODEMAP: stash the replying
 * node's nodemap (ownership stolen onto the result array) indexed by
 * its PNN, after bounds-checking the PNN.
 */
3318 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3320 struct ctdb_node_map_old **remote_nodemaps = callback_data;
3322 if (node_pnn >= ctdb->num_nodes) {
3323 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3327 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Fetch the nodemap from every active node in parallel, filling
 * remote_nodemaps[] (indexed by PNN) via async_getnodemap_callback.
 * Returns 0 on success, non-zero if any fetch failed.
 */
3331 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3332 struct ctdb_node_map_old *nodemap,
3333 struct ctdb_node_map_old **remote_nodemaps)
3337 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3338 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3340 CONTROL_TIMEOUT(), false, tdb_null,
3341 async_getnodemap_callback,
3343 remote_nodemaps) != 0) {
3344 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/*
 * Sync our cached recovery lock file path with the daemon's current
 * setting.  Handles four cases: reclock disabled (clear cache, drop
 * lock), newly enabled (cache it, drop any stale lock), unchanged
 * (no-op), and changed path (re-cache, drop lock so it is re-taken on
 * the new file).
 */
3352 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3354 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3355 const char *reclockfile;
3357 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3358 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3359 talloc_free(tmp_ctx);
/* Daemon reports no reclock file: disable locally too */
3363 if (reclockfile == NULL) {
3364 if (ctdb->recovery_lock_file != NULL) {
3365 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3366 talloc_free(ctdb->recovery_lock_file);
3367 ctdb->recovery_lock_file = NULL;
3368 ctdb_recovery_unlock(ctdb);
3370 talloc_free(tmp_ctx);
/* Reclock newly enabled: remember the path and drop any old lock */
3374 if (ctdb->recovery_lock_file == NULL) {
3376 ("Recovery lock file enabled (%s)\n", reclockfile));
3377 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3378 ctdb_recovery_unlock(ctdb);
3379 talloc_free(tmp_ctx);
/* Unchanged path: nothing to do */
3384 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3385 talloc_free(tmp_ctx);
/* Path changed: re-cache and release the lock on the old file */
3390 ("Recovery lock file changed (now %s)\n", reclockfile));
3391 talloc_free(ctdb->recovery_lock_file);
3392 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3393 ctdb_recovery_unlock(ctdb);
3395 talloc_free(tmp_ctx);
/*
 * Check whether the current recovery master is still legitimate, and
 * force a new election when it is not.  Conditions that trigger an
 * election: recmaster unknown (startup), recmaster lacking
 * CAP_RECMASTER while we have it, recmaster deleted from the nodemap,
 * recmaster disconnected/deleted, or recmaster inactive (per its own
 * nodemap) while we are active.
 *
 * Returns true when validation completed (election forced or not);
 * presumably false on the fetch-error path — confirm in full source.
 */
3399 static bool validate_recovery_master(struct ctdb_recoverd *rec,
3400 TALLOC_CTX *mem_ctx)
3402 struct ctdb_context *ctdb = rec->ctdb;
3403 uint32_t pnn = ctdb_get_pnn(ctdb);
3404 struct ctdb_node_map_old *nodemap = rec->nodemap;
3405 struct ctdb_node_map_old *recmaster_nodemap = NULL;
3408 /* When recovery daemon is started, recmaster is set to
3409 * "unknown" so it knows to start an election.
3411 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
3413 ("Initial recovery master set - forcing election\n"));
3414 force_election(rec, pnn, nodemap);
3419 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3420 * but we have, then force an election and try to become the new
3423 if (!ctdb_node_has_capabilities(rec->caps,
3425 CTDB_CAP_RECMASTER) &&
3426 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3427 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3429 (" Current recmaster node %u does not have CAP_RECMASTER,"
3430 " but we (node %u) have - force an election\n",
3431 rec->recmaster, pnn));
3432 force_election(rec, pnn, nodemap);
3436 /* Verify that the master node has not been deleted. This
3437 * should not happen because a node should always be shutdown
3438 * before being deleted, causing a new master to be elected
3439 * before now. However, if something strange has happened
3440 * then checking here will ensure we don't index beyond the
3441 * end of the nodemap array. */
3442 if (rec->recmaster >= nodemap->num) {
3444 ("Recmaster node %u has been deleted. Force election\n",
3446 force_election(rec, pnn, nodemap);
3450 /* if recovery master is disconnected/deleted we must elect a new recmaster */
3451 if (nodemap->nodes[rec->recmaster].flags &
3452 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
3454 ("Recmaster node %u is disconnected/deleted. Force election\n",
3456 force_election(rec, pnn, nodemap);
3460 /* get nodemap from the recovery master to check if it is inactive */
3461 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
3462 mem_ctx, &recmaster_nodemap);
3466 " Unable to get nodemap from recovery master %u\n",
3468 /* No election, just error */
/* Election needed only if the recmaster considers ITSELF inactive
 * while we ourselves are active */
3473 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
3474 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3476 ("Recmaster node %u is inactive. Force election\n",
3479 * update our nodemap to carry the recmaster's notion of
3480 * its own flags, so that we don't keep freezing the
3481 * inactive recmaster node...
3483 nodemap->nodes[rec->recmaster].flags =
3484 recmaster_nodemap->nodes[rec->recmaster].flags;
3485 force_election(rec, pnn, nodemap);
3492 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3493 TALLOC_CTX *mem_ctx)
3496 struct ctdb_node_map_old *nodemap=NULL;
3497 struct ctdb_node_map_old **remote_nodemaps=NULL;
3498 struct ctdb_vnn_map *vnnmap=NULL;
3499 struct ctdb_vnn_map *remote_vnnmap=NULL;
3500 uint32_t num_lmasters;
3501 int32_t debug_level;
3506 /* verify that the main daemon is still running */
3507 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3508 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3512 /* ping the local daemon to tell it we are alive */
3513 ctdb_ctrl_recd_ping(ctdb);
3515 if (rec->election_timeout) {
3516 /* an election is in progress */
3520 /* read the debug level from the parent and update locally */
3521 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3523 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3526 DEBUGLEVEL = debug_level;
3528 /* get relevant tunables */
3529 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3531 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3536 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3537 CTDB_CURRENT_NODE, &ctdb->runstate);
3539 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3543 /* get the current recovery lock file from the server */
3544 if (update_recovery_lock_file(ctdb) != 0) {
3545 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3549 pnn = ctdb_get_pnn(ctdb);
3552 TALLOC_FREE(rec->nodemap);
3553 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3555 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3558 nodemap = rec->nodemap;
3560 /* remember our own node flags */
3561 rec->node_flags = nodemap->nodes[pnn].flags;
3563 ban_misbehaving_nodes(rec, &self_ban);
3565 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3569 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3570 also frozen and that the recmode is set to active.
3572 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3573 /* If this node has become inactive then we want to
3574 * reduce the chances of it taking over the recovery
3575 * master role when it becomes active again. This
3576 * helps to stabilise the recovery master role so that
3577 * it stays on the most stable node.
3579 rec->priority_time = timeval_current();
3581 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3583 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3585 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3586 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3588 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3590 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3594 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3596 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3601 /* If this node is stopped or banned then it is not the recovery
3602 * master, so don't do anything. This prevents stopped or banned
3603 * node from starting election and sending unnecessary controls.
3608 /* Retrieve capabilities from all connected nodes */
3609 ret = update_capabilities(rec, nodemap);
3611 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3615 if (! validate_recovery_master(rec, mem_ctx)) {
3619 /* Check if an IP takeover run is needed and trigger one if
3621 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
3623 /* if we are not the recmaster then we do not need to check
3624 if recovery is needed
3626 if (pnn != rec->recmaster) {
3631 /* ensure our local copies of flags are right */
3632 ret = update_local_flags(rec, nodemap);
3634 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3638 if (ctdb->num_nodes != nodemap->num) {
3639 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3640 ctdb_load_nodes_file(ctdb);
3644 /* verify that all active nodes agree that we are the recmaster */
3645 switch (verify_recmaster(rec, nodemap, pnn)) {
3646 case MONITOR_RECOVERY_NEEDED:
3647 /* can not happen */
3649 case MONITOR_ELECTION_NEEDED:
3650 force_election(rec, pnn, nodemap);
3654 case MONITOR_FAILED:
3659 /* get the vnnmap */
3660 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3662 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3666 if (rec->need_recovery) {
3667 /* a previous recovery didn't finish */
3668 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3672 /* verify that all active nodes are in normal mode
3673 and not in recovery mode
3675 switch (verify_recmode(ctdb, nodemap)) {
3676 case MONITOR_RECOVERY_NEEDED:
3677 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3679 case MONITOR_FAILED:
3681 case MONITOR_ELECTION_NEEDED:
3682 /* can not happen */
3688 if (ctdb->recovery_lock_file != NULL) {
3689 /* We must already hold the recovery lock */
3690 if (!ctdb_recovery_have_lock(ctdb)) {
3691 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3692 ctdb_set_culprit(rec, ctdb->pnn);
3693 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3699 /* if there are takeovers requested, perform it and notify the waiters */
3700 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3701 rec->reallocate_requests) {
3702 process_ipreallocate_requests(ctdb, rec);
3705 /* If recoveries are disabled then there is no use doing any
3706 * nodemap or flags checks. Recoveries might be disabled due
3707 * to "reloadnodes", so doing these checks might cause an
3708 * unnecessary recovery. */
3709 if (ctdb_op_is_disabled(rec->recovery)) {
3713 /* get the nodemap for all active remote nodes
3715 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
3716 if (remote_nodemaps == NULL) {
3717 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3720 for(i=0; i<nodemap->num; i++) {
3721 remote_nodemaps[i] = NULL;
3723 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3724 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3728 /* verify that all other nodes have the same nodemap as we have
3730 for (j=0; j<nodemap->num; j++) {
3731 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3735 if (remote_nodemaps[j] == NULL) {
3736 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3737 ctdb_set_culprit(rec, j);
3742 /* if the nodes disagree on how many nodes there are
3743 then this is a good reason to try recovery
3745 if (remote_nodemaps[j]->num != nodemap->num) {
3746 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3747 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3748 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3749 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3753 /* if the nodes disagree on which nodes exist and are
3754 active, then that is also a good reason to do recovery
3756 for (i=0;i<nodemap->num;i++) {
3757 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3758 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3759 nodemap->nodes[j].pnn, i,
3760 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3761 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3762 do_recovery(rec, mem_ctx, pnn, nodemap,
3770 * Update node flags obtained from each active node. This ensure we have
3771 * up-to-date information for all the nodes.
3773 for (j=0; j<nodemap->num; j++) {
3774 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3777 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3780 for (j=0; j<nodemap->num; j++) {
3781 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3785 /* verify the flags are consistent
3787 for (i=0; i<nodemap->num; i++) {
3788 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3792 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3793 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3794 nodemap->nodes[j].pnn,
3795 nodemap->nodes[i].pnn,
3796 remote_nodemaps[j]->nodes[i].flags,
3797 nodemap->nodes[i].flags));
3799 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3800 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3801 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3802 do_recovery(rec, mem_ctx, pnn, nodemap,
3806 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3807 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3808 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3809 do_recovery(rec, mem_ctx, pnn, nodemap,
3818 /* count how many active nodes there are */
3820 for (i=0; i<nodemap->num; i++) {
3821 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3822 if (ctdb_node_has_capabilities(rec->caps,
3823 ctdb->nodes[i]->pnn,
3824 CTDB_CAP_LMASTER)) {
3831 /* There must be the same number of lmasters in the vnn map as
3832 * there are active nodes with the lmaster capability... or
3835 if (vnnmap->size != num_lmasters) {
3836 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3837 vnnmap->size, num_lmasters));
3838 ctdb_set_culprit(rec, ctdb->pnn);
3839 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3843 /* verify that all active nodes in the nodemap also exist in
3846 for (j=0; j<nodemap->num; j++) {
3847 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3850 if (nodemap->nodes[j].pnn == pnn) {
3854 for (i=0; i<vnnmap->size; i++) {
3855 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3859 if (i == vnnmap->size) {
3860 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3861 nodemap->nodes[j].pnn));
3862 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3863 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3869 /* verify that all other nodes have the same vnnmap
3870 and are from the same generation
3872 for (j=0; j<nodemap->num; j++) {
3873 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3876 if (nodemap->nodes[j].pnn == pnn) {
3880 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3881 mem_ctx, &remote_vnnmap);
3883 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3884 nodemap->nodes[j].pnn));
3888 /* verify the vnnmap generation is the same */
3889 if (vnnmap->generation != remote_vnnmap->generation) {
3890 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3891 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3892 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3893 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3897 /* verify the vnnmap size is the same */
3898 if (vnnmap->size != remote_vnnmap->size) {
3899 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3900 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3901 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3902 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3906 /* verify the vnnmap is the same */
3907 for (i=0;i<vnnmap->size;i++) {
3908 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3909 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3910 nodemap->nodes[j].pnn));
3911 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3912 do_recovery(rec, mem_ctx, pnn, nodemap,
3919 /* we might need to change who has what IP assigned */
3920 if (rec->need_takeover_run) {
3921 /* If takeover run fails, then the offending nodes are
3922 * assigned ban culprit counts. And we re-try takeover.
3923 * If takeover run fails repeatedly, the node would get
3926 do_takeover_run(rec, nodemap, true);
/*
  the main monitoring loop
 */
static void monitor_cluster(struct ctdb_context *ctdb)
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));

	/* per-daemon recovery state; lives for the lifetime of the daemon */
	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	/* no recovery master known yet; an election must decide */
	rec->recmaster = CTDB_UNKNOWN_PNN;

	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);

	rec->recovery = ctdb_op_init(rec, "recoveries");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);

	rec->priority_time = timeval_current();

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);

	/* when a node is assigned banning credits */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
					banning_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);

	/* when we are asked to push out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);

	/* register a message port for disabling the ip check for a short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);

	/* register a message port for updating the recovery daemons node assignment for an ip */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);

	/* register a message port for forcing a rebalance of a node next
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);

	/* Register a message port for disabling takeover runs */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					disable_takeover_runs_handler, rec);

	/* Register a message port for disabling recoveries */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_RECOVERIES,
					disable_recoveries_handler, rec);

	/* register a message port for detaching database */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DETACH_DATABASE,
					detach_database_handler, rec);

		/* One temporary talloc context per iteration so anything
		   allocated inside main_loop() is freed before the next pass */
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;

			DEBUG(DEBUG_CRIT,(__location__
					  " Failed to create temp context\n"));

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery once every second */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			/* sleep out the remainder of recover_interval so
			   iterations are paced at roughly that period */
			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
/*
  event handler for when the main ctdbd dies

  Fires when the pipe shared with the parent becomes readable (EOF on
  parent exit) — see ctdb_start_recoverd(), which registers this on fd[0].
 */
static void ctdb_recoverd_parent(struct tevent_context *ev,
				 struct tevent_fd *fde,
				 uint16_t flags, void *private_data)
	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
/*
  called regularly to verify that the recovery daemon is still running

  Runs in the main ctdbd (parent) on a 30-second timer armed by
  ctdb_start_recoverd(); re-arms itself at the end.
 */
static void ctdb_check_recd(struct tevent_context *ev,
			    struct tevent_timer *te,
			    struct timeval yt, void *p)
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);

	/* signal 0 probes for process existence without delivering a signal */
	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));

		/* schedule an immediate restart from the event loop */
		tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
				 ctdb_restart_recd, ctdb);

	/* re-arm: check again in 30 seconds; hangs off recd_ctx so the
	   timer dies with the monitoring context */
	tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
			 timeval_current_ofs(30, 0),
			 ctdb_check_recd, ctdb);
/*
  SIGCHLD handler for the recovery daemon: reap exited children so they
  do not linger as zombies.  Non-blocking (WNOHANG); presumably invoked
  in a loop until no more children are pending — the surrounding loop is
  not visible here.
 */
static void recd_sig_child_handler(struct tevent_context *ev,
				   struct tevent_signal *se, int signum,
				   int count, void *dont_care,
	/* context currently unused; kept for reference */
	// struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);

	pid = waitpid(-1, &status, WNOHANG);
	/* ECHILD just means no children left to reap — not an error */
	if (errno != ECHILD) {
		DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
	DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4085 startup the recovery daemon as a child of the main ctdb daemon
4087 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4090 struct tevent_signal *se;
4091 struct tevent_fd *fde;
4093 if (pipe(fd) != 0) {
4097 ctdb->recoverd_pid = ctdb_fork(ctdb);
4098 if (ctdb->recoverd_pid == -1) {
4102 if (ctdb->recoverd_pid != 0) {
4103 talloc_free(ctdb->recd_ctx);
4104 ctdb->recd_ctx = talloc_new(ctdb);
4105 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4108 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
4109 timeval_current_ofs(30, 0),
4110 ctdb_check_recd, ctdb);
4116 srandom(getpid() ^ time(NULL));
4118 prctl_set_comment("ctdb_recovered");
4119 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4120 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4124 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4126 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
4127 ctdb_recoverd_parent, &fd[0]);
4128 tevent_fd_set_auto_close(fde);
4130 /* set up a handler to pick up sigchld */
4131 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
4132 recd_sig_child_handler, ctdb);
4134 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4138 monitor_cluster(ctdb);
4140 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
/*
  shutdown the recovery daemon
 */
void ctdb_stop_recoverd(struct ctdb_context *ctdb)
	/* pid 0 means no recovery daemon was ever started */
	if (ctdb->recoverd_pid == 0) {

	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);

	/* drop the monitoring timer context and ping counter so no stale
	   events fire for the old child */
	TALLOC_FREE(ctdb->recd_ctx);
	TALLOC_FREE(ctdb->recd_ping_count);
/*
  Timer callback (scheduled by ctdb_check_recd) that restarts the
  recovery daemon after it has died: kill any remnant, then fork anew.
 */
static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval t, void *private_data)
	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);

	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
	ctdb_stop_recoverd(ctdb);
	ctdb_start_recoverd(ctdb);