4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "ctdb_cluster_mutex.h"
47 /* List of SRVID requests that need to be processed */
/* NOTE(review): the "struct srvid_list {" opening line and closing braces
 * are missing from this view; the two members below belong to that
 * doubly-linked list element. */
49 struct srvid_list *next, *prev;
50 struct ctdb_srvid_message *request;
/* Container holding the head of the pending SRVID request list */
53 struct srvid_requests {
54 struct srvid_list *requests;
/*
 * Send a reply for a queued SRVID request back to the node that sent it.
 * NOTE(review): several lines (braces, early return, else branch) of this
 * function are missing from this view.
 */
57 static void srvid_request_reply(struct ctdb_context *ctdb,
58 struct ctdb_srvid_message *request,
/* srvid == 0 is the convention for "no reply wanted" */
61 /* Someone that sent srvid==0 does not want a reply */
62 if (request->srvid == 0) {
67 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
69 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
70 (unsigned)request->pnn,
71 (unsigned long long)request->srvid));
73 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
74 (unsigned)request->pnn,
75 (unsigned long long)request->srvid));
/*
 * Reply to every queued SRVID request with the given result and then
 * free the whole queue (frees list elements too, since they are talloc
 * children of *requests).  Safe to call when *requests is NULL.
 */
81 static void srvid_requests_reply(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
87 if (*requests == NULL) {
91 for (r = (*requests)->requests; r != NULL; r = r->next) {
92 srvid_request_reply(ctdb, r->request, result);
95 /* Free the list structure... */
96 TALLOC_FREE(*requests);
/*
 * Queue an SRVID request for later processing.  Allocates the container
 * on first use; on allocation failure the request is answered
 * immediately with a failure result instead of being queued.
 * NOTE(review): error-path labels/braces are missing from this view.
 */
99 static void srvid_request_add(struct ctdb_context *ctdb,
100 struct srvid_requests **requests,
101 struct ctdb_srvid_message *request)
103 struct srvid_list *t;
107 if (*requests == NULL) {
108 *requests = talloc_zero(ctdb, struct srvid_requests);
109 if (*requests == NULL) {
114 t = talloc_zero(*requests, struct srvid_list);
/* Roll back the container if this was the first (failed) element */
116 /* If *requests was just allocated above then free it */
117 if ((*requests)->requests == NULL) {
118 TALLOC_FREE(*requests);
/* Take ownership of the request so it lives as long as the queue entry */
123 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
124 DLIST_ADD((*requests)->requests, t);
129 /* Failed to add the request to the list. Send a fail. */
130 DEBUG(DEBUG_ERR, (__location__
131 " Out of memory, failed to queue SRVID request\n"));
133 result.dsize = sizeof(ret);
134 result.dptr = (uint8_t *)&ret;
135 srvid_request_reply(ctdb, request, result);
138 /* An abstraction to allow an operation (takeover runs, recoveries,
139 * ...) to be disabled for a given timeout */
/* A non-NULL timer means the operation is currently disabled; the timer
 * re-enables it.  NOTE(review): remaining members (e.g. the name and
 * in_progress fields used below) are missing from this view. */
140 struct ctdb_op_state {
141 struct tevent_timer *timer;
/*
 * Allocate and initialise op-state tracking for a named operation.
 * NOTE(review): NULL-check, name assignment and return are missing from
 * this view.
 */
146 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
148 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
151 state->in_progress = false;
158 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
160 return state->timer != NULL;
/*
 * Mark an operation as started, refusing if it is currently disabled.
 * NOTE(review): DEBUG macro opening, braces and return statements are
 * missing from this view.
 */
163 static bool ctdb_op_begin(struct ctdb_op_state *state)
165 if (ctdb_op_is_disabled(state)) {
167 ("Unable to begin - %s are disabled\n", state->name));
171 state->in_progress = true;
175 static bool ctdb_op_end(struct ctdb_op_state *state)
177 return state->in_progress = false;
180 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
182 return state->in_progress;
/*
 * Re-enable an operation by cancelling its disable timer (if any).
 * NOTE(review): braces are missing from this view.
 */
185 static void ctdb_op_enable(struct ctdb_op_state *state)
187 TALLOC_FREE(state->timer);
/*
 * tevent timer callback: fires when a temporary disable period expires
 * and re-enables the operation.
 */
190 static void ctdb_op_timeout_handler(struct tevent_context *ev,
191 struct tevent_timer *te,
192 struct timeval yt, void *p)
194 struct ctdb_op_state *state =
195 talloc_get_type(p, struct ctdb_op_state);
197 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
198 ctdb_op_enable(state);
/*
 * Disable an operation for "timeout" seconds.  A timeout of 0
 * (presumably - the branch condition is missing from this view)
 * re-enables immediately.  Fails if the operation is in progress or the
 * timer cannot be created.
 */
201 static int ctdb_op_disable(struct ctdb_op_state *state,
202 struct tevent_context *ev,
206 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
207 ctdb_op_enable(state);
211 if (state->in_progress) {
213 ("Unable to disable %s - in progress\n", state->name));
217 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
218 state->name, timeout));
/* Replace any previous disable timer with the new one */
220 /* Clear any old timers */
221 talloc_free(state->timer);
223 /* Arrange for the timeout to occur */
224 state->timer = tevent_add_timer(ev, state,
225 timeval_current_ofs(timeout, 0),
226 ctdb_op_timeout_handler, state);
227 if (state->timer == NULL) {
228 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
/* Per-node record of misbehaviour used to decide on banning.
 * NOTE(review): the count member referenced elsewhere is missing from
 * this view. */
235 struct ctdb_banning_state {
237 struct timeval last_reported_time;
241 private state of recovery daemon
243 struct ctdb_recoverd {
244 struct ctdb_context *ctdb;
246 uint32_t last_culprit_node;
247 struct ctdb_node_map_old *nodemap;
248 struct timeval priority_time;
249 bool need_takeover_run;
252 struct tevent_timer *send_election_te;
253 struct tevent_timer *election_timeout;
254 struct srvid_requests *reallocate_requests;
255 struct ctdb_op_state *takeover_run;
256 struct ctdb_op_state *recovery;
257 struct ctdb_iface_list_old *ifaces;
258 uint32_t *force_rebalance_nodes;
259 struct ctdb_node_capabilities *caps;
260 bool frozen_on_inactive;
261 struct ctdb_cluster_mutex_handle *recovery_lock_handle;
/* Deadlines derived from tunables; both expand against a local "ctdb" */
264 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
265 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration (definition appears later in the file);
 * NOTE(review): the final parameter line is missing from this view. */
267 static void ctdb_restart_recd(struct tevent_context *ev,
268 struct tevent_timer *te, struct timeval t,
272 ban a node for a period of time
/*
 * Ask the daemon to ban the given node for ban_time seconds via the
 * SET_BAN control.  Invalid PNNs are rejected with an error message.
 */
274 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
277 struct ctdb_context *ctdb = rec->ctdb;
278 struct ctdb_ban_state bantime;
280 if (!ctdb_validate_pnn(ctdb, pnn)) {
281 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
285 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
288 bantime.time = ban_time;
290 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
292 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
298 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
302 remember the trouble maker
/*
 * Charge "count" banning credits to the given culprit node.  Credits
 * are forgiven if the node has behaved for longer than the
 * recovery_grace_period tunable.  Inactive local nodes never assign
 * blame to others.
 */
304 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
306 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
307 struct ctdb_banning_state *ban_state;
309 if (culprit > ctdb->num_nodes) {
310 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
314 /* If we are banned or stopped, do not set other nodes as culprits */
315 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
316 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
/* Lazily allocate per-node ban state on first offence */
320 if (ctdb->nodes[culprit]->ban_state == NULL) {
321 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
322 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
326 ban_state = ctdb->nodes[culprit]->ban_state;
327 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
328 /* this was the first time in a long while this node
329 misbehaved so we will forgive any old transgressions.
331 ban_state->count = 0;
334 ban_state->count += count;
335 ban_state->last_reported_time = timeval_current();
336 rec->last_culprit_node = culprit;
340 remember the trouble maker
342 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
344 ctdb_set_culprit_count(rec, culprit, 1);
348 Retrieve capabilities from all connected nodes
/*
 * Fetch the capabilities of every node in the map, refresh our own
 * cached capability word and steal the result into rec->caps.
 * Returns 0 on success; NOTE(review): the non-zero error returns are
 * missing from this view.
 */
350 static int update_capabilities(struct ctdb_recoverd *rec,
351 struct ctdb_node_map_old *nodemap)
355 struct ctdb_node_capabilities *caps;
356 struct ctdb_context *ctdb = rec->ctdb;
358 tmp_ctx = talloc_new(rec);
359 CTDB_NO_MEMORY(ctdb, tmp_ctx);
361 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
362 CONTROL_TIMEOUT(), nodemap);
366 (__location__ " Failed to get node capabilities\n"));
367 talloc_free(tmp_ctx);
/* Refresh the cached capabilities of the local node */
371 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
375 " Capabilities don't include current node.\n"));
376 talloc_free(tmp_ctx);
379 ctdb->capabilities = *capp;
/* Keep the full capability map alive on rec */
381 TALLOC_FREE(rec->caps);
382 rec->caps = talloc_steal(rec, caps);
384 talloc_free(tmp_ctx);
389 change recovery mode
/*
 * Broadcast a SET_RECMODE control to all active nodes.
 * NOTE(review): the rec_mode parameter line and success return are
 * missing from this view.
 */
391 static int set_recovery_mode(struct ctdb_context *ctdb,
392 struct ctdb_recoverd *rec,
393 struct ctdb_node_map_old *nodemap,
400 tmp_ctx = talloc_new(ctdb);
401 CTDB_NO_MEMORY(ctdb, tmp_ctx);
403 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
405 data.dsize = sizeof(uint32_t);
406 data.dptr = (unsigned char *)&rec_mode;
408 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
414 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
415 talloc_free(tmp_ctx);
419 talloc_free(tmp_ctx);
424 ensure all other nodes have attached to any databases that we have
/*
 * For every active remote node, compare its database map with our local
 * dbmap and create (attach) any database it is missing.
 * Returns 0 on success; NOTE(review): error returns and loop braces are
 * missing from this view.
 */
426 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
427 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
430 struct ctdb_dbid_map_old *remote_dbmap;
432 /* verify that all other nodes have all our databases */
433 for (j=0; j<nodemap->num; j++) {
434 /* we don't need to check ourselves */
435 if (nodemap->nodes[j].pnn == pnn) {
438 /* don't check nodes that are unavailable */
439 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
443 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
444 mem_ctx, &remote_dbmap);
446 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
450 /* step through all local databases */
451 for (db=0; db<dbmap->num;db++) {
/* Look for this local db id in the remote map */
455 for (i=0;i<remote_dbmap->num;i++) {
456 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
460 /* the remote node already have this database */
461 if (i!=remote_dbmap->num) {
464 /* ok so we need to create this database */
465 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
466 dbmap->dbs[db].db_id, mem_ctx,
469 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
472 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
473 nodemap->nodes[j].pnn,
475 dbmap->dbs[db].flags, NULL);
477 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
488 ensure we are attached to any databases that anyone else is attached to
/*
 * Mirror of create_missing_remote_databases(): attach locally to any
 * database some remote node has that we do not, then re-read our dbmap
 * so *dbmap reflects the new attachments.
 * NOTE(review): several ret checks/braces are missing from this view.
 */
490 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
491 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
494 struct ctdb_dbid_map_old *remote_dbmap;
496 /* verify that we have all database any other node has */
497 for (j=0; j<nodemap->num; j++) {
498 /* we don't need to check ourselves */
499 if (nodemap->nodes[j].pnn == pnn) {
502 /* don't check nodes that are unavailable */
503 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
507 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
508 mem_ctx, &remote_dbmap);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
514 /* step through all databases on the remote node */
515 for (db=0; db<remote_dbmap->num;db++) {
518 for (i=0;i<(*dbmap)->num;i++) {
519 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
523 /* we already have this db locally */
524 if (i!=(*dbmap)->num) {
527 /* ok so we need to create this database and
530 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
531 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
533 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
534 nodemap->nodes[j].pnn));
537 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn,
539 remote_dbmap->dbs[db].flags, NULL);
541 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* Refresh our view of the local database map after attaching */
544 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
546 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
556 update flags on all active nodes
/*
 * Push the given node's flags out to every node via the MODFLAGS
 * control.  NOTE(review): the ret check and return are missing from
 * this view.
 */
558 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
562 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
564 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
572 called when a vacuum fetch has completed - just free it and do the next one
/* Completion callback for ctdb_call_send() issued by
 * vacuum_fetch_process_one(); NOTE(review): the body is missing from
 * this view. */
574 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
/*
581 * Process one element of the vacuum fetch list:
582 * Migrate it over to us with the special flag
583 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
 *
 * Never blocks: if the chainlock cannot be taken immediately the record
 * is skipped.  NOTE(review): return statements and some braces are
 * missing from this view.
 */
585 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
587 struct ctdb_rec_data_old *r)
589 struct ctdb_client_call_state *state;
591 struct ctdb_ltdb_header *hdr;
592 struct ctdb_call call;
595 call.call_id = CTDB_NULL_FUNC;
596 call.flags = CTDB_IMMEDIATE_MIGRATION;
597 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
599 call.key.dptr = &r->data[0];
600 call.key.dsize = r->keylen;
602 /* ensure we don't block this daemon - just skip a record if we can't get
604 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
608 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
609 if (data.dptr == NULL) {
610 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
/* A record shorter than the ltdb header is corrupt/uninitialised */
614 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
616 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
620 hdr = (struct ctdb_ltdb_header *)data.dptr;
621 if (hdr->dmaster == pnn) {
622 /* its already local */
624 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
/* Fire off the migration call; unlock before waiting on the result */
630 state = ctdb_call_send(ctdb_db, &call);
631 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
633 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
636 state->async.fn = vacuum_fetch_callback;
637 state->async.private_data = NULL;
644 handler for vacuum fetch
/*
 * SRVID message handler: receives a marshalled buffer of records to be
 * vacuum-migrated to this node, resolves the database by id, attaches
 * if necessary and processes each record in turn.
 * NOTE(review): goto labels/braces are missing from this view.
 */
646 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
649 struct ctdb_recoverd *rec = talloc_get_type(
650 private_data, struct ctdb_recoverd);
651 struct ctdb_context *ctdb = rec->ctdb;
652 struct ctdb_marshall_buffer *recs;
654 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
656 struct ctdb_dbid_map_old *dbmap=NULL;
657 uint8_t db_flags = 0;
658 struct ctdb_db_context *ctdb_db;
659 struct ctdb_rec_data_old *r;
661 recs = (struct ctdb_marshall_buffer *)data.dptr;
663 if (recs->count == 0) {
667 /* work out if the database is persistent */
668 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
670 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
674 for (i=0;i<dbmap->num;i++) {
675 if (dbmap->dbs[i].db_id == recs->db_id) {
676 db_flags = dbmap->dbs[i].flags;
680 if (i == dbmap->num) {
681 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
685 /* find the name of this database */
686 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
687 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
/* Attach so we have a ctdb_db context to migrate records into */
692 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, db_flags);
693 if (ctdb_db == NULL) {
694 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
698 r = (struct ctdb_rec_data_old *)&recs->data[0];
699 while (recs->count) {
702 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
/* Advance to the next marshalled record (r->length bytes) */
707 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
712 talloc_free(tmp_ctx);
/*
717 * handler for database detach
 *
 * SRVID message handler: removes the database identified by the 32-bit
 * id in "data" from the daemon's db list and frees it.  Ignores the
 * message if the payload size is wrong or the db is not attached.
 */
719 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
722 struct ctdb_recoverd *rec = talloc_get_type(
723 private_data, struct ctdb_recoverd);
724 struct ctdb_context *ctdb = rec->ctdb;
726 struct ctdb_db_context *ctdb_db;
728 if (data.dsize != sizeof(db_id)) {
731 db_id = *(uint32_t *)data.dptr;
733 ctdb_db = find_ctdb_db(ctdb, db_id);
734 if (ctdb_db == NULL) {
735 /* database is not attached */
739 DLIST_REMOVE(ctdb->db_list, ctdb_db);
741 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
743 talloc_free(ctdb_db);
747 called when ctdb_wait_timeout should finish
/* Timer callback for ctdb_wait_timeout(): flags completion via *p.
 * NOTE(review): the assignment line is missing from this view. */
749 static void ctdb_wait_handler(struct tevent_context *ev,
750 struct tevent_timer *te,
751 struct timeval yt, void *p)
753 uint32_t *timed_out = (uint32_t *)p;
758 wait for a given number of seconds
/*
 * Block for "secs" seconds (fractional part converted to microseconds)
 * while still running the event loop, so timers and fds stay serviced.
 */
760 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
762 uint32_t timed_out = 0;
763 time_t usecs = (secs - (time_t)secs) * 1000000;
764 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
765 ctdb_wait_handler, &timed_out);
/* NOTE(review): the "while (!timed_out)" loop header is missing from
 * this view */
767 tevent_loop_once(ctdb->ev);
772 called when an election times out (ends)
/* Timer callback: clears rec->election_timeout, which is the signal
 * ctdb_wait_election() polls to detect that the election is over. */
774 static void ctdb_election_timeout(struct tevent_context *ev,
775 struct tevent_timer *te,
776 struct timeval t, void *p)
778 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
779 rec->election_timeout = NULL;
782 DEBUG(DEBUG_WARNING,("Election period ended\n"));
787 wait for an election to finish. It finished election_timeout seconds after
788 the last election packet is received
/* Pump the event loop until ctdb_election_timeout() clears the timer */
790 static void ctdb_wait_election(struct ctdb_recoverd *rec)
792 struct ctdb_context *ctdb = rec->ctdb;
793 while (rec->election_timeout) {
794 tevent_loop_once(ctdb->ev);
799 Update our local flags from all remote connected nodes.
800 This is only run when we are or we believe we are the recovery master
/*
 * Cross-check every connected node's view of node flags against ours;
 * on mismatch, push the remote view cluster-wide and update the local
 * nodemap copy.  A node we cannot query is recorded as a culprit.
 * NOTE(review): return statements are missing from this view.
 */
802 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
805 struct ctdb_context *ctdb = rec->ctdb;
806 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
808 /* get the nodemap for all active remote nodes and verify
809 they are the same as for this node
811 for (j=0; j<nodemap->num; j++) {
812 struct ctdb_node_map_old *remote_nodemap=NULL;
815 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
818 if (nodemap->nodes[j].pnn == ctdb->pnn) {
822 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
823 mem_ctx, &remote_nodemap);
825 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
826 nodemap->nodes[j].pnn));
827 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
828 talloc_free(mem_ctx);
831 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
832 /* We should tell our daemon about this so it
833 updates its flags or else we will log the same
834 message again in the next iteration of recovery.
835 Since we are the recovery master we can just as
836 well update the flags on all nodes.
838 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
840 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
844 /* Update our local copy of the flags in the recovery
847 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
848 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
849 nodemap->nodes[j].flags));
850 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
852 talloc_free(remote_nodemap);
854 talloc_free(mem_ctx);
859 /* Create a new random generation id.
860 The generation id can not be the INVALID_GENERATION id
/* NOTE(review): the enclosing retry loop and return statement are
 * missing from this view; the visible logic draws random() values until
 * one differs from INVALID_GENERATION. */
862 static uint32_t new_generation(void)
867 generation = random();
869 if (generation != INVALID_GENERATION) {
877 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
879 return (rec->recovery_lock_handle != NULL);
/* State shared with the cluster-mutex callbacks while taking the
 * recovery lock.  NOTE(review): members (done/locked/latency) are
 * missing from this view. */
882 struct hold_reclock_state {
/*
 * Cluster-mutex status callback: '0' means the lock was taken; '1' is
 * contention; anything else is an error.  Records the outcome and
 * latency in the shared hold_reclock_state.
 */
888 static void take_reclock_handler(char status,
892 struct hold_reclock_state *s =
893 (struct hold_reclock_state *) private_data;
897 s->latency = latency;
902 ("Unable to take recovery lock - contention\n"));
906 DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
910 s->locked = (status == '0') ;
/* Forward declaration: lost_reclock_handler() retakes the lock */
913 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec);
/*
 * Called when the recovery-lock helper process dies unexpectedly:
 * drop the stale handle and immediately attempt to retake the lock.
 */
915 static void lost_reclock_handler(void *private_data)
917 struct ctdb_recoverd *rec = talloc_get_type_abort(
918 private_data, struct ctdb_recoverd);
921 ("Recovery lock helper terminated unexpectedly - "
922 "trying to retake recovery lock\n"));
923 TALLOC_FREE(rec->recovery_lock_handle);
924 if (! ctdb_recovery_lock(rec)) {
925 DEBUG(DEBUG_ERR, ("Failed to take recovery lock\n"));
/*
 * Attempt to take the cluster-wide recovery lock via the cluster mutex
 * helper, pumping the event loop until the helper reports a result.
 * On success the handle is stored in rec->recovery_lock_handle and the
 * observed latency reported to the daemon.
 * NOTE(review): initialiser fields, the wait condition and failure
 * paths are missing from this view.
 */
929 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
931 struct ctdb_context *ctdb = rec->ctdb;
932 struct ctdb_cluster_mutex_handle *h;
933 struct hold_reclock_state s = {
939 h = ctdb_cluster_mutex(rec, ctdb, ctdb->recovery_lock, 0,
940 take_reclock_handler, &s,
941 lost_reclock_handler, rec);
947 tevent_loop_once(ctdb->ev);
955 rec->recovery_lock_handle = h;
956 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(),
962 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
964 if (rec->recovery_lock_handle != NULL) {
965 DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
966 TALLOC_FREE(rec->recovery_lock_handle);
/*
 * Walk all nodes and ban any whose accumulated banning credits reach
 * the threshold (2 * num_nodes); resets the credit count after banning.
 * Sets *self_ban when this node bans itself.
 * NOTE(review): loop braces/continue statements are missing from this
 * view.
 */
970 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
972 struct ctdb_context *ctdb = rec->ctdb;
974 struct ctdb_banning_state *ban_state;
977 for (i=0; i<ctdb->num_nodes; i++) {
978 if (ctdb->nodes[i]->ban_state == NULL) {
981 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
982 if (ban_state->count < 2*ctdb->num_nodes) {
986 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
987 ctdb->nodes[i]->pnn, ban_state->count,
988 ctdb->tunable.recovery_ban_period));
989 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
990 ban_state->count = 0;
992 /* Banning ourself? */
993 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/* State for a helper child process (takeover/recovery helpers).
 * NOTE(review): members (fd[2], pid, done, result) are missing from
 * this view. */
999 struct helper_state {
/*
 * fd event handler: reads the helper's integer result from the pipe;
 * a short read is mapped to EPIPE.
 */
1006 static void helper_handler(struct tevent_context *ev,
1007 struct tevent_fd *fde,
1008 uint16_t flags, void *private_data)
1010 struct helper_state *state = talloc_get_type_abort(
1011 private_data, struct helper_state);
1014 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1015 if (ret != sizeof(state->result)) {
1016 state->result = EPIPE;
/*
 * Run an external helper binary (takeover or recovery helper), passing
 * it a pipe fd, the daemon socket name and an optional extra argument.
 * Pumps the event loop until the helper reports its result; aborts if
 * the recmaster changes while waiting (election lost).  Kills the child
 * on any error path.
 * NOTE(review): various checks, gotos and the fail: label are missing
 * from this view.
 */
1022 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1023 const char *prog, const char *arg, const char *type)
1025 struct helper_state *state;
1026 struct tevent_fd *fde;
1029 uint32_t recmaster = rec->recmaster;
1031 state = talloc_zero(mem_ctx, struct helper_state);
1032 if (state == NULL) {
1033 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1039 ret = pipe(state->fd);
1042 ("Failed to create pipe for %s helper\n", type));
1046 set_close_on_exec(state->fd[0]);
1049 args = talloc_array(state, const char *, nargs);
1051 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* argv[0]: write end of the result pipe, passed as a string */
1055 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1056 if (args[0] == NULL) {
1057 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1060 args[1] = rec->ctdb->daemon.name;
1064 if (args[2] == NULL) {
1068 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1069 if (state->pid == -1) {
1071 ("Failed to create child for %s helper\n", type));
/* Parent no longer needs the write end */
1075 close(state->fd[1]);
1078 state->done = false;
1080 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1081 TEVENT_FD_READ, helper_handler, state);
1085 tevent_fd_set_auto_close(fde);
1087 while (!state->done) {
1088 tevent_loop_once(rec->ctdb->ev);
1090 /* If recmaster changes, we have lost election */
1091 if (recmaster != rec->recmaster) {
1092 D_ERR("Recmaster changed to %u, aborting %s\n",
1093 rec->recmaster, type);
1099 close(state->fd[0]);
1102 if (state->result != 0) {
/* Make sure a failed/aborted helper does not linger */
1106 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1111 if (state->fd[0] != -1) {
1112 close(state->fd[0]);
1114 if (state->fd[1] != -1) {
1115 close(state->fd[1]);
1117 if (state->pid != -1) {
1118 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/*
 * Run the takeover helper, passing the force-rebalance node list as a
 * comma-separated argument.  Returns the helper's exit result.
 * NOTE(review): the first/append branch condition and NULL check braces
 * are missing from this view.
 */
1125 static int ctdb_takeover(struct ctdb_recoverd *rec,
1126 uint32_t *force_rebalance_nodes)
1128 static char prog[PATH_MAX+1] = "";
1132 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
1133 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
1134 "ctdb_takeover_helper")) {
1135 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
1139 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1140 uint32_t pnn = force_rebalance_nodes[i];
1142 arg = talloc_asprintf(rec, "%u", pnn);
1144 arg = talloc_asprintf_append(arg, ",%u", pnn);
1147 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1152 return helper_run(rec, rec, prog, arg, "takeover");
/*
 * Perform a public-IP takeover run: temporarily disable takeover runs
 * on all other connected nodes, run the takeover helper, then re-enable
 * them.  Sets rec->need_takeover_run when the run fails so it is
 * retried.  Returns true on success.
 * NOTE(review): several assignments, gotos and the done: label are
 * missing from this view.
 */
1155 static bool do_takeover_run(struct ctdb_recoverd *rec,
1156 struct ctdb_node_map_old *nodemap)
1158 uint32_t *nodes = NULL;
1159 struct ctdb_disable_message dtr;
1162 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1166 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1168 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1169 DEBUG(DEBUG_ERR, (__location__
1170 " takeover run already in progress \n"));
1175 if (!ctdb_op_begin(rec->takeover_run)) {
1180 /* Disable IP checks (takeover runs, really) on other nodes
1181 * while doing this takeover run. This will stop those other
1182 * nodes from triggering takeover runs when they think they
1183 * should be hosting an IP but it isn't yet on an interface.
1184 * Don't wait for replies since a failure here might cause some
1185 * noise in the logs but will not actually cause a problem.
1188 dtr.srvid = 0; /* No reply */
1191 data.dptr = (uint8_t*)&dtr;
1192 data.dsize = sizeof(dtr);
1194 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1196 /* Disable for 60 seconds. This can be a tunable later if
1200 for (i = 0; i < talloc_array_length(nodes); i++) {
1201 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1202 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1204 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1208 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1210 /* Reenable takeover runs and IP checks on other nodes */
1212 for (i = 0; i < talloc_array_length(nodes); i++) {
1213 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1214 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1216 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1221 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1227 /* Takeover run was successful so clear force rebalance targets */
1228 if (rebalance_nodes == rec->force_rebalance_nodes) {
1229 TALLOC_FREE(rec->force_rebalance_nodes);
1231 DEBUG(DEBUG_WARNING,
1232 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1235 rec->need_takeover_run = !ok;
1237 ctdb_op_end(rec->takeover_run);
1239 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
/*
 * Run the external parallel database recovery helper with a freshly
 * generated generation id as its argument.
 */
1243 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1245 static char prog[PATH_MAX+1] = "";
1248 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1249 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1250 "ctdb_recovery_helper")) {
1251 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1254 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1256 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* Helper needs to know where the state databases live */
1260 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1262 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1266 we are the recmaster, and recovery is needed - start a recovery run
/*
 * Top-level recovery sequence run by the recovery master:
 *   - verify we are still recmaster and no election is in progress
 *   - ban misbehaving nodes; take the recovery lock if configured
 *   - create any missing local/remote databases
 *   - sync capabilities and node flags cluster-wide
 *   - run the parallel db recovery helper and a takeover run
 *   - broadcast RECONFIGURE, forgive old banning credits and suppress
 *     further recoveries for the rerecovery timeout
 * NOTE(review): many returns, gotos and the fail: label are missing
 * from this view.
 */
1268 static int do_recovery(struct ctdb_recoverd *rec,
1269 TALLOC_CTX *mem_ctx, uint32_t pnn,
1270 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1272 struct ctdb_context *ctdb = rec->ctdb;
1274 struct ctdb_dbid_map_old *dbmap;
1277 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1279 /* Check if the current node is still the recmaster. It's possible that
1280 * re-election has changed the recmaster.
1282 if (pnn != rec->recmaster) {
1284 ("Recovery master changed to %u, aborting recovery\n",
1289 /* if recovery fails, force it again */
1290 rec->need_recovery = true;
1292 if (!ctdb_op_begin(rec->recovery)) {
1296 if (rec->election_timeout) {
1297 /* an election is in progress */
1298 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1302 ban_misbehaving_nodes(rec, &self_ban);
1304 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1308 if (ctdb->recovery_lock != NULL) {
1309 if (ctdb_recovery_have_lock(rec)) {
1310 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1312 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1313 ctdb->recovery_lock));
1314 if (!ctdb_recovery_lock(rec)) {
1315 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1316 /* If ctdb is trying first recovery, it's
1317 * possible that current node does not know
1318 * yet who the recmaster is.
1320 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1321 " - retrying recovery\n"));
1325 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1326 "and ban ourself for %u seconds\n",
1327 ctdb->tunable.recovery_ban_period));
1328 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1332 ("Recovery lock taken successfully by recovery daemon\n"));
1336 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1338 /* get a list of all databases */
1339 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1341 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1345 /* we do the db creation before we set the recovery mode, so the freeze happens
1346 on all databases we will be dealing with. */
1348 /* verify that we have all the databases any other node has */
1349 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1351 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1355 /* verify that all other nodes have all our databases */
1356 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1358 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1361 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1364 /* Retrieve capabilities from all connected nodes */
1365 ret = update_capabilities(rec, nodemap);
1367 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1372 update all nodes to have the same flags that we have
1374 for (i=0;i<nodemap->num;i++) {
1375 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1379 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1381 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1382 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1384 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1390 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1392 ret = db_recovery_parallel(rec, mem_ctx);
1397 do_takeover_run(rec, nodemap);
1399 /* send a message to all clients telling them that the cluster
1400 has been reconfigured */
1401 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1402 CTDB_SRVID_RECONFIGURE, tdb_null);
1404 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1408 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1410 rec->need_recovery = false;
1411 ctdb_op_end(rec->recovery);
1413 /* we managed to complete a full recovery, make sure to forgive
1414 any past sins by the nodes that could now participate in the
1417 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1418 for (i=0;i<nodemap->num;i++) {
1419 struct ctdb_banning_state *ban_state;
1421 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1425 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1426 if (ban_state == NULL) {
1430 ban_state->count = 0;
1433 /* We just finished a recovery successfully.
1434 We now wait for rerecovery_timeout before we allow
1435 another recovery to take place.
1437 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout))
1438 ctdb_op_disable(rec->recovery, ctdb->ev,
1439 ctdb->tunable.rerecovery_timeout);
1443 ctdb_op_end(rec->recovery);
/*
 * Payload broadcast on CTDB_SRVID_ELECTION.
 * NOTE(review): elided listing — the pnn member (referenced as em->pnn
 * elsewhere) and the closing brace are not visible here.
 */
1449 elections are won by first checking the number of connected nodes, then
1450 the priority time, then the pnn
1452 struct election_message {
/* number of connected nodes this candidate can see (more wins) */
1453 uint32_t num_connected;
/* when this recovery daemon started (earlier wins ties) */
1454 struct timeval priority_time;
/* candidate's own node flags; banned/stopped candidates lose */
1456 uint32_t node_flags;
/*
 * Fill in *em with this node's election credentials: our pnn, daemon
 * start time, node flags and a count of connected nodes taken from a
 * freshly fetched local node map.  Also caches our flags in
 * rec->node_flags as a side effect.
 */
1460 form this nodes election data
1462 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1465 struct ctdb_node_map_old *nodemap;
1466 struct ctdb_context *ctdb = rec->ctdb;
1470 em->pnn = rec->ctdb->pnn;
1471 em->priority_time = rec->priority_time;
1473 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1475 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
/* remember our own flags; they are also advertised in the message */
1479 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1480 em->node_flags = rec->node_flags;
1482 for (i=0;i<nodemap->num;i++) {
1483 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1484 em->num_connected++;
1488 /* we shouldn't try to win this election if we can't be a recmaster */
1489 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
/* zero connected nodes plus a fresh timestamp makes this a losing entry */
1490 em->num_connected = 0;
1491 em->priority_time = timeval_current();
1494 talloc_free(nodemap);
/*
 * Decide whether the remote election entry *em beats our own entry.
 * Ordering (per the struct's comment): recmaster capability and
 * banned/stopped status first, then num_connected, then priority_time,
 * then pnn as the final tie-break.
 * NOTE(review): elided listing — the return statements and the
 * num_connected comparison are not visible here.
 */
1498 see if the given election data wins
1500 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1502 struct election_message myem;
1505 ctdb_election_data(rec, &myem);
1507 /* we can't win if we don't have the recmaster capability */
1508 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1512 /* we can't win if we are banned */
1513 if (rec->node_flags & NODE_FLAGS_BANNED) {
1517 /* we can't win if we are stopped */
1518 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1522 /* we will automatically win if the other node is banned */
1523 if (em->node_flags & NODE_FLAGS_BANNED) {
1527 /* we will automatically win if the other node is stopped */
1528 if (em->node_flags & NODE_FLAGS_STOPPED) {
1532 /* then the longest running node */
1534 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break: lower pnn wins */
1538 cmp = (int)myem.pnn - (int)em->pnn;
/*
 * Optimistically claim the recmaster role on the local node, then
 * broadcast our election data to all nodes.  Returns the result of the
 * broadcast send (0 on success).
 */
1545 send out an election request
1547 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1550 TDB_DATA election_data;
1551 struct election_message emsg;
1553 struct ctdb_context *ctdb = rec->ctdb;
1555 srvid = CTDB_SRVID_ELECTION;
1557 ctdb_election_data(rec, &emsg);
/* emsg lives on the stack; the message is serialized before return */
1559 election_data.dsize = sizeof(struct election_message);
1560 election_data.dptr = (unsigned char *)&emsg;
1563 /* first we assume we will win the election and set
1564 recoverymaster to be ourself on the current node
1566 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1567 CTDB_CURRENT_NODE, pnn);
1569 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1572 rec->recmaster = pnn;
1574 /* send an election message to all active nodes */
1575 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1576 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
/*
 * tevent timer callback: we believe we are winning the election, so
 * broadcast a fresh election request, then drop our reference to the
 * one-shot timer that fired.
 */
1580 we think we are winning the election - send a broadcast election request
1582 static void election_send_request(struct tevent_context *ev,
1583 struct tevent_timer *te,
1584 struct timeval t, void *p)
1586 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1589 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1591 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
/* timer has fired; clear the handle so a new one can be scheduled */
1594 TALLOC_FREE(rec->send_election_te);
/*
 * SRVID handler: produce a talloc memory-usage dump of this daemon and
 * send it back to the requester identified by the ctdb_srvid_message
 * (pnn + srvid) carried in the request payload.
 */
1598 handler for memory dumps
1600 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1602 struct ctdb_recoverd *rec = talloc_get_type(
1603 private_data, struct ctdb_recoverd);
1604 struct ctdb_context *ctdb = rec->ctdb;
1605 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1608 struct ctdb_srvid_message *rd;
/* the payload must be exactly a return-address structure */
1610 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1611 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1612 talloc_free(tmp_ctx);
1615 rd = (struct ctdb_srvid_message *)data.dptr;
1617 dump = talloc_zero(tmp_ctx, TDB_DATA);
1619 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1620 talloc_free(tmp_ctx);
1623 ret = ctdb_dump_memory(ctdb, dump);
1625 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1626 talloc_free(tmp_ctx);
1630 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* reply to the pnn/srvid the requester embedded in the message */
1632 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1634 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1635 talloc_free(tmp_ctx);
1639 talloc_free(tmp_ctx);
/*
 * SRVID handler: re-read the nodes file into the local ctdb context.
 * The payload is ignored.
 */
1643 handler for reload_nodes
1645 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1648 struct ctdb_recoverd *rec = talloc_get_type(
1649 private_data, struct ctdb_recoverd);
1651 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1653 ctdb_load_nodes_file(rec->ctdb);
/*
 * SRVID handler: queue a node (pnn in the 4-byte payload) for forced IP
 * rebalancing in the next takeover run.  Only acted on when this node
 * is the recovery master.  The list is rebuilt with the new pnn
 * appended; duplicates are tolerated (see comment below).
 */
1657 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1660 struct ctdb_recoverd *rec = talloc_get_type(
1661 private_data, struct ctdb_recoverd);
1662 struct ctdb_context *ctdb = rec->ctdb;
/* only the recmaster performs takeover runs */
1667 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1671 if (data.dsize != sizeof(uint32_t)) {
1672 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1676 pnn = *(uint32_t *)&data.dptr[0];
1678 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1680 /* Copy any existing list of nodes. There's probably some
1681 * sort of realloc variant that will do this but we need to
1682 * make sure that freeing the old array also cancels the timer
1683 * event for the timeout... not sure if realloc will do that.
1685 len = (rec->force_rebalance_nodes != NULL) ?
1686 talloc_array_length(rec->force_rebalance_nodes) :
1689 /* This allows duplicates to be added but they don't cause
1690 * harm. A call to add a duplicate PNN arguably means that
1691 * the timeout should be reset, so this is the simplest
1694 t = talloc_zero_array(rec, uint32_t, len+1);
1695 CTDB_NO_MEMORY_VOID(ctdb, t);
/* copy the old entries; the new pnn goes in the extra slot */
1697 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1701 talloc_free(rec->force_rebalance_nodes);
1703 rec->force_rebalance_nodes = t;
/*
 * Shared helper for the "disable takeover runs" / "disable recoveries"
 * SRVID handlers: validate the ctdb_disable_message payload, disable the
 * given operation for the requested timeout, and reply with our PNN to
 * signal success (see srvid_request_reply).
 */
1708 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1710 struct ctdb_op_state *op_state)
1712 struct ctdb_disable_message *r;
1717 /* Validate input data */
1718 if (data.dsize != sizeof(struct ctdb_disable_message)) {
/* BUGFIX: the expected size printed here previously used
 * sizeof(struct ctdb_srvid_message), which does not match the
 * sizeof(struct ctdb_disable_message) actually checked above. */
1719 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1720 "expecting %lu\n", (long unsigned)data.dsize,
1721 (long unsigned)sizeof(struct ctdb_disable_message)));
1724 if (data.dptr == NULL) {
1725 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1729 r = (struct ctdb_disable_message *)data.dptr;
1730 timeout = r->timeout;
1732 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1737 /* Returning our PNN tells the caller that we succeeded */
1738 ret = ctdb_get_pnn(ctdb);
1740 result.dsize = sizeof(int32_t);
1741 result.dptr = (uint8_t *)&ret;
1742 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
/* SRVID handler: disable takeover runs for the timeout in the message. */
1745 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1748 struct ctdb_recoverd *rec = talloc_get_type(
1749 private_data, struct ctdb_recoverd);
1751 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1754 /* Backward compatibility for this SRVID */
/*
 * Legacy variant of disable_takeover_runs_handler: the payload is a bare
 * uint32_t timeout rather than a ctdb_disable_message, and no reply is
 * sent to the requester.
 */
1755 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1758 struct ctdb_recoverd *rec = talloc_get_type(
1759 private_data, struct ctdb_recoverd);
1762 if (data.dsize != sizeof(uint32_t)) {
1763 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1764 "expecting %lu\n", (long unsigned)data.dsize,
1765 (long unsigned)sizeof(uint32_t)));
1768 if (data.dptr == NULL) {
1769 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1773 timeout = *((uint32_t *)data.dptr);
1775 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
/* SRVID handler: disable recoveries for the timeout in the message. */
1778 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1781 struct ctdb_recoverd *rec = talloc_get_type(
1782 private_data, struct ctdb_recoverd);
1784 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
/*
 * SRVID handler: queue an IP-reallocation request (a return address to
 * reply to later) instead of acting immediately, to avoid re-entering
 * takeover_run() from inside another request.
 */
1788 handler for ip reallocate, just add it to the list of requests and
1789 handle this later in the monitor_cluster loop so we do not recurse
1790 with other requests to takeover_run()
1792 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1795 struct ctdb_srvid_message *request;
1796 struct ctdb_recoverd *rec = talloc_get_type(
1797 private_data, struct ctdb_recoverd);
1799 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1800 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1804 request = (struct ctdb_srvid_message *)data.dptr;
/* queued requests are answered by process_ipreallocate_requests() */
1806 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
/*
 * Drain the currently-queued IP-reallocation requests: detach the
 * pending list, run a takeover run, and reply to every queued requester
 * (our PNN on success) via srvid_requests_reply.
 */
1809 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1810 struct ctdb_recoverd *rec)
1814 struct srvid_requests *current;
1816 /* Only process requests that are currently pending. More
1817 * might come in while the takeover run is in progress and
1818 * they will need to be processed later since they might
1819 * be in response to these flag changes.
/* detach the queue so new arrivals accumulate for the next pass */
1821 current = rec->reallocate_requests;
1822 rec->reallocate_requests = NULL;
1824 if (do_takeover_run(rec, rec->nodemap)) {
1825 ret = ctdb_get_pnn(ctdb);
1830 result.dsize = sizeof(int32_t);
1831 result.dptr = (uint8_t *)&ret;
/* BUGFIX: "&current" had been mangled into the HTML entity
 * "&curren;" + "t" by an encoding round-trip; srvid_requests_reply
 * takes a struct srvid_requests **. */
1833 srvid_requests_reply(ctdb, &current, result);
/*
 * SRVID handler: assign enough banning credits to the pnn in the
 * payload to push it over the ban threshold.  Ignored unless this node
 * is the recovery master.
 */
1837 * handler for assigning banning credits
1839 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1841 struct ctdb_recoverd *rec = talloc_get_type(
1842 private_data, struct ctdb_recoverd);
1845 /* Ignore if we are not recmaster */
1846 if (rec->ctdb->pnn != rec->recmaster) {
1850 if (data.dsize != sizeof(uint32_t)) {
1851 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1856 ban_pnn = *(uint32_t *)data.dptr;
/* credit the node with nodemap->num culprit counts at once */
1858 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
/*
 * SRVID handler for election messages.  Restarts the election timeout,
 * then either contests the election (if our own data wins, schedule a
 * broadcast via election_send_request) or concedes: cancel any pending
 * broadcast, release the recovery lock and record the sender as the new
 * recovery master.
 */
1862 handler for recovery master elections
1864 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1866 struct ctdb_recoverd *rec = talloc_get_type(
1867 private_data, struct ctdb_recoverd);
1868 struct ctdb_context *ctdb = rec->ctdb;
1870 struct election_message *em = (struct election_message *)data.dptr;
1872 /* Ignore election packets from ourself */
1873 if (ctdb->pnn == em->pnn) {
1877 /* we got an election packet - update the timeout for the election */
1878 talloc_free(rec->election_timeout);
/* 0.5s fast path vs. the tunable timeout — the selecting condition
 * between the two timeouts is on a line elided from this listing */
1879 rec->election_timeout = tevent_add_timer(
1882 timeval_current_ofs(0, 500000) :
1883 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1884 ctdb_election_timeout, rec);
1886 /* someone called an election. check their election data
1887 and if we disagree and we would rather be the elected node,
1888 send a new election message to all other nodes
1890 if (ctdb_election_win(rec, em)) {
1891 if (!rec->send_election_te) {
1892 rec->send_election_te = tevent_add_timer(
1894 timeval_current_ofs(0, 500000),
1895 election_send_request, rec);
/* we lost: stop contesting and accept the sender as recmaster */
1901 TALLOC_FREE(rec->send_election_te);
1903 /* Release the recovery lock file */
1904 if (ctdb_recovery_have_lock(rec)) {
1905 ctdb_recovery_unlock(rec);
1908 /* ok, let that guy become recmaster then */
1909 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1910 CTDB_CURRENT_NODE, em->pnn);
1912 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1915 rec->recmaster = em->pnn;
/*
 * Start a recovery-master election: put the cluster into active
 * recovery mode (quiescing internode traffic), arm the election
 * timeout, broadcast our election request, then block in
 * ctdb_wait_election() until the election settles.
 */
1922 force the start of the election process
1924 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1925 struct ctdb_node_map_old *nodemap)
1928 struct ctdb_context *ctdb = rec->ctdb;
1930 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1932 /* set all nodes to recovery mode to stop all internode traffic */
1933 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1935 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1939 talloc_free(rec->election_timeout);
/* same dual-timeout pattern as election_handler; the selecting
 * condition is on a line elided from this listing */
1940 rec->election_timeout = tevent_add_timer(
1943 timeval_current_ofs(0, 500000) :
1944 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1945 ctdb_election_timeout, rec);
1947 ret = send_election_request(rec, pnn);
1949 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1953 /* wait for a few seconds to collect all responses */
1954 ctdb_wait_election(rec);
/*
 * SRVID handler for node flag-change notifications: validate the
 * ctdb_node_flag_change payload against the local node map, log the
 * transition and record the node's new flags.
 */
1960 handler for when a node changes its flags
1962 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1964 struct ctdb_recoverd *rec = talloc_get_type(
1965 private_data, struct ctdb_recoverd);
1966 struct ctdb_context *ctdb = rec->ctdb;
1968 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1969 struct ctdb_node_map_old *nodemap=NULL;
1970 TALLOC_CTX *tmp_ctx;
1973 if (data.dsize != sizeof(*c)) {
1974 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1978 tmp_ctx = talloc_new(ctdb);
1979 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1981 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1983 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1984 talloc_free(tmp_ctx);
/* locate the slot whose pnn matches the notification */
1989 for (i=0;i<nodemap->num;i++) {
1990 if (nodemap->nodes[i].pnn == c->pnn) break;
1993 if (i == nodemap->num) {
1994 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1995 talloc_free(tmp_ctx);
1999 if (c->old_flags != c->new_flags) {
2000 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* note: updates the freshly fetched (temporary) nodemap entry */
2003 nodemap->nodes[i].flags = c->new_flags;
2005 talloc_free(tmp_ctx);
/*
 * SRVID handler: fetch the authoritative node map from the recovery
 * master and push a MODIFY_FLAGS control for the changed node to all
 * connected nodes.
 */
2009 handler for when we need to push out flag changes to all other nodes
2011 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2014 struct ctdb_recoverd *rec = talloc_get_type(
2015 private_data, struct ctdb_recoverd);
2016 struct ctdb_context *ctdb = rec->ctdb;
2018 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2019 struct ctdb_node_map_old *nodemap=NULL;
2020 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2023 /* read the node flags from the recmaster */
2024 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2027 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2028 talloc_free(tmp_ctx);
/* bounds-check before indexing nodemap->nodes[c->pnn] downstream */
2031 if (c->pnn >= nodemap->num) {
2032 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2033 talloc_free(tmp_ctx);
2037 /* send the flags update to all connected nodes */
2038 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2040 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2041 nodes, 0, CONTROL_TIMEOUT(),
2045 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2047 talloc_free(tmp_ctx);
2051 talloc_free(tmp_ctx);
/*
 * Shared state for the async recmode poll in verify_recmode().
 * NOTE(review): elided listing — a count member (decremented by the
 * callback, see "rmdata->count" below) and the closing brace are not
 * visible here.
 */
2055 struct verify_recmode_normal_data {
/* aggregate verdict across all polled nodes */
2057 enum monitor_result status;
/*
 * Completion callback for one async getrecmode control: downgrade the
 * aggregate status to MONITOR_FAILED on transport failure, or to
 * MONITOR_RECOVERY_NEEDED if the node reports a non-NORMAL recmode.
 */
2060 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2062 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2065 /* one more node has responded with recmode data*/
2068 /* if we failed to get the recmode, then return an error and let
2069 the main loop try again.
2071 if (state->state != CTDB_CONTROL_DONE) {
/* don't overwrite a stronger verdict already recorded */
2072 if (rmdata->status == MONITOR_OK) {
2073 rmdata->status = MONITOR_FAILED;
2078 /* if we got a response, then the recmode will be stored in the
2081 if (state->status != CTDB_RECOVERY_NORMAL) {
2082 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2083 rmdata->status = MONITOR_RECOVERY_NEEDED;
2090 /* verify that all nodes are in normal recovery mode */
/*
 * Fan out an async getrecmode control to every active node, pump the
 * event loop until all replies arrive, and return the aggregate
 * monitor_result computed by verify_recmode_normal_callback.
 */
2091 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2093 struct verify_recmode_normal_data *rmdata;
2094 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2095 struct ctdb_client_control_state *state;
2096 enum monitor_result status;
2099 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2100 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2102 rmdata->status = MONITOR_OK;
2104 /* loop over all active nodes and send an async getrecmode call to
2106 for (j=0; j<nodemap->num; j++) {
/* skip banned/stopped/disconnected nodes */
2107 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2110 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2112 nodemap->nodes[j].pnn);
2113 if (state == NULL) {
2114 /* we failed to send the control, treat this as
2115 an error and try again next iteration
2117 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2118 talloc_free(mem_ctx);
2119 return MONITOR_FAILED;
2122 /* set up the callback functions */
2123 state->async.fn = verify_recmode_normal_callback;
2124 state->async.private_data = rmdata;
2126 /* one more control to wait for to complete */
2131 /* now wait for up to the maximum number of seconds allowed
2132 or until all nodes we expect a response from has replied
2134 while (rmdata->count > 0) {
2135 tevent_loop_once(ctdb->ev);
/* copy out the verdict before freeing the context that owns rmdata */
2138 status = rmdata->status;
2139 talloc_free(mem_ctx);
/*
 * Shared state for the async recmaster poll in verify_recmaster().
 * NOTE(review): elided listing — count and pnn members (referenced by
 * the callback below) and the closing brace are not visible here.
 */
2144 struct verify_recmaster_data {
2145 struct ctdb_recoverd *rec;
/* aggregate verdict across all polled nodes */
2148 enum monitor_result status;
/*
 * Completion callback for one async getrecmaster control: downgrade the
 * aggregate status to MONITOR_FAILED on transport failure; if the node
 * names a different recmaster than we expect, blame it and demand a new
 * election (MONITOR_ELECTION_NEEDED).
 */
2151 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2153 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2156 /* one more node has responded with recmaster data*/
2159 /* if we failed to get the recmaster, then return an error and let
2160 the main loop try again.
2162 if (state->state != CTDB_CONTROL_DONE) {
/* don't overwrite a stronger verdict already recorded */
2163 if (rmdata->status == MONITOR_OK) {
2164 rmdata->status = MONITOR_FAILED;
2169 /* if we got a response, then the recmaster will be stored in the
2172 if (state->status != rmdata->pnn) {
2173 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2174 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2175 rmdata->status = MONITOR_ELECTION_NEEDED;
2182 /* verify that all nodes agree that we are the recmaster */
/*
 * Fan out an async getrecmaster control to every active node other than
 * the current recmaster, pump the event loop until all replies arrive,
 * and return the aggregate verdict from verify_recmaster_callback.
 */
2183 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2185 struct ctdb_context *ctdb = rec->ctdb;
2186 struct verify_recmaster_data *rmdata;
2187 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2188 struct ctdb_client_control_state *state;
2189 enum monitor_result status;
2192 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2193 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2197 rmdata->status = MONITOR_OK;
2199 /* loop over all active nodes and send an async getrecmaster call to
2201 for (j=0; j<nodemap->num; j++) {
/* no need to ask the recmaster about itself */
2202 if (nodemap->nodes[j].pnn == rec->recmaster) {
2205 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2208 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2210 nodemap->nodes[j].pnn);
2211 if (state == NULL) {
2212 /* we failed to send the control, treat this as
2213 an error and try again next iteration
2215 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2216 talloc_free(mem_ctx);
2217 return MONITOR_FAILED;
2220 /* set up the callback functions */
2221 state->async.fn = verify_recmaster_callback;
2222 state->async.private_data = rmdata;
2224 /* one more control to wait for to complete */
2229 /* now wait for up to the maximum number of seconds allowed
2230 or until all nodes we expect a response from has replied
2232 while (rmdata->count > 0) {
2233 tevent_loop_once(ctdb->ev);
/* copy out the verdict before freeing the context that owns rmdata */
2236 status = rmdata->status;
2237 talloc_free(mem_ctx);
/*
 * Compare the local node's current interface list against the snapshot
 * cached in rec->ifaces.  Returns true on first call, on fetch failure
 * (conservatively), or when the interface count, a name or a link state
 * differs; the fresh list then replaces the cached snapshot.
 */
2241 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2242 struct ctdb_recoverd *rec)
2244 struct ctdb_iface_list_old *ifaces = NULL;
2245 TALLOC_CTX *mem_ctx;
2248 mem_ctx = talloc_new(NULL);
2250 /* Read the interfaces from the local node */
2251 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2252 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2253 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2254 /* We could return an error. However, this will be
2255 * rare so we'll decide that the interfaces have
2256 * actually changed, just in case.
2258 talloc_free(mem_ctx);
2263 /* We haven't been here before so things have changed */
2264 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2266 } else if (rec->ifaces->num != ifaces->num) {
2267 /* Number of interfaces has changed */
2268 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2269 rec->ifaces->num, ifaces->num));
2272 /* See if interface names or link states have changed */
/* same count, so compare slot-by-slot against the cached snapshot */
2274 for (i = 0; i < rec->ifaces->num; i++) {
2275 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2276 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2278 ("Interface in slot %d changed: %s => %s\n",
2279 i, iface->name, ifaces->ifaces[i].name));
2283 if (iface->link_state != ifaces->ifaces[i].link_state) {
2285 ("Interface %s changed state: %d => %d\n",
2286 iface->name, iface->link_state,
2287 ifaces->ifaces[i].link_state));
/* adopt the fresh list as the new cached snapshot */
2294 talloc_free(rec->ifaces);
2295 rec->ifaces = talloc_steal(rec, ifaces);
2297 talloc_free(mem_ctx);
2301 /* Check that the local allocation of public IP addresses is correct
2302 * and do some house-keeping */
/*
 * Housekeeping plus consistency check of this node's public IPs:
 * non-recmaster nodes just drop stale reallocate/rebalance state;
 * otherwise detect interface changes, unhosted-but-hostable IPs, and
 * (when do_checkpublicip) mismatches between assigned IPs and what is
 * actually on the interfaces — any of which triggers a takeover-run
 * request to the recmaster.
 */
2303 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2304 struct ctdb_recoverd *rec,
2306 struct ctdb_node_map_old *nodemap)
2308 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2310 bool need_takeover_run = false;
2311 struct ctdb_public_ip_list_old *ips = NULL;
2313 /* If we are not the recmaster then do some housekeeping */
2314 if (rec->recmaster != pnn) {
2315 /* Ignore any IP reallocate requests - only recmaster
2318 TALLOC_FREE(rec->reallocate_requests);
2319 /* Clear any nodes that should be force rebalanced in
2320 * the next takeover run. If the recovery master role
2321 * has moved then we don't want to process these some
2322 * time in the future.
2324 TALLOC_FREE(rec->force_rebalance_nodes);
2327 /* Return early if disabled... */
2328 if (ctdb->tunable.disable_ip_failover != 0 ||
2329 ctdb_op_is_disabled(rec->takeover_run)) {
2333 if (interfaces_have_changed(ctdb, rec)) {
2334 need_takeover_run = true;
2337 /* If there are unhosted IPs but this node can host them then
2338 * trigger an IP reallocation */
2340 /* Read *available* IPs from local node */
2341 ret = ctdb_ctrl_get_public_ips_flags(
2342 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2343 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2345 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2346 talloc_free(mem_ctx);
/* pnn == -1 marks an unhosted IP; flags == 0 means this node is
 * fully healthy and could take it */
2350 for (j=0; j<ips->num; j++) {
2351 if (ips->ips[j].pnn == -1 &&
2352 nodemap->nodes[pnn].flags == 0) {
2353 DEBUG(DEBUG_WARNING,
2354 ("Unassigned IP %s can be served by this node\n",
2355 ctdb_addr_to_str(&ips->ips[j].addr)));
2356 need_takeover_run = true;
2362 if (!ctdb->do_checkpublicip) {
2366 /* Validate the IP addresses that this node has on network
2367 * interfaces. If there is an inconsistency between reality
2368 * and the state expected by CTDB then try to fix it by
2369 * triggering an IP reallocation or releasing extraneous IP
2372 /* Read *known* IPs from local node */
2373 ret = ctdb_ctrl_get_public_ips_flags(
2374 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2376 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2377 talloc_free(mem_ctx);
/* IP assigned to us but missing from an interface, or present on an
 * interface while assigned elsewhere — either way, reallocate */
2381 for (j=0; j<ips->num; j++) {
2382 if (ips->ips[j].pnn == pnn) {
2383 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2385 ("Assigned IP %s not on an interface\n",
2386 ctdb_addr_to_str(&ips->ips[j].addr)));
2387 need_takeover_run = true;
2390 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2392 ("IP %s incorrectly on an interface\n",
2393 ctdb_addr_to_str(&ips->ips[j].addr)));
2394 need_takeover_run = true;
2400 if (need_takeover_run) {
2401 struct ctdb_srvid_message rd;
2404 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
/* ask the recmaster to run a takeover; rd carries our reply address */
2409 data.dptr = (uint8_t *)&rd;
2410 data.dsize = sizeof(rd);
2412 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2415 ("Failed to send takeover run request\n"));
2418 talloc_free(mem_ctx);
/*
 * Per-node completion callback for the async GET_NODEMAP broadcast in
 * get_remote_nodemaps(): validate the replying pnn and steal the
 * returned nodemap into the remote_nodemaps array (indexed by pnn).
 */
2423 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2425 struct ctdb_node_map_old **remote_nodemaps = callback_data;
2427 if (node_pnn >= ctdb->num_nodes) {
2428 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
/* take talloc ownership so the data outlives the control state */
2432 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Broadcast a GET_NODEMAP control to all active nodes, collecting each
 * reply into remote_nodemaps[] via async_getnodemap_callback.  Returns
 * non-zero (on an elided path) if any node fails to answer.
 */
2436 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2437 struct ctdb_node_map_old *nodemap,
2438 struct ctdb_node_map_old **remote_nodemaps)
2442 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2443 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2445 CONTROL_TIMEOUT(), false, tdb_null,
2446 async_getnodemap_callback,
2448 remote_nodemaps) != 0) {
2449 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/*
 * Sanity-check the current recovery master from this node's viewpoint.
 * Forces an election (and, per the elided return statements, reports
 * the master as invalid) when: no master is known yet, the master lacks
 * CTDB_CAP_RECMASTER while we have it, the master's pnn is out of range
 * or the node is disconnected/deleted, or the master itself reports it
 * is inactive.  Only a fetch failure of the master's nodemap is treated
 * as a plain error without an election.
 */
2457 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2458 TALLOC_CTX *mem_ctx)
2460 struct ctdb_context *ctdb = rec->ctdb;
2461 uint32_t pnn = ctdb_get_pnn(ctdb);
2462 struct ctdb_node_map_old *nodemap = rec->nodemap;
2463 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2466 /* When recovery daemon is started, recmaster is set to
2467 * "unknown" so it knows to start an election.
2469 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2471 ("Initial recovery master set - forcing election\n"));
2472 force_election(rec, pnn, nodemap);
2477 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2478 * but we have, then force an election and try to become the new
2481 if (!ctdb_node_has_capabilities(rec->caps,
2483 CTDB_CAP_RECMASTER) &&
2484 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2485 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2487 (" Current recmaster node %u does not have CAP_RECMASTER,"
2488 " but we (node %u) have - force an election\n",
2489 rec->recmaster, pnn));
2490 force_election(rec, pnn, nodemap);
2494 /* Verify that the master node has not been deleted. This
2495 * should not happen because a node should always be shutdown
2496 * before being deleted, causing a new master to be elected
2497 * before now. However, if something strange has happened
2498 * then checking here will ensure we don't index beyond the
2499 * end of the nodemap array. */
2500 if (rec->recmaster >= nodemap->num) {
2502 ("Recmaster node %u has been deleted. Force election\n",
2504 force_election(rec, pnn, nodemap);
2508 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2509 if (nodemap->nodes[rec->recmaster].flags &
2510 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2512 ("Recmaster node %u is disconnected/deleted. Force election\n",
2514 force_election(rec, pnn, nodemap);
2518 /* get nodemap from the recovery master to check if it is inactive */
2519 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2520 mem_ctx, &recmaster_nodemap);
2524 " Unable to get nodemap from recovery master %u\n",
2526 /* No election, just error */
/* master inactive by its own account, while we are active */
2531 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2532 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2534 ("Recmaster node %u is inactive. Force election\n",
2537 * update our nodemap to carry the recmaster's notion of
2538 * its own flags, so that we don't keep freezing the
2539 * inactive recmaster node...
2541 nodemap->nodes[rec->recmaster].flags =
2542 recmaster_nodemap->nodes[rec->recmaster].flags;
2543 force_election(rec, pnn, nodemap);
2550 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2551 TALLOC_CTX *mem_ctx)
2554 struct ctdb_node_map_old *nodemap=NULL;
2555 struct ctdb_node_map_old **remote_nodemaps=NULL;
2556 struct ctdb_vnn_map *vnnmap=NULL;
2557 struct ctdb_vnn_map *remote_vnnmap=NULL;
2558 uint32_t num_lmasters;
2559 int32_t debug_level;
2564 /* verify that the main daemon is still running */
2565 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2566 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2570 /* ping the local daemon to tell it we are alive */
2571 ctdb_ctrl_recd_ping(ctdb);
2573 if (rec->election_timeout) {
2574 /* an election is in progress */
2578 /* read the debug level from the parent and update locally */
2579 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2581 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2584 DEBUGLEVEL = debug_level;
2586 /* get relevant tunables */
2587 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2589 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2594 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2595 CTDB_CURRENT_NODE, &ctdb->runstate);
2597 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2601 pnn = ctdb_get_pnn(ctdb);
2604 TALLOC_FREE(rec->nodemap);
2605 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2607 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2610 nodemap = rec->nodemap;
2612 /* remember our own node flags */
2613 rec->node_flags = nodemap->nodes[pnn].flags;
2615 ban_misbehaving_nodes(rec, &self_ban);
2617 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2621 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2622 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2624 D_ERR("Failed to read recmode from local node\n");
2628 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2629 also frozen and that the recmode is set to active.
2631 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2632 /* If this node has become inactive then we want to
2633 * reduce the chances of it taking over the recovery
2634 * master role when it becomes active again. This
2635 * helps to stabilise the recovery master role so that
2636 * it stays on the most stable node.
2638 rec->priority_time = timeval_current();
2640 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2641 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2643 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2645 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2650 if (! rec->frozen_on_inactive) {
2651 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2655 (__location__ " Failed to freeze node "
2656 "in STOPPED or BANNED state\n"));
2660 rec->frozen_on_inactive = true;
2663 /* If this node is stopped or banned then it is not the recovery
2664 * master, so don't do anything. This prevents stopped or banned
2665 * node from starting election and sending unnecessary controls.
2670 rec->frozen_on_inactive = false;
2672 /* Retrieve capabilities from all connected nodes */
2673 ret = update_capabilities(rec, nodemap);
2675 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2679 if (! validate_recovery_master(rec, mem_ctx)) {
2683 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2684 /* Check if an IP takeover run is needed and trigger one if
2686 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2689 /* if we are not the recmaster then we do not need to check
2690 if recovery is needed
2692 if (pnn != rec->recmaster) {
2697 /* ensure our local copies of flags are right */
2698 ret = update_local_flags(rec, nodemap);
2700 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2704 if (ctdb->num_nodes != nodemap->num) {
2705 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2706 ctdb_load_nodes_file(ctdb);
2710 /* verify that all active nodes agree that we are the recmaster */
2711 switch (verify_recmaster(rec, nodemap, pnn)) {
2712 case MONITOR_RECOVERY_NEEDED:
2713 /* can not happen */
2715 case MONITOR_ELECTION_NEEDED:
2716 force_election(rec, pnn, nodemap);
2720 case MONITOR_FAILED:
2725 /* get the vnnmap */
2726 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2728 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2732 if (rec->need_recovery) {
2733 /* a previous recovery didn't finish */
2734 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2738 /* verify that all active nodes are in normal mode
2739 and not in recovery mode
2741 switch (verify_recmode(ctdb, nodemap)) {
2742 case MONITOR_RECOVERY_NEEDED:
2743 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2745 case MONITOR_FAILED:
2747 case MONITOR_ELECTION_NEEDED:
2748 /* can not happen */
2754 if (ctdb->recovery_lock != NULL) {
2755 /* We must already hold the recovery lock */
2756 if (!ctdb_recovery_have_lock(rec)) {
2757 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2758 ctdb_set_culprit(rec, ctdb->pnn);
2759 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2765 /* If recoveries are disabled then there is no use doing any
2766 * nodemap or flags checks. Recoveries might be disabled due
2767 * to "reloadnodes", so doing these checks might cause an
2768 * unnecessary recovery. */
2769 if (ctdb_op_is_disabled(rec->recovery)) {
2770 goto takeover_run_checks;
2773 /* get the nodemap for all active remote nodes
2775 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2776 if (remote_nodemaps == NULL) {
2777 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2780 for(i=0; i<nodemap->num; i++) {
2781 remote_nodemaps[i] = NULL;
2783 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2784 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2788 /* verify that all other nodes have the same nodemap as we have
2790 for (j=0; j<nodemap->num; j++) {
2791 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2795 if (remote_nodemaps[j] == NULL) {
2796 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2797 ctdb_set_culprit(rec, j);
2802 /* if the nodes disagree on how many nodes there are
2803 then this is a good reason to try recovery
2805 if (remote_nodemaps[j]->num != nodemap->num) {
2806 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2807 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2808 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2809 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2813 /* if the nodes disagree on which nodes exist and are
2814 active, then that is also a good reason to do recovery
2816 for (i=0;i<nodemap->num;i++) {
2817 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2818 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2819 nodemap->nodes[j].pnn, i,
2820 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2821 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2822 do_recovery(rec, mem_ctx, pnn, nodemap,
2830 * Update node flags obtained from each active node. This ensure we have
2831 * up-to-date information for all the nodes.
2833 for (j=0; j<nodemap->num; j++) {
2834 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2837 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2840 for (j=0; j<nodemap->num; j++) {
2841 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2845 /* verify the flags are consistent
2847 for (i=0; i<nodemap->num; i++) {
2848 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2852 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2853 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2854 nodemap->nodes[j].pnn,
2855 nodemap->nodes[i].pnn,
2856 remote_nodemaps[j]->nodes[i].flags,
2857 nodemap->nodes[i].flags));
2859 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2860 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2861 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2862 do_recovery(rec, mem_ctx, pnn, nodemap,
2866 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2867 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2868 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2869 do_recovery(rec, mem_ctx, pnn, nodemap,
2878 /* count how many active nodes there are */
2880 for (i=0; i<nodemap->num; i++) {
2881 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2882 if (ctdb_node_has_capabilities(rec->caps,
2883 ctdb->nodes[i]->pnn,
2884 CTDB_CAP_LMASTER)) {
2891 /* There must be the same number of lmasters in the vnn map as
2892 * there are active nodes with the lmaster capability... or
2895 if (vnnmap->size != num_lmasters) {
2896 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2897 vnnmap->size, num_lmasters));
2898 ctdb_set_culprit(rec, ctdb->pnn);
2899 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2903 /* verify that all active nodes in the nodemap also exist in
2906 for (j=0; j<nodemap->num; j++) {
2907 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2910 if (nodemap->nodes[j].pnn == pnn) {
2914 for (i=0; i<vnnmap->size; i++) {
2915 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2919 if (i == vnnmap->size) {
2920 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2921 nodemap->nodes[j].pnn));
2922 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2923 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2929 /* verify that all other nodes have the same vnnmap
2930 and are from the same generation
2932 for (j=0; j<nodemap->num; j++) {
2933 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2936 if (nodemap->nodes[j].pnn == pnn) {
2940 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2941 mem_ctx, &remote_vnnmap);
2943 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2944 nodemap->nodes[j].pnn));
2948 /* verify the vnnmap generation is the same */
2949 if (vnnmap->generation != remote_vnnmap->generation) {
2950 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2951 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2952 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2953 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2957 /* verify the vnnmap size is the same */
2958 if (vnnmap->size != remote_vnnmap->size) {
2959 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2960 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2961 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2962 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2966 /* verify the vnnmap is the same */
2967 for (i=0;i<vnnmap->size;i++) {
2968 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2969 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2970 nodemap->nodes[j].pnn));
2971 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2972 do_recovery(rec, mem_ctx, pnn, nodemap,
2979 /* FIXME: Add remote public IP checking to ensure that nodes
2980 * have the IP addresses that are allocated to them. */
2982 takeover_run_checks:
2984 /* If there are IP takeover runs requested or the previous one
2985 * failed then perform one and notify the waiters */
2986 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2987 (rec->reallocate_requests || rec->need_takeover_run)) {
2988 process_ipreallocate_requests(ctdb, rec);
2992 static void recd_sig_term_handler(struct tevent_context *ev,
2993 struct tevent_signal *se, int signum,
2994 int count, void *dont_care,
2997 struct ctdb_recoverd *rec = talloc_get_type_abort(
2998 private_data, struct ctdb_recoverd);
3000 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
3001 ctdb_recovery_unlock(rec);
/*
 * The recovery daemon's main entry point after startup: allocate the
 * ctdb_recoverd state, install the SIGTERM handler, register all SRVID
 * message handlers, then repeatedly run main_loop() — throttled so it
 * executes at most once per recover_interval seconds.
 */
3007 the main monitoring loop
3009 static void monitor_cluster(struct ctdb_context *ctdb)
3011 struct tevent_signal *se;
3012 struct ctdb_recoverd *rec;
3014 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
/* Zero-initialised recovery-daemon state, parented to ctdb for lifetime */
3016 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3017 CTDB_NO_MEMORY_FATAL(ctdb, rec);
/* No recmaster known yet and no recovery lock held at startup */
3020 rec->recmaster = CTDB_UNKNOWN_PNN;
3021 rec->recovery_lock_handle = NULL;
/* ctdb_op counters used to temporarily disable takeover runs/recoveries */
3023 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3024 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3026 rec->recovery = ctdb_op_init(rec, "recoveries");
3027 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
/* priority_time influences recmaster election (older => preferred) */
3029 rec->priority_time = timeval_current();
3030 rec->frozen_on_inactive = false;
/* Install SIGTERM handler so the recovery lock is released on shutdown */
3032 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3033 recd_sig_term_handler, rec);
3035 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3039 /* register a message port for sending memory dumps */
3040 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3042 /* when a node is assigned banning credits */
3043 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3044 banning_handler, rec);
3046 /* register a message port for recovery elections */
3047 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3049 /* when nodes are disabled/enabled */
3050 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3052 /* when we are asked to push out a flag change */
3053 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3055 /* register a message port for vacuum fetch */
3056 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3058 /* register a message port for reloadnodes */
3059 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3061 /* register a message port for performing a takeover run */
3062 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3064 /* register a message port for disabling the ip check for a short while */
3065 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3067 /* register a message port for forcing a rebalance of a node next
3069 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3071 /* Register a message port for disabling takeover runs */
3072 ctdb_client_set_message_handler(ctdb,
3073 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3074 disable_takeover_runs_handler, rec);
3076 /* Register a message port for disabling recoveries */
3077 ctdb_client_set_message_handler(ctdb,
3078 CTDB_SRVID_DISABLE_RECOVERIES,
3079 disable_recoveries_handler, rec);
3081 /* register a message port for detaching database */
3082 ctdb_client_set_message_handler(ctdb,
3083 CTDB_SRVID_DETACH_DATABASE,
3084 detach_database_handler, rec);
/* Main loop body (runs forever): fresh temp talloc context per iteration
 * so everything main_loop() allocates is freed afterwards. */
3087 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3088 struct timeval start;
3092 DEBUG(DEBUG_CRIT,(__location__
3093 " Failed to create temp context\n"));
3097 start = timeval_current();
3098 main_loop(ctdb, rec, mem_ctx);
3099 talloc_free(mem_ctx);
3101 /* we only check for recovery once every second */
/* Sleep away the remainder of recover_interval if main_loop was quick */
3102 elapsed = timeval_elapsed(&start);
3103 if (elapsed < ctdb->tunable.recover_interval) {
3104 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3111 event handler for when the main ctdbd dies
3113 static void ctdb_recoverd_parent(struct tevent_context *ev,
3114 struct tevent_fd *fde,
3115 uint16_t flags, void *private_data)
3117 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3122 called regularly to verify that the recovery daemon is still running
3124 static void ctdb_check_recd(struct tevent_context *ev,
3125 struct tevent_timer *te,
3126 struct timeval yt, void *p)
3128 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3130 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3131 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3133 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3134 ctdb_restart_recd, ctdb);
3139 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3140 timeval_current_ofs(30, 0),
3141 ctdb_check_recd, ctdb);
3144 static void recd_sig_child_handler(struct tevent_context *ev,
3145 struct tevent_signal *se, int signum,
3146 int count, void *dont_care,
3149 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3154 pid = waitpid(-1, &status, WNOHANG);
3156 if (errno != ECHILD) {
3157 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3162 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
/*
 * Fork the recovery daemon as a child of the main ctdb daemon.
 *
 * A pipe is created before forking: the parent keeps the write end and
 * the child watches the read end, so the child notices (and exits) when
 * the parent dies.  The parent also arms a 30-second timer
 * (ctdb_check_recd) to restart the child if it dies.
 *
 * Returns 0 in the parent on success, -1 on failure.  The child never
 * returns normally - it runs monitor_cluster() forever.
 */
3168 startup the recovery daemon as a child of the main ctdb daemon
3170 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3173 struct tevent_signal *se;
3174 struct tevent_fd *fde;
/* Parent-death detection pipe; must exist before the fork */
3177 if (pipe(fd) != 0) {
3181 ctdb->recoverd_pid = ctdb_fork(ctdb);
3182 if (ctdb->recoverd_pid == -1) {
/* ---- parent path: remember the child and monitor it ---- */
3186 if (ctdb->recoverd_pid != 0) {
/* Replace any previous monitoring context from an earlier child */
3187 talloc_free(ctdb->recd_ctx);
3188 ctdb->recd_ctx = talloc_new(ctdb);
3189 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
/* Periodically verify the recovery daemon is still running */
3192 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3193 timeval_current_ofs(30, 0),
3194 ctdb_check_recd, ctdb);
/* ---- child path: become the recovery daemon ---- */
/* Re-seed the PRNG so parent and child do not share a sequence */
3200 srandom(getpid() ^ time(NULL));
3202 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
/* NOTE(review): comm name "ctdb_recovered" looks like a typo for
 * "ctdb_recoverd" - confirm before changing, tooling may match on it */
3207 prctl_set_comment("ctdb_recovered");
/* The child talks to the main daemon as a client from here on */
3208 if (switch_from_server_to_client(ctdb) != 0) {
3209 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3213 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* Exit when the read end signals that the parent has died */
3215 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3216 ctdb_recoverd_parent, &fd[0]);
3217 tevent_fd_set_auto_close(fde);
3219 /* set up a handler to pick up sigchld */
3220 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3221 recd_sig_child_handler, ctdb);
3223 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* Runs forever; reaching the line below means something went wrong */
3227 monitor_cluster(ctdb);
3229 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3234 shutdown the recovery daemon
3236 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3238 if (ctdb->recoverd_pid == 0) {
3242 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3243 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3245 TALLOC_FREE(ctdb->recd_ctx);
3246 TALLOC_FREE(ctdb->recd_ping_count);
3249 static void ctdb_restart_recd(struct tevent_context *ev,
3250 struct tevent_timer *te,
3251 struct timeval t, void *private_data)
3253 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3255 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3256 ctdb_stop_recoverd(ctdb);
3257 ctdb_start_recoverd(ctdb);