4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
49 /* List of SRVID requests that need to be processed */
/* One queued SRVID request; links into a doubly-linked list (DLIST). */
51 struct srvid_list *next, *prev;
/* The queued message; talloc-owned by this list node (see srvid_request_add). */
52 struct ctdb_srvid_message *request;
/* Head container for the queue of pending SRVID requests.  Freeing this
 * container frees the whole queue (list nodes are talloc children). */
55 struct srvid_requests {
56 struct srvid_list *requests;
/*
 * Send a result message back to the originator of an SRVID request.
 * A request with srvid == 0 means the sender does not want a reply.
 */
59 static void srvid_request_reply(struct ctdb_context *ctdb,
60 struct ctdb_srvid_message *request,
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request->srvid == 0) {
/* Reply is best-effort: success and failure are both only logged. */
69 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
71 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
75 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request->pnn,
77 (unsigned long long)request->srvid));
/*
 * Send the same result to every queued SRVID request, then free the
 * whole queue.  *requests is set to NULL via TALLOC_FREE.
 */
83 static void srvid_requests_reply(struct ctdb_context *ctdb,
84 struct srvid_requests **requests,
89 if (*requests == NULL) {
93 for (r = (*requests)->requests; r != NULL; r = r->next) {
94 srvid_request_reply(ctdb, r->request, result);
97 /* Free the list structure... */
/* List nodes are talloc children of *requests, so this frees them too. */
98 TALLOC_FREE(*requests);
/*
 * Queue an SRVID request for later processing.  Allocates the queue
 * container on first use and steals ownership of the request message.
 * On allocation failure the request is answered immediately with a
 * failure result instead of being queued.
 */
101 static void srvid_request_add(struct ctdb_context *ctdb,
102 struct srvid_requests **requests,
103 struct ctdb_srvid_message *request)
105 struct srvid_list *t;
109 if (*requests == NULL) {
110 *requests = talloc_zero(ctdb, struct srvid_requests);
111 if (*requests == NULL) {
116 t = talloc_zero(*requests, struct srvid_list);
118 /* If *requests was just allocated above then free it */
119 if ((*requests)->requests == NULL) {
120 TALLOC_FREE(*requests);
/* Take ownership of the request so it lives as long as the queue entry. */
125 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
126 DLIST_ADD((*requests)->requests, t);
131 /* Failed to add the request to the list. Send a fail. */
132 DEBUG(DEBUG_ERR, (__location__
133 " Out of memory, failed to queue SRVID request\n"));
135 result.dsize = sizeof(ret);
136 result.dptr = (uint8_t *)&ret;
137 srvid_request_reply(ctdb, request, result);
140 /* An abstraction to allow an operation (takeover runs, recoveries,
141 * ...) to be disabled for a given timeout */
142 struct ctdb_op_state {
/* Non-NULL while the operation is disabled; fires to re-enable it. */
143 struct tevent_timer *timer;
/*
 * Allocate and initialise an operation-state tracker under mem_ctx.
 * The name is used in log messages (e.g. "takeover runs", "recoveries").
 */
148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
150 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
153 state->in_progress = false;
160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
162 return state->timer != NULL;
/*
 * Try to start the operation.  Refuses (and logs) while the operation
 * is disabled; otherwise marks it in progress.
 */
165 static bool ctdb_op_begin(struct ctdb_op_state *state)
167 if (ctdb_op_is_disabled(state)) {
169 ("Unable to begin - %s are disabled\n", state->name));
173 state->in_progress = true;
177 static bool ctdb_op_end(struct ctdb_op_state *state)
179 return state->in_progress = false;
/* True while the operation has begun (ctdb_op_begin) and not yet ended. */
182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
184 return state->in_progress;
/* Re-enable the operation by destroying its disable timer (if any). */
187 static void ctdb_op_enable(struct ctdb_op_state *state)
189 TALLOC_FREE(state->timer);
/*
 * tevent timer callback: the disable period has expired, so re-enable
 * the operation.
 */
192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
193 struct tevent_timer *te,
194 struct timeval yt, void *p)
196 struct ctdb_op_state *state =
197 talloc_get_type(p, struct ctdb_op_state);
199 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
200 ctdb_op_enable(state);
/*
 * Disable the operation for "timeout" seconds.  A timeout that takes
 * the early branch re-enables instead (presumably timeout == 0 - the
 * condition line is elided in this listing; confirm against upstream).
 * Fails if the operation is currently in progress or if the timer
 * cannot be created.
 */
203 static int ctdb_op_disable(struct ctdb_op_state *state,
204 struct tevent_context *ev,
208 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
209 ctdb_op_enable(state);
213 if (state->in_progress) {
215 ("Unable to disable %s - in progress\n", state->name));
219 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
220 state->name, timeout));
222 /* Clear any old timers */
223 talloc_free(state->timer);
225 /* Arrange for the timeout to occur */
/* Timer is allocated under state, so it is torn down with the state. */
226 state->timer = tevent_add_timer(ev, state,
227 timeval_current_ofs(timeout, 0),
228 ctdb_op_timeout_handler, state);
229 if (state->timer == NULL) {
230 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
/* Per-node banning-credit accumulator used to decide when to ban a node. */
237 struct ctdb_banning_state {
/* When this node last misbehaved; old credits are forgiven after
 * recovery_grace_period (see ctdb_set_culprit_count). */
239 struct timeval last_reported_time;
243 private state of recovery daemon
245 struct ctdb_recoverd {
246 struct ctdb_context *ctdb;
248 uint32_t last_culprit_node;
249 struct ctdb_node_map_old *nodemap;
/* Startup time; used as an election tie-breaker (see election_message). */
250 struct timeval priority_time;
251 bool need_takeover_run;
254 struct tevent_timer *send_election_te;
/* Non-NULL while an election is running (see ctdb_wait_election). */
255 struct tevent_timer *election_timeout;
/* Queued CTDB_SRVID reallocate requests answered after the next run. */
256 struct srvid_requests *reallocate_requests;
/* Disable/in-progress trackers for takeover runs and recoveries. */
257 struct ctdb_op_state *takeover_run;
258 struct ctdb_op_state *recovery;
259 struct ctdb_iface_list_old *ifaces;
260 uint32_t *force_rebalance_nodes;
261 struct ctdb_node_capabilities *caps;
262 bool frozen_on_inactive;
/* Non-NULL while this daemon holds the cluster recovery lock. */
263 struct ctdb_cluster_mutex_handle *recovery_lock_handle;
/* Deadline helpers; both expect a local variable "ctdb" in scope. */
266 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
267 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration; definition is outside this listing. */
269 static void ctdb_restart_recd(struct tevent_context *ev,
270 struct tevent_timer *te, struct timeval t,
274 ban a node for a period of time
/*
 * Ask the local daemon to ban the given node for ban_time seconds.
 * Invalid PNNs are rejected; a failed control is only logged.
 */
276 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
279 struct ctdb_context *ctdb = rec->ctdb;
280 struct ctdb_ban_state bantime;
282 if (!ctdb_validate_pnn(ctdb, pnn)) {
283 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
287 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
290 bantime.time = ban_time;
292 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
294 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Outcome of a monitoring pass over the cluster. */
300 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
304 remember the trouble maker
/*
 * Add banning credits to the given node.  Credits accumulated longer
 * ago than recovery_grace_period are forgiven first.  Does nothing if
 * this node itself is inactive (banned/stopped).
 */
306 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
308 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
309 struct ctdb_banning_state *ban_state;
/* NOTE(review): bound check uses ">" but culprit later indexes
 * ctdb->nodes[culprit]; looks like it should be ">=" - confirm the
 * valid PNN range against upstream before changing. */
311 if (culprit > ctdb->num_nodes) {
312 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
316 /* If we are banned or stopped, do not set other nodes as culprits */
317 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
318 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
/* Lazily allocate the per-node ban state, owned by the node object. */
322 if (ctdb->nodes[culprit]->ban_state == NULL) {
323 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
324 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
328 ban_state = ctdb->nodes[culprit]->ban_state;
329 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
330 /* this was the first time in a long while this node
331 misbehaved so we will forgive any old transgressions.
333 ban_state->count = 0;
336 ban_state->count += count;
337 ban_state->last_reported_time = timeval_current();
338 rec->last_culprit_node = culprit;
342 remember the trouble maker
/* Convenience wrapper: charge the node exactly one banning credit. */
344 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
346 ctdb_set_culprit_count(rec, culprit, 1);
350 Retrieve capabilities from all connected nodes
/*
 * Fetch per-node capabilities cluster-wide, refresh this node's own
 * capability word, and cache the result in rec->caps (replacing any
 * previous cache).  Returns non-zero on failure (elided return lines).
 */
352 static int update_capabilities(struct ctdb_recoverd *rec,
353 struct ctdb_node_map_old *nodemap)
357 struct ctdb_node_capabilities *caps;
358 struct ctdb_context *ctdb = rec->ctdb;
360 tmp_ctx = talloc_new(rec);
361 CTDB_NO_MEMORY(ctdb, tmp_ctx);
363 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
364 CONTROL_TIMEOUT(), nodemap);
368 (__location__ " Failed to get node capabilities\n"));
369 talloc_free(tmp_ctx);
373 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
377 " Capabilities don't include current node.\n"));
378 talloc_free(tmp_ctx);
381 ctdb->capabilities = *capp;
/* Replace the old cache and take ownership of the new one. */
383 TALLOC_FREE(rec->caps);
384 rec->caps = talloc_steal(rec, caps);
386 talloc_free(tmp_ctx);
391 change recovery mode
/*
 * Broadcast a SET_RECMODE control to all active nodes.  Returns
 * non-zero if any node fails to switch (elided return lines).
 */
393 static int set_recovery_mode(struct ctdb_context *ctdb,
394 struct ctdb_recoverd *rec,
395 struct ctdb_node_map_old *nodemap,
402 tmp_ctx = talloc_new(ctdb);
403 CTDB_NO_MEMORY(ctdb, tmp_ctx);
405 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
407 data.dsize = sizeof(uint32_t);
408 data.dptr = (unsigned char *)&rec_mode;
410 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
416 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
417 talloc_free(tmp_ctx);
421 talloc_free(tmp_ctx);
426 ensure all other nodes have attached to any databases that we have
/*
 * For every active remote node, compare its database map against the
 * local one and create any database the remote node is missing.
 * Returns non-zero on the first failure (elided return lines).
 */
428 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
429 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
432 struct ctdb_dbid_map_old *remote_dbmap;
434 /* verify that all other nodes have all our databases */
435 for (j=0; j<nodemap->num; j++) {
436 /* we don't need to check ourselves */
437 if (nodemap->nodes[j].pnn == pnn) {
440 /* don't check nodes that are unavailable */
441 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
445 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
446 mem_ctx, &remote_dbmap);
448 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
452 /* step through all local databases */
453 for (db=0; db<dbmap->num;db++) {
/* linear scan of the remote map for a matching db_id */
457 for (i=0;i<remote_dbmap->num;i++) {
458 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
462 /* the remote node already has this database */
463 if (i!=remote_dbmap->num) {
466 /* ok so we need to create this database */
467 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
468 dbmap->dbs[db].db_id, mem_ctx,
471 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
474 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
475 nodemap->nodes[j].pnn,
477 dbmap->dbs[db].flags, NULL);
479 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
490 ensure we are attached to any databases that anyone else is attached to
/*
 * Mirror of create_missing_remote_databases: create locally any
 * database present on another active node, then re-read the local
 * dbmap through *dbmap so the caller sees the updated set.
 */
492 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
493 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
496 struct ctdb_dbid_map_old *remote_dbmap;
498 /* verify that we have all database any other node has */
499 for (j=0; j<nodemap->num; j++) {
500 /* we don't need to check ourselves */
501 if (nodemap->nodes[j].pnn == pnn) {
504 /* don't check nodes that are unavailable */
505 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
509 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
510 mem_ctx, &remote_dbmap);
512 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
516 /* step through all databases on the remote node */
517 for (db=0; db<remote_dbmap->num;db++) {
520 for (i=0;i<(*dbmap)->num;i++) {
521 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
525 /* we already have this db locally */
526 if (i!=(*dbmap)->num) {
529 /* ok so we need to create this database and
532 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
533 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
535 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
536 nodemap->nodes[j].pnn));
539 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn,
541 remote_dbmap->dbs[db].flags, NULL);
543 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh the caller's view of the local dbmap after creating the db */
546 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
548 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
558 update flags on all active nodes
/*
 * Push one node's flag word to every node via the MODFLAGS control
 * (set "flags", clear "~flags").  Returns the control's result.
 */
560 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
564 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
566 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
574 called when a vacuum fetch has completed - just free it and do the next one
/* Completion callback for the async vacuum-fetch call (body elided here). */
576 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
583 * Process one element of the vacuum fetch list:
584 * Migrate it over to us with the special flag
585 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
/*
 * Returns true when the record was handled (skipped or migration
 * started).  Never blocks: records whose chain lock cannot be taken
 * immediately are skipped.
 */
587 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
589 struct ctdb_rec_data_old *r)
591 struct ctdb_client_call_state *state;
593 struct ctdb_ltdb_header *hdr;
594 struct ctdb_call call;
597 call.call_id = CTDB_NULL_FUNC;
598 call.flags = CTDB_IMMEDIATE_MIGRATION;
599 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
601 call.key.dptr = &r->data[0];
602 call.key.dsize = r->keylen;
604 /* ensure we don't block this daemon - just skip a record if we can't get
606 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
610 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
611 if (data.dptr == NULL) {
612 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
/* record too small to hold the ltdb header - treat as not migratable */
616 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
618 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
622 hdr = (struct ctdb_ltdb_header *)data.dptr;
623 if (hdr->dmaster == pnn) {
624 /* its already local */
626 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
/* kick off the async migration, then release the chain lock */
632 state = ctdb_call_send(ctdb_db, &call);
633 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
635 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
638 state->async.fn = vacuum_fetch_callback;
639 state->async.private_data = NULL;
646 handler for vacuum fetch
/*
 * SRVID message handler: receive a marshalled buffer of records from a
 * remote vacuuming node, attach to the database if needed, and migrate
 * each record to this node via vacuum_fetch_process_one().
 */
648 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
651 struct ctdb_recoverd *rec = talloc_get_type(
652 private_data, struct ctdb_recoverd);
653 struct ctdb_context *ctdb = rec->ctdb;
654 struct ctdb_marshall_buffer *recs;
656 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
658 struct ctdb_dbid_map_old *dbmap=NULL;
659 uint8_t db_flags = 0;
660 struct ctdb_db_context *ctdb_db;
661 struct ctdb_rec_data_old *r;
663 recs = (struct ctdb_marshall_buffer *)data.dptr;
665 if (recs->count == 0) {
669 /* work out if the database is persistent */
670 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
672 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
676 for (i=0;i<dbmap->num;i++) {
677 if (dbmap->dbs[i].db_id == recs->db_id) {
678 db_flags = dbmap->dbs[i].flags;
682 if (i == dbmap->num) {
683 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
687 /* find the name of this database */
688 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
689 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
694 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, db_flags);
695 if (ctdb_db == NULL) {
696 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
/* walk the marshalled records; each entry is length-prefixed */
700 r = (struct ctdb_rec_data_old *)&recs->data[0];
701 while (recs->count) {
704 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
/* advance to the next marshalled record */
709 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
714 talloc_free(tmp_ctx);
719 * handler for database detach
/*
 * SRVID message handler: detach the recovery daemon's client-side
 * state for a database that the main daemon has detached.  The payload
 * is the 32-bit db_id.
 */
721 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
724 struct ctdb_recoverd *rec = talloc_get_type(
725 private_data, struct ctdb_recoverd);
726 struct ctdb_context *ctdb = rec->ctdb;
728 struct ctdb_db_context *ctdb_db;
/* ignore malformed payloads */
730 if (data.dsize != sizeof(db_id)) {
733 db_id = *(uint32_t *)data.dptr;
735 ctdb_db = find_ctdb_db(ctdb, db_id);
736 if (ctdb_db == NULL) {
737 /* database is not attached */
741 DLIST_REMOVE(ctdb->db_list, ctdb_db);
743 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
745 talloc_free(ctdb_db);
749 called when ctdb_wait_timeout should finish
/* Timer callback: flag the waiter in ctdb_wait_timeout() to stop looping. */
751 static void ctdb_wait_handler(struct tevent_context *ev,
752 struct tevent_timer *te,
753 struct timeval yt, void *p)
755 uint32_t *timed_out = (uint32_t *)p;
760 wait for a given number of seconds
/*
 * Block for "secs" (fractional) seconds while still servicing tevent
 * events - the loop spins tevent_loop_once() until the timer fires.
 */
762 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
764 uint32_t timed_out = 0;
/* split the fractional part into microseconds for the timer */
765 time_t usecs = (secs - (time_t)secs) * 1000000;
766 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
767 ctdb_wait_handler, &timed_out);
769 tevent_loop_once(ctdb->ev);
774 called when an election times out (ends)
/* Timer callback: clear the election marker so ctdb_wait_election()
 * stops looping. */
776 static void ctdb_election_timeout(struct tevent_context *ev,
777 struct tevent_timer *te,
778 struct timeval t, void *p)
780 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
781 rec->election_timeout = NULL;
784 DEBUG(DEBUG_WARNING,("Election period ended\n"));
789 wait for an election to finish. It finished election_timeout seconds after
790 the last election packet is received
/* Spin the event loop until ctdb_election_timeout() clears the marker. */
792 static void ctdb_wait_election(struct ctdb_recoverd *rec)
794 struct ctdb_context *ctdb = rec->ctdb;
795 while (rec->election_timeout) {
796 tevent_loop_once(ctdb->ev);
801 Update our local flags from all remote connected nodes.
802 This is only run when we are or we believe we are the recovery master
/*
 * Cross-check each connected node's view of node flags against our
 * nodemap.  On mismatch, push the remote node's own flags cluster-wide
 * and adopt them locally; nodes we cannot query get a banning credit.
 */
804 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
807 struct ctdb_context *ctdb = rec->ctdb;
808 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
810 /* get the nodemap for all active remote nodes and verify
811 they are the same as for this node
813 for (j=0; j<nodemap->num; j++) {
814 struct ctdb_node_map_old *remote_nodemap=NULL;
817 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
820 if (nodemap->nodes[j].pnn == ctdb->pnn) {
824 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
825 mem_ctx, &remote_nodemap);
827 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
828 nodemap->nodes[j].pnn));
829 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
830 talloc_free(mem_ctx);
/* the remote node's self-reported flags win over our stale view */
833 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
834 /* We should tell our daemon about this so it
835 updates its flags or else we will log the same
836 message again in the next iteration of recovery.
837 Since we are the recovery master we can just as
838 well update the flags on all nodes.
840 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
842 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
846 /* Update our local copy of the flags in the recovery
849 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
850 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
851 nodemap->nodes[j].flags));
852 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
854 talloc_free(remote_nodemap);
856 talloc_free(mem_ctx);
861 /* Create a new random generation id.
862 The generation id can not be the INVALID_GENERATION id
/* Loops (elided here) drawing random() values until one differs from
 * INVALID_GENERATION, then returns it. */
864 static uint32_t new_generation(void)
869 generation = random();
871 if (generation != INVALID_GENERATION) {
879 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
881 return (rec->recovery_lock_handle != NULL);
/* State shared with the cluster-mutex callbacks while taking the
 * recovery lock (fields elided in this listing; take_reclock_handler
 * writes ->latency and ->locked). */
884 struct hold_reclock_state {
/*
 * Cluster-mutex callback reporting the outcome of the lock attempt.
 * status '0' means the lock was taken; other status characters
 * (contention / error branches below) leave s->locked false.
 */
890 static void take_reclock_handler(char status,
894 struct hold_reclock_state *s =
895 (struct hold_reclock_state *) private_data;
899 s->latency = latency;
904 ("Unable to take recovery lock - contention\n"));
908 DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
912 s->locked = (status == '0') ;
915 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec);
/*
 * Called when the mutex helper process dies unexpectedly: drop our
 * handle and immediately try to retake the recovery lock.
 */
917 static void lost_reclock_handler(void *private_data)
919 struct ctdb_recoverd *rec = talloc_get_type_abort(
920 private_data, struct ctdb_recoverd);
923 ("Recovery lock helper terminated unexpectedly - "
924 "trying to retake recovery lock\n"));
925 TALLOC_FREE(rec->recovery_lock_handle);
926 if (! ctdb_recovery_lock(rec)) {
927 DEBUG(DEBUG_ERR, ("Failed to take recovery lock\n"));
/*
 * Synchronously take the cluster recovery lock: start the cluster
 * mutex helper, pump the event loop until take_reclock_handler()
 * reports a result, then stash the handle and report lock latency.
 */
931 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
933 struct ctdb_context *ctdb = rec->ctdb;
934 struct ctdb_cluster_mutex_handle *h;
935 struct hold_reclock_state s = {
941 h = ctdb_cluster_mutex(rec, ctdb, ctdb->recovery_lock, 0,
942 take_reclock_handler, &s,
943 lost_reclock_handler, rec);
/* wait for the helper to report success or failure */
949 tevent_loop_once(ctdb->ev);
957 rec->recovery_lock_handle = h;
958 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(),
964 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
966 if (rec->recovery_lock_handle != NULL) {
967 DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
968 TALLOC_FREE(rec->recovery_lock_handle);
/*
 * Ban every node whose banning credits reached 2 * num_nodes, reset
 * its credit count, and report through *self_ban whether this node
 * banned itself.
 */
972 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
974 struct ctdb_context *ctdb = rec->ctdb;
976 struct ctdb_banning_state *ban_state;
979 for (i=0; i<ctdb->num_nodes; i++) {
980 if (ctdb->nodes[i]->ban_state == NULL) {
983 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* threshold scales with cluster size */
984 if (ban_state->count < 2*ctdb->num_nodes) {
988 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
989 ctdb->nodes[i]->pnn, ban_state->count,
990 ctdb->tunable.recovery_ban_period));
991 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
992 ban_state->count = 0;
994 /* Banning ourself? */
995 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/* State for a running helper child process (fields elided in this
 * listing; helper_run uses ->fd[2] pipe, ->pid, ->done, ->result). */
1001 struct helper_state {
/*
 * fd event handler: read the helper's exit status from the pipe.
 * A short read means the child died without reporting - record EPIPE.
 */
1008 static void helper_handler(struct tevent_context *ev,
1009 struct tevent_fd *fde,
1010 uint16_t flags, void *private_data)
1012 struct helper_state *state = talloc_get_type_abort(
1013 private_data, struct helper_state);
1016 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1017 if (ret != sizeof(state->result)) {
1018 state->result = EPIPE;
/*
 * Launch a helper binary (takeover/recovery) with a status pipe as
 * argv[0], the daemon socket as argv[1] and an optional extra arg,
 * then pump the event loop until the helper reports its result.
 * Aborts (killing the child) if the recmaster changes mid-run, i.e.
 * this node lost an election.  Returns the helper's result code.
 */
1024 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1025 const char *prog, const char *arg, const char *type)
1027 struct helper_state *state;
1028 struct tevent_fd *fde;
/* snapshot so a mid-run recmaster change can be detected below */
1031 uint32_t recmaster = rec->recmaster;
1033 state = talloc_zero(mem_ctx, struct helper_state);
1034 if (state == NULL) {
1035 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1041 ret = pipe(state->fd);
1044 ("Failed to create pipe for %s helper\n", type));
1048 set_close_on_exec(state->fd[0]);
1051 args = talloc_array(state, const char *, nargs);
1053 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* argv[0] for the helper is the write end of the status pipe */
1057 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1058 if (args[0] == NULL) {
1059 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1062 args[1] = rec->ctdb->daemon.name;
1066 if (args[2] == NULL) {
1070 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1071 if (state->pid == -1) {
1073 ("Failed to create child for %s helper\n", type));
/* parent no longer needs the write end once the child has it */
1077 close(state->fd[1]);
1080 state->done = false;
1082 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1083 TEVENT_FD_READ, helper_handler, state);
1087 tevent_fd_set_auto_close(fde);
1089 while (!state->done) {
1090 tevent_loop_once(rec->ctdb->ev);
1092 /* If recmaster changes, we have lost election */
1093 if (recmaster != rec->recmaster) {
1094 D_ERR("Recmaster changed to %u, aborting %s\n",
1095 rec->recmaster, type);
1101 close(state->fd[0]);
1104 if (state->result != 0) {
/* make sure an aborted helper does not linger */
1108 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/* error path cleanup: close whichever descriptors are still open
 * and reap the child if it was started */
1113 if (state->fd[0] != -1) {
1114 close(state->fd[0]);
1116 if (state->fd[1] != -1) {
1117 close(state->fd[1]);
1119 if (state->pid != -1) {
1120 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/*
 * Run the external takeover helper, passing any forced-rebalance node
 * PNNs as a comma-separated argument.  The helper path is resolved
 * once and cached in a static buffer.  Returns the helper's result.
 */
1127 static int ctdb_takeover(struct ctdb_recoverd *rec,
1128 uint32_t *force_rebalance_nodes)
1130 static char prog[PATH_MAX+1] = "";
1134 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
1135 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
1136 "ctdb_takeover_helper")) {
1137 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
/* build "pnn1,pnn2,..." from the forced-rebalance list */
1141 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1142 uint32_t pnn = force_rebalance_nodes[i];
1144 arg = talloc_asprintf(rec, "%u", pnn);
1146 arg = talloc_asprintf_append(arg, ",%u", pnn);
1149 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1154 if (ctdb_config.failover_disabled) {
1155 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
1157 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
1162 return helper_run(rec, rec, prog, arg, "takeover");
/*
 * Perform a takeover run: temporarily disable takeover runs on the
 * other connected nodes, run the takeover helper, then re-enable
 * them.  Returns true on success; on failure rec->need_takeover_run
 * is left set so the run is retried.
 */
1165 static bool do_takeover_run(struct ctdb_recoverd *rec,
1166 struct ctdb_node_map_old *nodemap)
1168 uint32_t *nodes = NULL;
1169 struct ctdb_disable_message dtr;
/* snapshot to detect new rebalance targets arriving mid-run */
1172 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1176 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1178 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1179 DEBUG(DEBUG_ERR, (__location__
1180 " takeover run already in progress \n"));
1185 if (!ctdb_op_begin(rec->takeover_run)) {
1190 /* Disable IP checks (takeover runs, really) on other nodes
1191 * while doing this takeover run. This will stop those other
1192 * nodes from triggering takeover runs when think they should
1193 * be hosting an IP but it isn't yet on an interface. Don't
1194 * wait for replies since a failure here might cause some
1195 * noise in the logs but will not actually cause a problem.
1198 dtr.srvid = 0; /* No reply */
1201 data.dptr = (uint8_t*)&dtr;
1202 data.dsize = sizeof(dtr);
1204 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1206 /* Disable for 60 seconds. This can be a tunable later if
1210 for (i = 0; i < talloc_array_length(nodes); i++) {
1211 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1212 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1214 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1218 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1220 /* Reenable takeover runs and IP checks on other nodes */
1222 for (i = 0; i < talloc_array_length(nodes); i++) {
1223 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1224 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1226 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1231 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1237 /* Takeover run was successful so clear force rebalance targets */
1238 if (rebalance_nodes == rec->force_rebalance_nodes) {
1239 TALLOC_FREE(rec->force_rebalance_nodes);
1241 DEBUG(DEBUG_WARNING,
1242 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1245 rec->need_takeover_run = !ok;
1247 ctdb_op_end(rec->takeover_run);
1249 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
/*
 * Run the external parallel database recovery helper, passing a fresh
 * generation id as its argument.  Returns the helper's result.
 */
1253 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1255 static char prog[PATH_MAX+1] = "";
1258 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1259 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1260 "ctdb_recovery_helper")) {
1261 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1264 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1266 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* setenv result deliberately ignored; helper falls back if unset */
1270 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1272 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1276 we are the recmaster, and recovery is needed - start a recovery run
/*
 * Full recovery sequence, run only on the recovery master:
 *   1. sanity checks (still recmaster, no election, not banned)
 *   2. take the recovery lock (if configured)
 *   3. reconcile databases locally and remotely
 *   4. sync capabilities and node flags
 *   5. run parallel db recovery, then a takeover run
 *   6. broadcast RECONFIGURE, forgive ban credits, and suppress new
 *      recoveries for rerecovery_timeout
 * Returns 0 on success, non-zero on any failure (error-path lines are
 * elided in this listing).
 */
1278 static int do_recovery(struct ctdb_recoverd *rec,
1279 TALLOC_CTX *mem_ctx, uint32_t pnn,
1280 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1282 struct ctdb_context *ctdb = rec->ctdb;
1284 struct ctdb_dbid_map_old *dbmap;
1287 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1289 /* Check if the current node is still the recmaster. It's possible that
1290 * re-election has changed the recmaster.
1292 if (pnn != rec->recmaster) {
1294 ("Recovery master changed to %u, aborting recovery\n",
1299 /* if recovery fails, force it again */
1300 rec->need_recovery = true;
1302 if (!ctdb_op_begin(rec->recovery)) {
1306 if (rec->election_timeout) {
1307 /* an election is in progress */
1308 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1312 ban_misbehaving_nodes(rec, &self_ban);
1314 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1318 if (ctdb->recovery_lock != NULL) {
1319 if (ctdb_recovery_have_lock(rec)) {
1320 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1322 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1323 ctdb->recovery_lock));
1324 if (!ctdb_recovery_lock(rec)) {
1325 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1326 /* If ctdb is trying first recovery, it's
1327 * possible that current node does not know
1328 * yet who the recmaster is.
1330 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1331 " - retrying recovery\n"));
/* outside first recovery a lock failure means a split view -
 * ban ourselves rather than risk competing recoveries */
1335 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1336 "and ban ourself for %u seconds\n",
1337 ctdb->tunable.recovery_ban_period));
1338 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1342 ("Recovery lock taken successfully by recovery daemon\n"));
1346 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1348 /* get a list of all databases */
1349 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1351 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1355 /* we do the db creation before we set the recovery mode, so the freeze happens
1356 on all databases we will be dealing with. */
1358 /* verify that we have all the databases any other node has */
1359 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1361 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1365 /* verify that all other nodes have all our databases */
1366 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1368 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1371 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1374 /* Retrieve capabilities from all connected nodes */
1375 ret = update_capabilities(rec, nodemap);
1377 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1382 update all nodes to have the same flags that we have
1384 for (i=0;i<nodemap->num;i++) {
1385 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1389 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
/* failures on inactive nodes are expected; only active ones are fatal */
1391 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1392 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1394 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1400 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1402 ret = db_recovery_parallel(rec, mem_ctx);
1407 do_takeover_run(rec, nodemap);
1409 /* send a message to all clients telling them that the cluster
1410 has been reconfigured */
1411 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1412 CTDB_SRVID_RECONFIGURE, tdb_null);
1414 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1418 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1420 rec->need_recovery = false;
1421 ctdb_op_end(rec->recovery);
1423 /* we managed to complete a full recovery, make sure to forgive
1424 any past sins by the nodes that could now participate in the
1427 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1428 for (i=0;i<nodemap->num;i++) {
1429 struct ctdb_banning_state *ban_state;
1431 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1435 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1436 if (ban_state == NULL) {
1440 ban_state->count = 0;
1443 /* We just finished a recovery successfully.
1444 We now wait for rerecovery_timeout before we allow
1445 another recovery to take place.
1447 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout))
;
1448 ctdb_op_disable(rec->recovery, ctdb->ev,
1449 ctdb->tunable.rerecovery_timeout);
1453 ctdb_op_end(rec->recovery);
/* Payload broadcast on CTDB_SRVID_ELECTION during a recovery-master
 * election.  NOTE(review): extraction appears to have dropped lines here —
 * ctdb_election_data() below also fills in an em->pnn member that is not
 * visible in this view. */
1459 elections are won by first checking the number of connected nodes, then
1460 the priority time, then the pnn
1462 struct election_message {
/* number of nodes this candidate sees as connected (primary tiebreak) */
1463 uint32_t num_connected;
/* when this node became eligible; earlier time wins (secondary tiebreak) */
1464 struct timeval priority_time;
/* sender's node flags, so peers can detect banned/stopped candidates */
1466 uint32_t node_flags;
1470 form this nodes election data
/* Fill *em with this node's election credentials: pnn, priority time,
 * node flags and the count of connected nodes.  Also refreshes
 * rec->node_flags from a freshly fetched node map. */
1472 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1475 struct ctdb_node_map_old *nodemap;
1476 struct ctdb_context *ctdb = rec->ctdb;
1480 em->pnn = rec->ctdb->pnn;
1481 em->priority_time = rec->priority_time;
1483 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1485 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
/* cache our own flags so later win/lose checks see current state */
1489 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1490 em->node_flags = rec->node_flags;
/* count every node not marked disconnected, including ourselves */
1492 for (i=0;i<nodemap->num;i++) {
1493 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1494 em->num_connected++;
1498 /* we shouldnt try to win this election if we cant be a recmaster */
1499 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
/* zero connected count and reset priority time: worst possible
 * credentials, so any capable node beats us */
1500 em->num_connected = 0;
1501 em->priority_time = timeval_current();
1504 talloc_free(nodemap);
1508 see if the given election data wins
/* Compare the peer's election message *em against our own credentials.
 * Returns whether WE would win against the sender: capability and
 * banned/stopped checks first, then connected-node count and priority
 * time (via timeval_compare), with pnn as the final tiebreak. */
1510 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1512 struct election_message myem;
/* build our own credentials for the comparison */
1515 ctdb_election_data(rec, &myem);
1517 /* we cant win if we don't have the recmaster capability */
1518 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1522 /* we cant win if we are banned */
1523 if (rec->node_flags & NODE_FLAGS_BANNED) {
1527 /* we cant win if we are stopped */
1528 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1532 /* we will automatically win if the other node is banned */
1533 if (em->node_flags & NODE_FLAGS_BANNED) {
/* same automatic win when the other node is stopped */
1537 /* we will automatically win if the other node is banned */
1538 if (em->node_flags & NODE_FLAGS_STOPPED) {
1542 /* then the longest running node */
1544 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tiebreak: lower pnn wins */
1548 cmp = (int)myem.pnn - (int)em->pnn;
1555 send out an election request
/* Broadcast our election credentials to all nodes.  Optimistically sets
 * ourselves (pnn) as recmaster on the local node first; if every peer
 * loses the comparison they will accept us.  Returns the result of the
 * broadcast send. */
1557 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1560 TDB_DATA election_data;
1561 struct election_message emsg;
1563 struct ctdb_context *ctdb = rec->ctdb;
1565 srvid = CTDB_SRVID_ELECTION;
1567 ctdb_election_data(rec, &emsg);
/* emsg lives on the stack; the message is serialised by the send call */
1569 election_data.dsize = sizeof(struct election_message);
1570 election_data.dptr = (unsigned char *)&emsg;
1573 /* first we assume we will win the election and set
1574 recoverymaster to be ourself on the current node
1576 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1577 CTDB_CURRENT_NODE, pnn);
1579 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
/* remember our own notion of who the recmaster is */
1582 rec->recmaster = pnn;
1584 /* send an election message to all active nodes */
1585 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1586 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1590 we think we are winning the election - send a broadcast election request
/* tevent timer callback: re-broadcast our election request while we
 * believe we are winning, then clear the pending-send timer. */
1592 static void election_send_request(struct tevent_context *ev,
1593 struct tevent_timer *te,
1594 struct timeval t, void *p)
1596 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1599 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1601 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
/* timer has fired; drop the reference so a new one can be scheduled */
1604 TALLOC_FREE(rec->send_election_te);
1608 handler for memory dumps
/* SRVID handler: produce a talloc memory-usage dump of this daemon and
 * send it back to the requester identified by the ctdb_srvid_message in
 * 'data'.  All temporary allocations hang off tmp_ctx and are freed on
 * every exit path. */
1610 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1612 struct ctdb_recoverd *rec = talloc_get_type(
1613 private_data, struct ctdb_recoverd);
1614 struct ctdb_context *ctdb = rec->ctdb;
1615 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1618 struct ctdb_srvid_message *rd;
/* the payload must be exactly a return-address structure */
1620 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1621 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1622 talloc_free(tmp_ctx);
1625 rd = (struct ctdb_srvid_message *)data.dptr;
1627 dump = talloc_zero(tmp_ctx, TDB_DATA);
1629 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1630 talloc_free(tmp_ctx);
1633 ret = ctdb_dump_memory(ctdb, dump);
1635 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1636 talloc_free(tmp_ctx);
1640 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* reply to the pnn/srvid the requester embedded in the message */
1642 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1644 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1645 talloc_free(tmp_ctx);
1649 talloc_free(tmp_ctx);
1653 handler for reload_nodes
/* SRVID handler: re-read the nodes file on this daemon when asked to
 * reload the cluster node configuration.  The message payload is unused. */
1655 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1658 struct ctdb_recoverd *rec = talloc_get_type(
1659 private_data, struct ctdb_recoverd);
1661 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1663 ctdb_load_nodes_file(rec->ctdb);
/* SRVID handler: queue a node (pnn carried as a uint32_t payload) for a
 * forced IP rebalance on the next takeover run.  Only acted on when this
 * node is the recovery master.  Duplicated pnns are tolerated. */
1667 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1670 struct ctdb_recoverd *rec = talloc_get_type(
1671 private_data, struct ctdb_recoverd);
1672 struct ctdb_context *ctdb = rec->ctdb;
/* ignore unless we are the recmaster */
1677 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1681 if (data.dsize != sizeof(uint32_t)) {
1682 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1686 pnn = *(uint32_t *)&data.dptr[0];
1688 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1690 /* Copy any existing list of nodes. There's probably some
1691 * sort of realloc variant that will do this but we need to
1692 * make sure that freeing the old array also cancels the timer
1693 * event for the timeout... not sure if realloc will do that.
1695 len = (rec->force_rebalance_nodes != NULL) ?
1696 talloc_array_length(rec->force_rebalance_nodes) :
1699 /* This allows duplicates to be added but they don't cause
1700 * harm. A call to add a duplicate PNN arguably means that
1701 * the timeout should be reset, so this is the simplest
1704 t = talloc_zero_array(rec, uint32_t, len+1);
1705 CTDB_NO_MEMORY_VOID(ctdb, t);
/* copy-and-replace instead of realloc; see comment above about the
 * timer tied to the old array's lifetime */
1707 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1711 talloc_free(rec->force_rebalance_nodes);
1713 rec->force_rebalance_nodes = t;
/* Disable the operation tracked by op_state (takeover runs or
 * recoveries) for the timeout carried in the ctdb_disable_message
 * payload, then reply to the sender over SRVID.  A reply carrying this
 * node's PNN signals success to the caller. */
1718 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1720 struct ctdb_op_state *op_state)
1722 struct ctdb_disable_message *r;
1727 /* Validate input data */
1728 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1729 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1730 "expecting %lu\n", (long unsigned)data.dsize,
/* BUGFIX: the message previously reported
 * sizeof(struct ctdb_srvid_message), but the check above compares
 * against ctdb_disable_message — log the size actually expected. */
1731 (long unsigned)sizeof(struct ctdb_disable_message)));
1734 if (data.dptr == NULL) {
1735 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1739 r = (struct ctdb_disable_message *)data.dptr;
1740 timeout = r->timeout;
1742 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1747 /* Returning our PNN tells the caller that we succeeded */
1748 ret = ctdb_get_pnn(ctdb);
1750 result.dsize = sizeof(int32_t);
1751 result.dptr = (uint8_t *)&ret;
1752 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
/* SRVID handler: disable IP takeover runs for the requested timeout and
 * reply to the sender (delegates to srvid_disable_and_reply). */
1755 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1758 struct ctdb_recoverd *rec = talloc_get_type(
1759 private_data, struct ctdb_recoverd);
1761 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1764 /* Backward compatibility for this SRVID */
/* Legacy SRVID handler: payload is a bare uint32_t timeout (no reply
 * address), so disable takeover runs without sending a reply. */
1765 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1768 struct ctdb_recoverd *rec = talloc_get_type(
1769 private_data, struct ctdb_recoverd);
1772 if (data.dsize != sizeof(uint32_t)) {
1773 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1774 "expecting %lu\n", (long unsigned)data.dsize,
1775 (long unsigned)sizeof(uint32_t)));
1778 if (data.dptr == NULL) {
1779 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1783 timeout = *((uint32_t *)data.dptr);
/* no reply expected on this legacy SRVID */
1785 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
/* SRVID handler: disable database recoveries for the requested timeout
 * and reply to the sender (delegates to srvid_disable_and_reply). */
1788 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1791 struct ctdb_recoverd *rec = talloc_get_type(
1792 private_data, struct ctdb_recoverd);
1794 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1798 handler for ip reallocate, just add it to the list of requests and
1799 handle this later in the monitor_cluster loop so we do not recurse
1800 with other requests to takeover_run()
/* SRVID handler: validate the return-address payload and queue the
 * request on rec->reallocate_requests; the queue is drained later by
 * process_ipreallocate_requests(). */
1802 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1805 struct ctdb_srvid_message *request;
1806 struct ctdb_recoverd *rec = talloc_get_type(
1807 private_data, struct ctdb_recoverd);
1809 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1810 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1814 request = (struct ctdb_srvid_message *)data.dptr;
1816 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
/* Drain the queue of pending IP-reallocation requests: detach the
 * current queue, run a takeover run, and reply to every detached
 * requester.  On success the reply carries this node's PNN.  Requests
 * that arrive while the run is in progress stay queued for later. */
1819 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1820 struct ctdb_recoverd *rec)
1824 struct srvid_requests *current;
1826 /* Only process requests that are currently pending. More
1827 * might come in while the takeover run is in progress and
1828 * they will need to be processed later since they might
1829 * be in response flag changes.
1831 current = rec->reallocate_requests;
1832 rec->reallocate_requests = NULL;
1834 if (do_takeover_run(rec, rec->nodemap)) {
1835 ret = ctdb_get_pnn(ctdb);
1840 result.dsize = sizeof(int32_t);
1841 result.dptr = (uint8_t *)&ret;
/* BUGFIX: "&current" had been mangled into the mis-encoded sequence
 * "¤t" (the HTML entity &curren; swallowed "&curr"); restore the
 * address-of expression so the queued requesters are answered. */
1843 srvid_requests_reply(ctdb, &current, result);
1847 * handler for assigning banning credits
/* SRVID handler: assign the node named in the uint32_t payload enough
 * culprit credits (nodemap->num) to get it banned.  Only the recovery
 * master acts on this message. */
1849 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1851 struct ctdb_recoverd *rec = talloc_get_type(
1852 private_data, struct ctdb_recoverd);
1855 /* Ignore if we are not recmaster */
1856 if (rec->ctdb->pnn != rec->recmaster) {
1860 if (data.dsize != sizeof(uint32_t)) {
1861 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1866 ban_pnn = *(uint32_t *)data.dptr;
/* setting the count to nodemap->num pushes the node over the threshold */
1868 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1872 handler for recovery master elections
/* SRVID handler for incoming election messages.  Restarts the election
 * timeout, then either counters with our own election request (if we
 * would win) or concedes: cancel any pending send, release the recovery
 * lock if held, and record the sender as the new recmaster. */
1874 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1876 struct ctdb_recoverd *rec = talloc_get_type(
1877 private_data, struct ctdb_recoverd);
1878 struct ctdb_context *ctdb = rec->ctdb;
1880 struct election_message *em = (struct election_message *)data.dptr;
1882 /* Ignore election packets from ourself */
1883 if (ctdb->pnn == em->pnn) {
1887 /* we got an election packet - update the timeout for the election */
1888 talloc_free(rec->election_timeout);
1889 rec->election_timeout = tevent_add_timer(
/* short 0.5s timeout in one branch, tunable election_timeout in the
 * other — the selecting condition is not visible in this extract */
1892 timeval_current_ofs(0, 500000) :
1893 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1894 ctdb_election_timeout, rec);
1896 /* someone called an election. check their election data
1897 and if we disagree and we would rather be the elected node,
1898 send a new election message to all other nodes
1900 if (ctdb_election_win(rec, em)) {
1901 if (!rec->send_election_te) {
/* delay our counter-request slightly to batch elections */
1902 rec->send_election_te = tevent_add_timer(
1904 timeval_current_ofs(0, 500000),
1905 election_send_request, rec);
/* we lost: stop any pending election broadcast */
1911 TALLOC_FREE(rec->send_election_te);
1913 /* Release the recovery lock file */
1914 if (ctdb_recovery_have_lock(rec)) {
1915 ctdb_recovery_unlock(rec);
1918 /* ok, let that guy become recmaster then */
1919 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1920 CTDB_CURRENT_NODE, em->pnn);
1922 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1925 rec->recmaster = em->pnn;
1932 force the start of the election process
/* Start an election from scratch: switch the cluster into active
 * recovery mode (stopping internode traffic), arm the election timeout,
 * broadcast our election request and block until the election window
 * closes (ctdb_wait_election). */
1934 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1935 struct ctdb_node_map_old *nodemap)
1938 struct ctdb_context *ctdb = rec->ctdb;
1940 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1942 /* set all nodes to recovery mode to stop all internode traffic */
1943 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1945 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (re)arm the election timeout before broadcasting */
1949 talloc_free(rec->election_timeout);
1950 rec->election_timeout = tevent_add_timer(
1953 timeval_current_ofs(0, 500000) :
1954 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1955 ctdb_election_timeout, rec);
1957 ret = send_election_request(rec, pnn);
1959 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1963 /* wait for a few seconds to collect all responses */
1964 ctdb_wait_election(rec);
1970 handler for when a node changes its flags
/* SRVID handler for node flag-change notifications: fetch the local
 * node map, locate the node named in the ctdb_node_flag_change payload,
 * log the transition and record the new flags in our copy of the map. */
1972 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1974 struct ctdb_recoverd *rec = talloc_get_type(
1975 private_data, struct ctdb_recoverd);
1976 struct ctdb_context *ctdb = rec->ctdb;
1978 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1979 struct ctdb_node_map_old *nodemap=NULL;
1980 TALLOC_CTX *tmp_ctx;
1983 if (data.dsize != sizeof(*c)) {
1984 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1988 tmp_ctx = talloc_new(ctdb);
1989 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1991 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1993 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1994 talloc_free(tmp_ctx);
/* find the slot whose pnn matches the changed node */
1999 for (i=0;i<nodemap->num;i++) {
2000 if (nodemap->nodes[i].pnn == c->pnn) break;
2003 if (i == nodemap->num) {
2004 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2005 talloc_free(tmp_ctx);
2009 if (c->old_flags != c->new_flags) {
2010 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2013 nodemap->nodes[i].flags = c->new_flags;
2015 talloc_free(tmp_ctx);
2019 handler for when we need to push out flag changes ot all other nodes
/* SRVID handler: read the authoritative node map from the recmaster and
 * push a MODIFY_FLAGS control for the changed node to every connected
 * node via an async broadcast. */
2021 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2024 struct ctdb_recoverd *rec = talloc_get_type(
2025 private_data, struct ctdb_recoverd);
2026 struct ctdb_context *ctdb = rec->ctdb;
2028 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2029 struct ctdb_node_map_old *nodemap=NULL;
2030 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2033 /* read the node flags from the recmaster */
2034 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2037 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2038 talloc_free(tmp_ctx);
/* bounds-check the pnn before it is used as an index */
2041 if (c->pnn >= nodemap->num) {
2042 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2043 talloc_free(tmp_ctx);
2047 /* send the flags update to all connected nodes */
2048 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2050 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2051 nodes, 0, CONTROL_TIMEOUT(),
2055 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2057 talloc_free(tmp_ctx);
2061 talloc_free(tmp_ctx);
/* Accumulator shared by the async getrecmode calls issued from
 * verify_recmode(); status degrades from MONITOR_OK as replies fail. */
2065 struct verify_recmode_normal_data {
2067 enum monitor_result status;
/* Async completion callback for one node's getrecmode control: mark the
 * overall status FAILED on control error, or RECOVERY_NEEDED when the
 * node reports a recovery mode other than NORMAL. */
2070 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2072 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2075 /* one more node has responded with recmode data*/
2078 /* if we failed to get the recmode, then return an error and let
2079 the main loop try again.
2081 if (state->state != CTDB_CONTROL_DONE) {
/* only downgrade from OK; never overwrite a stronger verdict */
2082 if (rmdata->status == MONITOR_OK) {
2083 rmdata->status = MONITOR_FAILED;
2088 /* if we got a response, then the recmode will be stored in the
2091 if (state->status != CTDB_RECOVERY_NORMAL) {
2092 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2093 rmdata->status = MONITOR_RECOVERY_NEEDED;
2100 /* verify that all nodes are in normal recovery mode */
/* Fan out async getrecmode controls to every active node, pump the
 * event loop until all replies (counted via rmdata) arrive, and return
 * the aggregated monitor_result. */
2101 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2103 struct verify_recmode_normal_data *rmdata;
2104 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2105 struct ctdb_client_control_state *state;
2106 enum monitor_result status;
2109 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2110 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2112 rmdata->status = MONITOR_OK;
2114 /* loop over all active nodes and send an async getrecmode call to
2116 for (j=0; j<nodemap->num; j++) {
/* skip inactive (banned/stopped/disconnected) nodes */
2117 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2120 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2122 nodemap->nodes[j].pnn);
2123 if (state == NULL) {
2124 /* we failed to send the control, treat this as
2125 an error and try again next iteration
2127 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2128 talloc_free(mem_ctx);
2129 return MONITOR_FAILED;
2132 /* set up the callback functions */
2133 state->async.fn = verify_recmode_normal_callback;
2134 state->async.private_data = rmdata;
2136 /* one more control to wait for to complete */
2141 /* now wait for up to the maximum number of seconds allowed
2142 or until all nodes we expect a response from has replied
2144 while (rmdata->count > 0) {
2145 tevent_loop_once(ctdb->ev);
/* read status before freeing the context that owns rmdata */
2148 status = rmdata->status;
2149 talloc_free(mem_ctx);
/* Accumulator shared by the async getrecmaster calls issued from
 * verify_recmaster(); carries the expected recmaster pnn and a pointer
 * back to rec so disagreeing nodes can be charged as culprits. */
2154 struct verify_recmaster_data {
2155 struct ctdb_recoverd *rec;
2158 enum monitor_result status;
/* Async completion callback for one node's getrecmaster control: FAILED
 * on control error; ELECTION_NEEDED (and a culprit charge against the
 * disagreeing node) when its recmaster differs from ours. */
2161 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2163 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2166 /* one more node has responded with recmaster data*/
2169 /* if we failed to get the recmaster, then return an error and let
2170 the main loop try again.
2172 if (state->state != CTDB_CONTROL_DONE) {
/* only downgrade from OK; never overwrite a stronger verdict */
2173 if (rmdata->status == MONITOR_OK) {
2174 rmdata->status = MONITOR_FAILED;
2179 /* if we got a response, then the recmaster will be stored in the
2182 if (state->status != rmdata->pnn) {
2183 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2184 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2185 rmdata->status = MONITOR_ELECTION_NEEDED;
2192 /* verify that all nodes agree that we are the recmaster */
/* Fan out async getrecmaster controls to every active node except the
 * recmaster itself, pump the event loop until all replies arrive, and
 * return the aggregated monitor_result. */
2193 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2195 struct ctdb_context *ctdb = rec->ctdb;
2196 struct verify_recmaster_data *rmdata;
2197 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2198 struct ctdb_client_control_state *state;
2199 enum monitor_result status;
2202 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2203 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2207 rmdata->status = MONITOR_OK;
2209 /* loop over all active nodes and send an async getrecmaster call to
2211 for (j=0; j<nodemap->num; j++) {
/* no point asking the recmaster about itself */
2212 if (nodemap->nodes[j].pnn == rec->recmaster) {
2215 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2218 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2220 nodemap->nodes[j].pnn);
2221 if (state == NULL) {
2222 /* we failed to send the control, treat this as
2223 an error and try again next iteration
2225 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2226 talloc_free(mem_ctx);
2227 return MONITOR_FAILED;
2230 /* set up the callback functions */
2231 state->async.fn = verify_recmaster_callback;
2232 state->async.private_data = rmdata;
2234 /* one more control to wait for to complete */
2239 /* now wait for up to the maximum number of seconds allowed
2240 or until all nodes we expect a response from has replied
2242 while (rmdata->count > 0) {
2243 tevent_loop_once(ctdb->ev);
/* read status before freeing the context that owns rmdata */
2246 status = rmdata->status;
2247 talloc_free(mem_ctx);
/* Fetch the local interface list and compare it against the cached copy
 * in rec->ifaces (count, names in slot order, link states).  Caches the
 * fresh list on rec before returning.  A fetch failure is reported as
 * "changed" on the assumption that triggering work is safer than
 * missing a real change. */
2251 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2252 struct ctdb_recoverd *rec)
2254 struct ctdb_iface_list_old *ifaces = NULL;
2255 TALLOC_CTX *mem_ctx;
2258 mem_ctx = talloc_new(NULL);
2260 /* Read the interfaces from the local node */
2261 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2262 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2263 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn))
2264 /* We could return an error. However, this will be
2265 * rare so we'll decide that the interfaces have
2266 * actually changed, just in case.
2268 talloc_free(mem_ctx);
2273 /* We haven't been here before so things have changed */
2274 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2276 } else if (rec->ifaces->num != ifaces->num) {
2277 /* Number of interfaces has changed */
2278 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2279 rec->ifaces->num, ifaces->num));
2282 /* See if interface names or link states have changed */
/* slot-by-slot comparison: a reordered interface list also counts
 * as a change */
2284 for (i = 0; i < rec->ifaces->num; i++) {
2285 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2286 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2288 ("Interface in slot %d changed: %s => %s\n",
2289 i, iface->name, ifaces->ifaces[i].name));
2293 if (iface->link_state != ifaces->ifaces[i].link_state) {
2295 ("Interface %s changed state: %d => %d\n",
2296 iface->name, iface->link_state,
2297 ifaces->ifaces[i].link_state));
/* replace the cached snapshot with the fresh one */
2304 talloc_free(rec->ifaces);
2305 rec->ifaces = talloc_steal(rec, ifaces);
2307 talloc_free(mem_ctx);
2311 /* Check that the local allocation of public IP addresses is correct
2312 * and do some house-keeping */
/* On non-recmaster nodes, drop stale reallocate/rebalance state.  Then,
 * unless failover is disabled, compare the node's *available* and
 * *known* public IPs against what is actually configured on its
 * interfaces and, on any mismatch (or interface change), ask the
 * recmaster for a takeover run via CTDB_SRVID_TAKEOVER_RUN. */
2313 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2314 struct ctdb_recoverd *rec,
2316 struct ctdb_node_map_old *nodemap)
2318 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2320 bool need_takeover_run = false;
2321 struct ctdb_public_ip_list_old *ips = NULL;
2323 /* If we are not the recmaster then do some housekeeping */
2324 if (rec->recmaster != pnn) {
2325 /* Ignore any IP reallocate requests - only recmaster
2328 TALLOC_FREE(rec->reallocate_requests);
2329 /* Clear any nodes that should be force rebalanced in
2330 * the next takeover run. If the recovery master role
2331 * has moved then we don't want to process these some
2332 * time in the future.
2334 TALLOC_FREE(rec->force_rebalance_nodes);
2337 /* Return early if disabled... */
2338 if (ctdb_config.failover_disabled ||
2339 ctdb_op_is_disabled(rec->takeover_run)) {
2343 if (interfaces_have_changed(ctdb, rec)) {
2344 need_takeover_run = true;
2347 /* If there are unhosted IPs but this node can host them then
2348 * trigger an IP reallocation */
2350 /* Read *available* IPs from local node */
2351 ret = ctdb_ctrl_get_public_ips_flags(
2352 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2353 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2355 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2356 talloc_free(mem_ctx);
/* pnn == -1 means the IP is currently unassigned; flags == 0 means
 * this node is fully healthy and could host it */
2360 for (j=0; j<ips->num; j++) {
2361 if (ips->ips[j].pnn == -1 &&
2362 nodemap->nodes[pnn].flags == 0) {
2363 DEBUG(DEBUG_WARNING,
2364 ("Unassigned IP %s can be served by this node\n",
2365 ctdb_addr_to_str(&ips->ips[j].addr)));
2366 need_takeover_run = true;
/* the interface-level verification below is optional */
2372 if (!ctdb->do_checkpublicip) {
2376 /* Validate the IP addresses that this node has on network
2377 * interfaces. If there is an inconsistency between reality
2378 * and the state expected by CTDB then try to fix it by
2379 * triggering an IP reallocation or releasing extraneous IP
2382 /* Read *known* IPs from local node */
2383 ret = ctdb_ctrl_get_public_ips_flags(
2384 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2386 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2387 talloc_free(mem_ctx);
2391 for (j=0; j<ips->num; j++) {
/* IP assigned to us but missing from our interfaces, or held by us
 * while assigned elsewhere — either way request a takeover run */
2392 if (ips->ips[j].pnn == pnn) {
2393 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2395 ("Assigned IP %s not on an interface\n",
2396 ctdb_addr_to_str(&ips->ips[j].addr)));
2397 need_takeover_run = true;
2400 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2402 ("IP %s incorrectly on an interface\n",
2403 ctdb_addr_to_str(&ips->ips[j].addr)));
2404 need_takeover_run = true;
2410 if (need_takeover_run) {
2411 struct ctdb_srvid_message rd;
2414 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2419 data.dptr = (uint8_t *)&rd;
2420 data.dsize = sizeof(rd);
2422 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2425 ("Failed to send takeover run request\n"));
2428 talloc_free(mem_ctx);
/* Async completion callback for GET_NODEMAP broadcasts: validate the
 * replying pnn against ctdb->num_nodes and steal the returned node map
 * into the remote_nodemaps array slot for that node. */
2433 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2435 struct ctdb_node_map_old **remote_nodemaps = callback_data;
/* guard against indexing past the array, which is sized num_nodes */
2437 if (node_pnn >= ctdb->num_nodes) {
2438 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2442 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Broadcast a GET_NODEMAP control to all active nodes and collect each
 * reply into remote_nodemaps (indexed by pnn) via
 * async_getnodemap_callback. */
2446 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2447 struct ctdb_node_map_old *nodemap,
2448 struct ctdb_node_map_old **remote_nodemaps)
2452 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2453 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2455 CONTROL_TIMEOUT(), false, tdb_null,
2456 async_getnodemap_callback,
2458 remote_nodemaps) != 0) {
2459 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Sanity-check the current recovery master and force a new election
 * when it is invalid: unknown at startup, lacking CAP_RECMASTER while we
 * have it, deleted from the node map, disconnected, or inactive by its
 * own account.  Returns whether the caller may proceed with the
 * current recmaster. */
2467 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2468 TALLOC_CTX *mem_ctx)
2470 struct ctdb_context *ctdb = rec->ctdb;
2471 uint32_t pnn = ctdb_get_pnn(ctdb);
2472 struct ctdb_node_map_old *nodemap = rec->nodemap;
2473 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2476 /* When recovery daemon is started, recmaster is set to
2477 * "unknown" so it knows to start an election.
2479 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2481 ("Initial recovery master set - forcing election\n"));
2482 force_election(rec, pnn, nodemap);
2487 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2488 * but we have, then force an election and try to become the new
2491 if (!ctdb_node_has_capabilities(rec->caps,
2493 CTDB_CAP_RECMASTER) &&
2494 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2495 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2497 (" Current recmaster node %u does not have CAP_RECMASTER,"
2498 " but we (node %u) have - force an election\n",
2499 rec->recmaster, pnn));
2500 force_election(rec, pnn, nodemap);
2504 /* Verify that the master node has not been deleted. This
2505 * should not happen because a node should always be shutdown
2506 * before being deleted, causing a new master to be elected
2507 * before now. However, if something strange has happened
2508 * then checking here will ensure we don't index beyond the
2509 * end of the nodemap array. */
2510 if (rec->recmaster >= nodemap->num) {
2512 ("Recmaster node %u has been deleted. Force election\n",
2514 force_election(rec, pnn, nodemap);
2518 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2519 if (nodemap->nodes[rec->recmaster].flags &
2520 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2522 ("Recmaster node %u is disconnected/deleted. Force election\n",
2524 force_election(rec, pnn, nodemap);
2528 /* get nodemap from the recovery master to check if it is inactive */
2529 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2530 mem_ctx, &recmaster_nodemap);
2534 " Unable to get nodemap from recovery master %u\n",
2536 /* No election, just error */
/* only force an election for an inactive recmaster while we ourselves
 * are active; an inactive node must not trigger elections */
2541 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2542 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2544 ("Recmaster node %u is inactive. Force election\n",
2547 * update our nodemap to carry the recmaster's notion of
2548 * its own flags, so that we don't keep freezing the
2549 * inactive recmaster node...
2551 nodemap->nodes[rec->recmaster].flags =
2552 recmaster_nodemap->nodes[rec->recmaster].flags;
2553 force_election(rec, pnn, nodemap);
2560 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2561 TALLOC_CTX *mem_ctx)
2564 struct ctdb_node_map_old *nodemap=NULL;
2565 struct ctdb_node_map_old **remote_nodemaps=NULL;
2566 struct ctdb_vnn_map *vnnmap=NULL;
2567 struct ctdb_vnn_map *remote_vnnmap=NULL;
2568 uint32_t num_lmasters;
2569 int32_t debug_level;
2574 /* verify that the main daemon is still running */
2575 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2576 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2580 /* ping the local daemon to tell it we are alive */
2581 ctdb_ctrl_recd_ping(ctdb);
2583 if (rec->election_timeout) {
2584 /* an election is in progress */
2588 /* read the debug level from the parent and update locally */
2589 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2591 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2594 DEBUGLEVEL = debug_level;
2596 /* get relevant tunables */
2597 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2599 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2604 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2605 CTDB_CURRENT_NODE, &ctdb->runstate);
2607 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2611 pnn = ctdb_get_pnn(ctdb);
2614 TALLOC_FREE(rec->nodemap);
2615 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2617 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2620 nodemap = rec->nodemap;
2622 /* remember our own node flags */
2623 rec->node_flags = nodemap->nodes[pnn].flags;
2625 ban_misbehaving_nodes(rec, &self_ban);
2627 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2631 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2632 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2634 D_ERR("Failed to read recmode from local node\n");
2638 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2639 also frozen and that the recmode is set to active.
2641 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2642 /* If this node has become inactive then we want to
2643 * reduce the chances of it taking over the recovery
2644 * master role when it becomes active again. This
2645 * helps to stabilise the recovery master role so that
2646 * it stays on the most stable node.
2648 rec->priority_time = timeval_current();
2650 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2651 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2653 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2655 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2660 if (! rec->frozen_on_inactive) {
2661 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2665 (__location__ " Failed to freeze node "
2666 "in STOPPED or BANNED state\n"));
2670 rec->frozen_on_inactive = true;
2673 /* If this node is stopped or banned then it is not the recovery
2674 * master, so don't do anything. This prevents stopped or banned
2675 * node from starting election and sending unnecessary controls.
2680 rec->frozen_on_inactive = false;
2682 /* Retrieve capabilities from all connected nodes */
2683 ret = update_capabilities(rec, nodemap);
2685 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2689 if (! validate_recovery_master(rec, mem_ctx)) {
2693 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2694 /* Check if an IP takeover run is needed and trigger one if
2696 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2699 /* if we are not the recmaster then we do not need to check
2700 if recovery is needed
2702 if (pnn != rec->recmaster) {
2707 /* ensure our local copies of flags are right */
2708 ret = update_local_flags(rec, nodemap);
2710 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2714 if (ctdb->num_nodes != nodemap->num) {
2715 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2716 ctdb_load_nodes_file(ctdb);
2720 /* verify that all active nodes agree that we are the recmaster */
2721 switch (verify_recmaster(rec, nodemap, pnn)) {
2722 case MONITOR_RECOVERY_NEEDED:
2723 /* can not happen */
2725 case MONITOR_ELECTION_NEEDED:
2726 force_election(rec, pnn, nodemap);
2730 case MONITOR_FAILED:
2735 /* get the vnnmap */
2736 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2738 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2742 if (rec->need_recovery) {
2743 /* a previous recovery didn't finish */
2744 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2748 /* verify that all active nodes are in normal mode
2749 and not in recovery mode
2751 switch (verify_recmode(ctdb, nodemap)) {
2752 case MONITOR_RECOVERY_NEEDED:
2753 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2755 case MONITOR_FAILED:
2757 case MONITOR_ELECTION_NEEDED:
2758 /* can not happen */
2764 if (ctdb->recovery_lock != NULL) {
2765 /* We must already hold the recovery lock */
2766 if (!ctdb_recovery_have_lock(rec)) {
2767 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2768 ctdb_set_culprit(rec, ctdb->pnn);
2769 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2775 /* If recoveries are disabled then there is no use doing any
2776 * nodemap or flags checks. Recoveries might be disabled due
2777 * to "reloadnodes", so doing these checks might cause an
2778 * unnecessary recovery. */
2779 if (ctdb_op_is_disabled(rec->recovery)) {
2780 goto takeover_run_checks;
2783 /* get the nodemap for all active remote nodes
2785 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2786 if (remote_nodemaps == NULL) {
2787 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2790 for(i=0; i<nodemap->num; i++) {
2791 remote_nodemaps[i] = NULL;
2793 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2794 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2798 /* verify that all other nodes have the same nodemap as we have
2800 for (j=0; j<nodemap->num; j++) {
2801 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2805 if (remote_nodemaps[j] == NULL) {
2806 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2807 ctdb_set_culprit(rec, j);
2812 /* if the nodes disagree on how many nodes there are
2813 then this is a good reason to try recovery
2815 if (remote_nodemaps[j]->num != nodemap->num) {
2816 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2817 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2818 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2819 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2823 /* if the nodes disagree on which nodes exist and are
2824 active, then that is also a good reason to do recovery
2826 for (i=0;i<nodemap->num;i++) {
2827 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2828 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2829 nodemap->nodes[j].pnn, i,
2830 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2831 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2832 do_recovery(rec, mem_ctx, pnn, nodemap,
2840 * Update node flags obtained from each active node. This ensure we have
2841 * up-to-date information for all the nodes.
2843 for (j=0; j<nodemap->num; j++) {
2844 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2847 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2850 for (j=0; j<nodemap->num; j++) {
2851 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2855 /* verify the flags are consistent
2857 for (i=0; i<nodemap->num; i++) {
2858 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2862 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2863 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2864 nodemap->nodes[j].pnn,
2865 nodemap->nodes[i].pnn,
2866 remote_nodemaps[j]->nodes[i].flags,
2867 nodemap->nodes[i].flags));
2869 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2870 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2871 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2872 do_recovery(rec, mem_ctx, pnn, nodemap,
2876 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2877 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2878 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2879 do_recovery(rec, mem_ctx, pnn, nodemap,
2888 /* count how many active nodes there are */
2890 for (i=0; i<nodemap->num; i++) {
2891 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2892 if (ctdb_node_has_capabilities(rec->caps,
2893 ctdb->nodes[i]->pnn,
2894 CTDB_CAP_LMASTER)) {
2901 /* There must be the same number of lmasters in the vnn map as
2902 * there are active nodes with the lmaster capability... or
2905 if (vnnmap->size != num_lmasters) {
2906 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2907 vnnmap->size, num_lmasters));
2908 ctdb_set_culprit(rec, ctdb->pnn);
2909 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2913 /* verify that all active nodes in the nodemap also exist in
2916 for (j=0; j<nodemap->num; j++) {
2917 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2920 if (nodemap->nodes[j].pnn == pnn) {
2924 for (i=0; i<vnnmap->size; i++) {
2925 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2929 if (i == vnnmap->size) {
2930 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2931 nodemap->nodes[j].pnn));
2932 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2933 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2939 /* verify that all other nodes have the same vnnmap
2940 and are from the same generation
2942 for (j=0; j<nodemap->num; j++) {
2943 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2946 if (nodemap->nodes[j].pnn == pnn) {
2950 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2951 mem_ctx, &remote_vnnmap);
2953 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2954 nodemap->nodes[j].pnn));
2958 /* verify the vnnmap generation is the same */
2959 if (vnnmap->generation != remote_vnnmap->generation) {
2960 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2961 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2962 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2963 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2967 /* verify the vnnmap size is the same */
2968 if (vnnmap->size != remote_vnnmap->size) {
2969 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2970 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2971 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2972 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2976 /* verify the vnnmap is the same */
2977 for (i=0;i<vnnmap->size;i++) {
2978 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2979 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2980 nodemap->nodes[j].pnn));
2981 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2982 do_recovery(rec, mem_ctx, pnn, nodemap,
2989 /* FIXME: Add remote public IP checking to ensure that nodes
2990 * have the IP addresses that are allocated to them. */
2992 takeover_run_checks:
2994 /* If there are IP takeover runs requested or the previous one
2995 * failed then perform one and notify the waiters */
2996 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2997 (rec->reallocate_requests || rec->need_takeover_run)) {
2998 process_ipreallocate_requests(ctdb, rec);
3002 static void recd_sig_term_handler(struct tevent_context *ev,
3003 struct tevent_signal *se, int signum,
3004 int count, void *dont_care,
3007 struct ctdb_recoverd *rec = talloc_get_type_abort(
3008 private_data, struct ctdb_recoverd);
3010 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
3011 ctdb_recovery_unlock(rec);
3017 the main monitoring loop
3019 static void monitor_cluster(struct ctdb_context *ctdb)
3021 struct tevent_signal *se;
3022 struct ctdb_recoverd *rec;
3024 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3026 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3027 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3030 rec->recmaster = CTDB_UNKNOWN_PNN;
3031 rec->recovery_lock_handle = NULL;
3033 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3034 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3036 rec->recovery = ctdb_op_init(rec, "recoveries");
3037 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3039 rec->priority_time = timeval_current();
3040 rec->frozen_on_inactive = false;
3042 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3043 recd_sig_term_handler, rec);
3045 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3049 /* register a message port for sending memory dumps */
3050 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3052 /* when a node is assigned banning credits */
3053 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3054 banning_handler, rec);
3056 /* register a message port for recovery elections */
3057 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3059 /* when nodes are disabled/enabled */
3060 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3062 /* when we are asked to puch out a flag change */
3063 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3065 /* register a message port for vacuum fetch */
3066 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3068 /* register a message port for reloadnodes */
3069 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3071 /* register a message port for performing a takeover run */
3072 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3074 /* register a message port for disabling the ip check for a short while */
3075 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3077 /* register a message port for forcing a rebalance of a node next
3079 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3081 /* Register a message port for disabling takeover runs */
3082 ctdb_client_set_message_handler(ctdb,
3083 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3084 disable_takeover_runs_handler, rec);
3086 /* Register a message port for disabling recoveries */
3087 ctdb_client_set_message_handler(ctdb,
3088 CTDB_SRVID_DISABLE_RECOVERIES,
3089 disable_recoveries_handler, rec);
3091 /* register a message port for detaching database */
3092 ctdb_client_set_message_handler(ctdb,
3093 CTDB_SRVID_DETACH_DATABASE,
3094 detach_database_handler, rec);
3097 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3098 struct timeval start;
3102 DEBUG(DEBUG_CRIT,(__location__
3103 " Failed to create temp context\n"));
3107 start = timeval_current();
3108 main_loop(ctdb, rec, mem_ctx);
3109 talloc_free(mem_ctx);
3111 /* we only check for recovery once every second */
3112 elapsed = timeval_elapsed(&start);
3113 if (elapsed < ctdb->tunable.recover_interval) {
3114 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3121 event handler for when the main ctdbd dies
3123 static void ctdb_recoverd_parent(struct tevent_context *ev,
3124 struct tevent_fd *fde,
3125 uint16_t flags, void *private_data)
3127 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3132 called regularly to verify that the recovery daemon is still running
3134 static void ctdb_check_recd(struct tevent_context *ev,
3135 struct tevent_timer *te,
3136 struct timeval yt, void *p)
3138 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3140 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3141 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3143 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3144 ctdb_restart_recd, ctdb);
3149 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3150 timeval_current_ofs(30, 0),
3151 ctdb_check_recd, ctdb);
3154 static void recd_sig_child_handler(struct tevent_context *ev,
3155 struct tevent_signal *se, int signum,
3156 int count, void *dont_care,
3159 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3164 pid = waitpid(-1, &status, WNOHANG);
3166 if (errno != ECHILD) {
3167 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3172 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3178 startup the recovery daemon as a child of the main ctdb daemon
3180 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3183 struct tevent_signal *se;
3184 struct tevent_fd *fde;
3187 if (pipe(fd) != 0) {
3191 ctdb->recoverd_pid = ctdb_fork(ctdb);
3192 if (ctdb->recoverd_pid == -1) {
3196 if (ctdb->recoverd_pid != 0) {
3197 talloc_free(ctdb->recd_ctx);
3198 ctdb->recd_ctx = talloc_new(ctdb);
3199 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3202 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3203 timeval_current_ofs(30, 0),
3204 ctdb_check_recd, ctdb);
3210 srandom(getpid() ^ time(NULL));
3212 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3217 prctl_set_comment("ctdb_recoverd");
3218 if (switch_from_server_to_client(ctdb) != 0) {
3219 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3223 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3225 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3226 ctdb_recoverd_parent, &fd[0]);
3227 tevent_fd_set_auto_close(fde);
3229 /* set up a handler to pick up sigchld */
3230 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3231 recd_sig_child_handler, ctdb);
3233 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3237 monitor_cluster(ctdb);
3239 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3244 shutdown the recovery daemon
3246 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3248 if (ctdb->recoverd_pid == 0) {
3252 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3253 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3255 TALLOC_FREE(ctdb->recd_ctx);
3256 TALLOC_FREE(ctdb->recd_ping_count);
3259 static void ctdb_restart_recd(struct tevent_context *ev,
3260 struct tevent_timer *te,
3261 struct timeval t, void *private_data)
3263 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3265 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3266 ctdb_stop_recoverd(ctdb);
3267 ctdb_start_recoverd(ctdb);