4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
49 /* List of SRVID requests that need to be processed */
/* NOTE(review): this excerpt is elided — struct/brace lines are missing
 * between the numbered lines below; do not treat it as compilable C. */
/* One queued SRVID request, linked via DLIST next/prev pointers. */
51 struct srvid_list *next, *prev;
52 struct ctdb_srvid_message *request;
/* Container that owns the queued request list (talloc parent of entries). */
55 struct srvid_requests {
56 struct srvid_list *requests;
/* Send a reply for one SRVID request back to the node that queued it.
 * The reply payload appears to carry a small status value (see the
 * failure path in srvid_request_add below) — confirm against callers. */
59 static void srvid_request_reply(struct ctdb_context *ctdb,
60 struct ctdb_srvid_message *request,
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request->srvid == 0) {
69 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
71 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
75 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request->pnn,
77 (unsigned long long)request->srvid));
/* Reply to every queued request with the given result, then free the
 * whole list in one go via the talloc parent. */
83 static void srvid_requests_reply(struct ctdb_context *ctdb,
84 struct srvid_requests **requests,
89 if (*requests == NULL) {
93 for (r = (*requests)->requests; r != NULL; r = r->next) {
94 srvid_request_reply(ctdb, r->request, result);
97 /* Free the list structure... */
98 TALLOC_FREE(*requests);
/* Queue a request for later processing; allocates the list container on
 * first use.  On allocation failure the request is answered immediately
 * with a failure result instead of being queued. */
101 static void srvid_request_add(struct ctdb_context *ctdb,
102 struct srvid_requests **requests,
103 struct ctdb_srvid_message *request)
105 struct srvid_list *t;
109 if (*requests == NULL) {
110 *requests = talloc_zero(ctdb, struct srvid_requests);
111 if (*requests == NULL) {
116 t = talloc_zero(*requests, struct srvid_list);
118 /* If *requests was just allocated above then free it */
119 if ((*requests)->requests == NULL) {
120 TALLOC_FREE(*requests);
/* Take ownership of the message so it lives as long as the list entry. */
125 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
126 DLIST_ADD((*requests)->requests, t);
131 /* Failed to add the request to the list. Send a fail. */
132 DEBUG(DEBUG_ERR, (__location__
133 " Out of memory, failed to queue SRVID request\n"));
135 result.dsize = sizeof(ret);
136 result.dptr = (uint8_t *)&ret;
137 srvid_request_reply(ctdb, request, result);
140 /* An abstraction to allow an operation (takeover runs, recoveries,
141 * ...) to be disabled for a given timeout */
/* Disabled == a re-enable timer is pending; in_progress tracks a run. */
142 struct ctdb_op_state {
143 struct tevent_timer *timer;
/* Allocate a zeroed op-state under mem_ctx; name is used in log messages. */
148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
150 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
153 state->in_progress = false;
/* Disabled is represented solely by the presence of the re-enable timer. */
160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
162 return state->timer != NULL;
/* Mark the operation as started; refuses while disabled. */
165 static bool ctdb_op_begin(struct ctdb_op_state *state)
167 if (ctdb_op_is_disabled(state)) {
169 ("Unable to begin - %s are disabled\n", state->name));
173 state->in_progress = true;
/* Mark the operation as finished.
 * NOTE(review): assignment inside return — this always yields false.
 * Intentional in upstream ctdb, but easy to misread; verify callers
 * do not expect a success indicator here. */
177 static bool ctdb_op_end(struct ctdb_op_state *state)
179 return state->in_progress = false;
182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
184 return state->in_progress;
/* Re-enable the operation by cancelling the pending re-enable timer. */
187 static void ctdb_op_enable(struct ctdb_op_state *state)
189 TALLOC_FREE(state->timer);
/* Timer callback: the disable window elapsed, re-enable the operation. */
192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
193 struct tevent_timer *te,
194 struct timeval yt, void *p)
196 struct ctdb_op_state *state =
197 talloc_get_type(p, struct ctdb_op_state)
199 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
200 ctdb_op_enable(state);
/* Disable the operation for `timeout` seconds; timeout 0 apparently
 * means "re-enable now" (elided branch above the first DEBUG).
 * Fails if the operation is currently in progress. */
203 static int ctdb_op_disable(struct ctdb_op_state *state,
204 struct tevent_context *ev,
208 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
209 ctdb_op_enable(state);
213 if (state->in_progress) {
215 ("Unable to disable %s - in progress\n", state->name));
219 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
220 state->name, timeout));
222 /* Clear any old timers */
223 talloc_free(state->timer);
225 /* Arrange for the timeout to occur */
226 state->timer = tevent_add_timer(ev, state,
227 timeval_current_ofs(timeout, 0),
228 ctdb_op_timeout_handler, state);
229 if (state->timer == NULL) {
230 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
/* Per-node banning credit accumulator (count field elided in excerpt). */
237 struct ctdb_banning_state {
239 struct timeval last_reported_time;
/* Opaque; defined further down in this file. */
242 struct ctdb_recovery_lock_handle;
245 private state of recovery daemon
247 struct ctdb_recoverd {
248 struct ctdb_context *ctdb;
250 uint32_t last_culprit_node;
251 struct ctdb_node_map_old *nodemap;
252 struct timeval priority_time;
253 bool need_takeover_run;
256 struct tevent_timer *send_election_te;
257 struct tevent_timer *election_timeout;
258 struct srvid_requests *reallocate_requests;
259 struct ctdb_op_state *takeover_run;
260 struct ctdb_op_state *recovery;
261 struct ctdb_iface_list_old *ifaces;
262 uint32_t *force_rebalance_nodes;
263 struct ctdb_node_capabilities *caps;
264 bool frozen_on_inactive;
265 struct ctdb_recovery_lock_handle *recovery_lock_handle;
/* Timeouts derived from tunables; both macros capture a local `ctdb`. */
268 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
269 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration (definition not in this excerpt). */
271 static void ctdb_restart_recd(struct tevent_context *ev,
272 struct tevent_timer *te, struct timeval t,
276 ban a node for a period of time
/* Issue a SET_BAN control for `pnn` lasting `ban_time` seconds. */
278 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
281 struct ctdb_context *ctdb = rec->ctdb;
282 struct ctdb_ban_state bantime;
284 if (!ctdb_validate_pnn(ctdb, pnn)) {
285 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
289 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
292 bantime.time = ban_time;
294 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
296 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
302 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
306 remember the trouble maker
/* Add `count` banning credits to node `culprit`; credits decay if the
 * node behaved for longer than the recovery grace period. */
308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
310 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
311 struct ctdb_banning_state *ban_state;
/* NOTE(review): looks off-by-one — culprit == num_nodes passes this
 * check but would index ctdb->nodes[culprit] out of bounds below;
 * verify whether `>=` is intended. */
313 if (culprit > ctdb->num_nodes) {
314 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
318 /* If we are banned or stopped, do not set other nodes as culprits */
319 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
320 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
/* Lazily allocate the per-node banning state under the node object. */
324 if (ctdb->nodes[culprit]->ban_state == NULL) {
325 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
326 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
330 ban_state = ctdb->nodes[culprit]->ban_state;
331 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
332 /* this was the first time in a long while this node
333 misbehaved so we will forgive any old transgressions.
335 ban_state->count = 0;
338 ban_state->count += count;
339 ban_state->last_reported_time = timeval_current();
340 rec->last_culprit_node = culprit;
344 remember the trouble maker
/* Convenience wrapper: one banning credit. */
346 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
348 ctdb_set_culprit_count(rec, culprit, 1);
352 Retrieve capabilities from all connected nodes
/* Fetch capabilities cluster-wide, cache them in rec->caps, and refresh
 * this node's own capability word in ctdb->capabilities. */
354 static int update_capabilities(struct ctdb_recoverd *rec,
355 struct ctdb_node_map_old *nodemap)
359 struct ctdb_node_capabilities *caps;
360 struct ctdb_context *ctdb = rec->ctdb;
362 tmp_ctx = talloc_new(rec);
363 CTDB_NO_MEMORY(ctdb, tmp_ctx);
365 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
366 CONTROL_TIMEOUT(), nodemap);
370 (__location__ " Failed to get node capabilities\n"));
371 talloc_free(tmp_ctx);
375 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
379 " Capabilities don't include current node.\n"));
380 talloc_free(tmp_ctx);
383 ctdb->capabilities = *capp;
/* Replace the cached copy; steal it out of tmp_ctx before freeing. */
385 TALLOC_FREE(rec->caps);
386 rec->caps = talloc_steal(rec, caps);
388 talloc_free(tmp_ctx);
393 change recovery mode on all nodes
/* Broadcast SET_RECMODE to all active nodes via async control. */
395 static int set_recovery_mode(struct ctdb_context *ctdb,
396 struct ctdb_recoverd *rec,
397 struct ctdb_node_map_old *nodemap,
404 tmp_ctx = talloc_new(ctdb);
405 CTDB_NO_MEMORY(ctdb, tmp_ctx);
407 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
409 data.dsize = sizeof(uint32_t);
410 data.dptr = (unsigned char *)&rec_mode;
412 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
418 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
419 talloc_free(tmp_ctx);
423 talloc_free(tmp_ctx);
428 ensure all other nodes have attached to any databases that we have
/* For every active remote node, compare its dbmap against our local
 * dbmap and issue CREATEDB controls for anything it is missing. */
430 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
431 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
434 struct ctdb_dbid_map_old *remote_dbmap;
436 /* verify that all other nodes have all our databases */
437 for (j=0; j<nodemap->num; j++) {
438 /* no need to check ourselves */
439 if (nodemap->nodes[j].pnn == pnn) {
442 /* don't check nodes that are unavailable */
443 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
447 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
448 mem_ctx, &remote_dbmap);
/* NOTE(review): this message logs our own pnn, but the failed fetch was
 * from nodemap->nodes[j].pnn — possibly misleading; verify intent. */
450 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
454 /* step through all local databases */
455 for (db=0; db<dbmap->num;db++) {
459 for (i=0;i<remote_dbmap->num;i++) {
460 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
464 /* the remote node already has this database */
465 if (i!=remote_dbmap->num) {
468 /* ok so we need to create this database */
469 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
470 dbmap->dbs[db].db_id, mem_ctx,
473 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
476 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
477 nodemap->nodes[j].pnn,
479 dbmap->dbs[db].flags, NULL);
481 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
492 ensure we are attached to any databases that anyone else is attached to
/* Mirror of the above: attach locally to every database any remote node
 * has, then re-read our own dbmap (via the out-param) so the caller
 * sees the updated list. */
494 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
495 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
498 struct ctdb_dbid_map_old *remote_dbmap;
500 /* verify that we have all database any other node has */
501 for (j=0; j<nodemap->num; j++) {
502 /* no need to check ourselves */
503 if (nodemap->nodes[j].pnn == pnn) {
506 /* don't check nodes that are unavailable */
507 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
511 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
512 mem_ctx, &remote_dbmap);
514 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
518 /* step through all databases on the remote node */
519 for (db=0; db<remote_dbmap->num;db++) {
522 for (i=0;i<(*dbmap)->num;i++) {
523 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
527 /* we already have this db locally */
528 if (i!=(*dbmap)->num) {
531 /* ok so we need to create this database and
534 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
535 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
537 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
538 nodemap->nodes[j].pnn));
541 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn,
543 remote_dbmap->dbs[db].flags, NULL);
545 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* Refresh the caller's dbmap now that we may have attached new dbs. */
548 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
550 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
560 update flags on all active nodes
/* Push one node's flags to the whole cluster via MODFLAGS. */
562 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
566 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
568 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
576 called when a vacuum fetch has completed - just free it and do the next one
578 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
585 * Process one element of the vacuum fetch list:
586 * Migrate it over to us with the special flag
587 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
/* Returns false only on a hard failure (e.g. call setup); records that
 * cannot be locked or are already local are skipped (elided branches). */
589 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
591 struct ctdb_rec_data_old *r)
593 struct ctdb_client_call_state *state;
595 struct ctdb_ltdb_header *hdr;
596 struct ctdb_call call;
599 call.call_id = CTDB_NULL_FUNC;
600 call.flags = CTDB_IMMEDIATE_MIGRATION;
601 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
/* The key is stored inline at the start of the marshalled record. */
603 call.key.dptr = &r->data[0];
604 call.key.dsize = r->keylen;
606 /* ensure we don't block this daemon - just skip a record if we can't get
608 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
/* NOTE(review): tdb_fetch mallocs data.dptr; the free() lines appear
 * elided in this excerpt — confirm each early-return path frees it. */
612 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
613 if (data.dptr == NULL) {
614 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
618 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
620 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
624 hdr = (struct ctdb_ltdb_header *)data.dptr;
625 if (hdr->dmaster == pnn) {
626 /* its already local */
628 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
/* Fire off the migration; the chain lock is dropped before waiting. */
634 state = ctdb_call_send(ctdb_db, &call);
635 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
637 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
640 state->async.fn = vacuum_fetch_callback;
641 state->async.private_data = NULL;
648 handler for vacuum fetch
/* SRVID handler: receives a marshalled buffer of records to migrate to
 * this node, resolves the database, attaches, and walks the records. */
650 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
653 struct ctdb_recoverd *rec = talloc_get_type(
654 private_data, struct ctdb_recoverd);
655 struct ctdb_context *ctdb = rec->ctdb;
656 struct ctdb_marshall_buffer *recs;
658 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
660 struct ctdb_dbid_map_old *dbmap=NULL;
661 uint8_t db_flags = 0;
662 struct ctdb_db_context *ctdb_db;
663 struct ctdb_rec_data_old *r;
665 recs = (struct ctdb_marshall_buffer *)data.dptr;
667 if (recs->count == 0) {
671 /* work out if the database is persistent */
672 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
674 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
/* Linear scan for the flags of the database carried in the buffer. */
678 for (i=0;i<dbmap->num;i++) {
679 if (dbmap->dbs[i].db_id == recs->db_id) {
680 db_flags = dbmap->dbs[i].flags;
684 if (i == dbmap->num) {
685 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
689 /* find the name of this database */
690 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
691 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
696 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, db_flags);
697 if (ctdb_db == NULL) {
698 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
/* Walk the packed records; each entry is length-prefixed, so advance by
 * r->length bytes each iteration. */
702 r = (struct ctdb_rec_data_old *)&recs->data[0];
703 while (recs->count) {
706 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
711 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
716 talloc_free(tmp_ctx);
721 * handler for database detach
/* SRVID handler: detach from the database whose id is in the payload. */
723 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
726 struct ctdb_recoverd *rec = talloc_get_type(
727 private_data, struct ctdb_recoverd);
728 struct ctdb_context *ctdb = rec->ctdb;
730 struct ctdb_db_context *ctdb_db;
/* Validate payload size before dereferencing it as a uint32_t. */
732 if (data.dsize != sizeof(db_id)) {
735 db_id = *(uint32_t *)data.dptr;
737 ctdb_db = find_ctdb_db(ctdb, db_id);
738 if (ctdb_db == NULL) {
739 /* database is not attached */
743 DLIST_REMOVE(ctdb->db_list, ctdb_db);
745 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
747 talloc_free(ctdb_db);
751 called when ctdb_wait_timeout should finish
/* Timer callback: flags completion for ctdb_wait_timeout's event loop. */
753 static void ctdb_wait_handler(struct tevent_context *ev,
754 struct tevent_timer *te,
755 struct timeval yt, void *p)
757 uint32_t *timed_out = (uint32_t *)p;
762 wait for a given number of seconds
/* Blocks inside tevent_loop_once() until the timer above fires;
 * fractional seconds are converted to microseconds for the timer. */
764 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
766 uint32_t timed_out = 0;
767 time_t usecs = (secs - (time_t)secs) * 1000000;
768 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
769 ctdb_wait_handler, &timed_out);
771 tevent_loop_once(ctdb->ev);
776 called when an election times out (ends)
/* Clearing election_timeout is what ctdb_wait_election polls for. */
778 static void ctdb_election_timeout(struct tevent_context *ev,
779 struct tevent_timer *te,
780 struct timeval t, void *p)
782 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
783 rec->election_timeout = NULL;
786 DEBUG(DEBUG_WARNING,("Election period ended\n"));
791 wait for an election to finish. It finished election_timeout seconds after
792 the last election packet is received
794 static void ctdb_wait_election(struct ctdb_recoverd *rec)
796 struct ctdb_context *ctdb = rec->ctdb;
797 while (rec->election_timeout) {
798 tevent_loop_once(ctdb->ev);
803 Update our local flags from all remote connected nodes.
804 This is only run when we are or we believe we are the recovery master
/* For each connected remote node, fetch its nodemap; on mismatch push
 * the remote node's own view of its flags cluster-wide and update the
 * caller's nodemap in place. A fetch failure marks that node culprit. */
806 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
809 struct ctdb_context *ctdb = rec->ctdb;
810 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
812 /* get the nodemap for all active remote nodes and verify
813 they are the same as for this node
815 for (j=0; j<nodemap->num; j++) {
816 struct ctdb_node_map_old *remote_nodemap=NULL;
819 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
822 if (nodemap->nodes[j].pnn == ctdb->pnn) {
826 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
827 mem_ctx, &remote_nodemap);
829 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
830 nodemap->nodes[j].pnn));
831 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
832 talloc_free(mem_ctx);
/* A node's authoritative flags are its own view of itself (index j). */
835 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
836 /* We should tell our daemon about this so it
837 updates its flags or else we will log the same
838 message again in the next iteration of recovery.
839 Since we are the recovery master we can just as
840 well update the flags on all nodes.
842 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
844 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
848 /* Update our local copy of the flags in the recovery
851 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
852 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
853 nodemap->nodes[j].flags));
854 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
856 talloc_free(remote_nodemap);
858 talloc_free(mem_ctx);
863 /* Create a new random generation id.
864 The generation id can not be the INVALID_GENERATION id
/* Loops (elided) until random() yields something != INVALID_GENERATION. */
866 static uint32_t new_generation(void)
871 generation = random();
873 if (generation != INVALID_GENERATION) {
/* Holding the lock is represented by a live lock handle. */
881 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
883 return (rec->recovery_lock_handle != NULL);
/* State for an in-flight or held recovery lock; `h` is the underlying
 * cluster mutex (other fields elided in this excerpt). */
886 struct ctdb_recovery_lock_handle {
890 struct ctdb_cluster_mutex_handle *h;
/* Cluster-mutex callback: status '0' means the lock was acquired. */
893 static void take_reclock_handler(char status,
897 struct ctdb_recovery_lock_handle *s =
898 (struct ctdb_recovery_lock_handle *) private_data;
900 s->locked = (status == '0') ;
903 * If unsuccessful then ensure the process has exited and that
904 * the file descriptor event handler has been cancelled
912 s->latency = latency;
917 ("Unable to take recovery lock - contention\n"));
921 DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
/* Forward declaration — NOTE(review): the call below passes 3 args; a
 * parameter line of this prototype appears elided here. */
927 static void force_election(struct ctdb_recoverd *rec,
929 struct ctdb_node_map_old *nodemap);
/* Called when the lock helper dies unexpectedly: drop our handle and
 * trigger a new election since we can no longer prove mastership. */
931 static void lost_reclock_handler(void *private_data)
933 struct ctdb_recoverd *rec = talloc_get_type_abort(
934 private_data, struct ctdb_recoverd);
936 D_ERR("Recovery lock helper terminated, triggering an election\n");
937 TALLOC_FREE(rec->recovery_lock_handle);
939 force_election(rec, ctdb_get_pnn(rec->ctdb), rec->nodemap);
/* Attempt to take the cluster-wide recovery lock. Starts the cluster
 * mutex helper, then pumps the event loop until the take/lost handlers
 * report a result. Returns true only if the lock was acquired. */
942 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
944 struct ctdb_context *ctdb = rec->ctdb;
945 struct ctdb_cluster_mutex_handle *h;
946 struct ctdb_recovery_lock_handle *s;
948 s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
950 DBG_ERR("Memory allocation error\n");
954 h = ctdb_cluster_mutex(s,
958 take_reclock_handler,
960 lost_reclock_handler,
/* Publish the handle before waiting so have_lock()/unlock() see it. */
967 rec->recovery_lock_handle = s;
971 tevent_loop_once(ctdb->ev);
975 TALLOC_FREE(rec->recovery_lock_handle);
/* Report how long the lock took; used for the reclock latency stat. */
979 ctdb_ctrl_report_recd_lock_latency(ctdb,
/* Release (or cancel the taking of) the recovery lock. */
986 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
988 if (rec->recovery_lock_handle == NULL) {
992 if (! rec->recovery_lock_handle->done) {
994 * Taking of recovery lock still in progress. Free
995 * the cluster mutex handle to release it but leave
996 * the recovery lock handle in place to allow taking
997 * of the lock to fail.
999 D_NOTICE("Cancelling recovery lock\n");
1000 TALLOC_FREE(rec->recovery_lock_handle->h);
1001 rec->recovery_lock_handle->done = true;
1002 rec->recovery_lock_handle->locked = false;
1006 D_NOTICE("Releasing recovery lock\n");
1007 TALLOC_FREE(rec->recovery_lock_handle);
/* Ban any node whose banning credits reached 2 * num_nodes; resets the
 * counter after banning. Sets *self_ban (elided line) if the banned
 * node is this one so the caller can abort recovery. */
1010 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1012 struct ctdb_context *ctdb = rec->ctdb;
1014 struct ctdb_banning_state *ban_state;
1017 for (i=0; i<ctdb->num_nodes; i++) {
1018 if (ctdb->nodes[i]->ban_state == NULL) {
1021 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* Banning threshold scales with cluster size. */
1022 if (ban_state->count < 2*ctdb->num_nodes) {
1026 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1027 ctdb->nodes[i]->pnn, ban_state->count,
1028 ctdb->tunable.recovery_ban_period));
1029 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1030 ban_state->count = 0;
1032 /* Banning ourself? */
1033 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/* State shared between helper_run and its pipe-read event handler
 * (fd pair, child pid, result, done flag — some fields elided). */
1039 struct helper_state {
/* Read the helper's 32-bit exit/result value from the pipe; a short
 * read is mapped to EPIPE. */
1046 static void helper_handler(struct tevent_context *ev,
1047 struct tevent_fd *fde,
1048 uint16_t flags, void *private_data)
1050 struct helper_state *state = talloc_get_type_abort(
1051 private_data, struct helper_state);
1054 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1055 if (ret != sizeof(state->result)) {
1056 state->result = EPIPE;
/* Fork a helper binary (`prog`) with args: write-fd, daemon socket name
 * and an optional extra argument, then pump the event loop until the
 * helper reports a result over the pipe. Aborts (killing the child) if
 * the recmaster changes while waiting — we have lost the election. */
1062 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1063 const char *prog, const char *arg, const char *type)
1065 struct helper_state *state;
1066 struct tevent_fd *fde;
/* Snapshot the recmaster so a change can be detected below. */
1069 uint32_t recmaster = rec->recmaster;
1071 state = talloc_zero(mem_ctx, struct helper_state);
1072 if (state == NULL) {
1073 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1079 ret = pipe(state->fd);
1082 ("Failed to create pipe for %s helper\n", type));
1086 set_close_on_exec(state->fd[0]);
1089 args = talloc_array(state, const char *, nargs);
1091 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* argv[0]: the write end of the pipe, passed as a decimal fd number. */
1095 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1096 if (args[0] == NULL) {
1097 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1100 args[1] = rec->ctdb->daemon.name;
1104 if (args[2] == NULL) {
1108 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1109 if (state->pid == -1) {
1111 ("Failed to create child for %s helper\n", type));
/* Parent no longer needs the write end; the child inherited it. */
1115 close(state->fd[1]);
1118 state->done = false;
1120 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1121 TEVENT_FD_READ, helper_handler, state);
1125 tevent_fd_set_auto_close(fde);
1127 while (!state->done) {
1128 tevent_loop_once(rec->ctdb->ev);
1130 /* If recmaster changes, we have lost election */
1131 if (recmaster != rec->recmaster) {
1132 D_ERR("Recmaster changed to %u, aborting %s\n",
1133 rec->recmaster, type);
1139 close(state->fd[0]);
1142 if (state->result != 0) {
/* Failure path: make sure the helper does not linger. */
1146 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/* Common cleanup: close whichever fds are still open, kill the child
 * if it was started. */
1151 if (state->fd[0] != -1) {
1152 close(state->fd[0]);
1154 if (state->fd[1] != -1) {
1155 close(state->fd[1]);
1157 if (state->pid != -1) {
1158 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/* Run the external takeover helper. Any forced-rebalance node PNNs are
 * passed as one comma-separated argument; the helper path is resolved
 * once and cached in a function-static buffer. */
1165 static int ctdb_takeover(struct ctdb_recoverd *rec,
1166 uint32_t *force_rebalance_nodes)
1168 static char prog[PATH_MAX+1] = "";
1172 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
1173 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
1174 "ctdb_takeover_helper")) {
1175 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
/* Build "pnn1,pnn2,..." — first element starts the string, the rest
 * are appended with a leading comma. */
1179 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1180 uint32_t pnn = force_rebalance_nodes[i];
1182 arg = talloc_asprintf(rec, "%u", pnn);
1184 arg = talloc_asprintf_append(arg, ",%u", pnn);
1187 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* Propagate the config switch to the helper via the environment. */
1192 if (ctdb_config.failover_disabled) {
1193 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
1195 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
1200 return helper_run(rec, rec, prog, arg, "takeover");
/* Orchestrate a takeover run: temporarily disable takeover runs on all
 * other connected nodes, run the takeover helper, then re-enable.
 * Returns true on success; on failure rec->need_takeover_run is set so
 * the run is retried later. */
1203 static bool do_takeover_run(struct ctdb_recoverd *rec,
1204 struct ctdb_node_map_old *nodemap)
1206 uint32_t *nodes = NULL;
1207 struct ctdb_disable_message dtr;
/* Remember the rebalance list so we only clear it if unchanged later. */
1210 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1214 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1216 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1217 DEBUG(DEBUG_ERR, (__location__
1218 " takeover run already in progress \n"));
1223 if (!ctdb_op_begin(rec->takeover_run)) {
1228 /* Disable IP checks (takeover runs, really) on other nodes
1229 * while doing this takeover run. This will stop those other
1230 * nodes from triggering takeover runs when they think they
1231 * should be hosting an IP but it isn't yet on an interface.
1232 * Don't wait for replies since a failure here might cause some
1233 * noise in the logs but will not actually cause a problem.
1236 dtr.srvid = 0; /* No reply */
1239 data.dptr = (uint8_t*)&dtr;
1240 data.dsize = sizeof(dtr);
1242 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1244 /* Disable for 60 seconds. This can be a tunable later if
1248 for (i = 0; i < talloc_array_length(nodes); i++) {
1249 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1250 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1252 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1256 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1258 /* Reenable takeover runs and IP checks on other nodes */
1260 for (i = 0; i < talloc_array_length(nodes); i++) {
1261 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1262 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1264 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1269 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1275 /* Takeover run was successful so clear force rebalance targets */
1276 if (rebalance_nodes == rec->force_rebalance_nodes) {
1277 TALLOC_FREE(rec->force_rebalance_nodes);
1279 DEBUG(DEBUG_WARNING,
1280 ("Rebalance target nodes changed during takeover run - not clearing\n"));
/* Schedule a retry if the run failed. */
1283 rec->need_takeover_run = !ok;
1285 ctdb_op_end(rec->takeover_run);
1287 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
/* Run the external database-recovery helper, passing a fresh generation
 * id as its argument and the state directory via the environment. */
1291 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1293 static char prog[PATH_MAX+1] = "";
1296 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1297 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1298 "ctdb_recovery_helper")) {
1299 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1302 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1304 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* NOTE(review): setenv return value is ignored here, unlike the
 * CTDB_DISABLE_IP_FAILOVER case in ctdb_takeover — verify intent. */
1308 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1310 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1314 we are the recmaster, and recovery is needed - start a recovery run
/* Main recovery driver, run only on the recovery master:
 *  1. sanity checks (still master, no election, not banned)
 *  2. take the recovery lock if one is configured
 *  3. synchronise databases and node flags across the cluster
 *  4. run parallel db recovery, then a takeover run
 *  5. on success, forgive ban credits and suppress re-recovery for
 *     rerecovery_timeout seconds. */
1316 static int do_recovery(struct ctdb_recoverd *rec,
1317 TALLOC_CTX *mem_ctx, uint32_t pnn,
1318 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1320 struct ctdb_context *ctdb = rec->ctdb;
1322 struct ctdb_dbid_map_old *dbmap;
1325 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1327 /* Check if the current node is still the recmaster. It's possible that
1328 * re-election has changed the recmaster.
1330 if (pnn != rec->recmaster) {
1332 ("Recovery master changed to %u, aborting recovery\n",
1337 /* if recovery fails, force it again */
1338 rec->need_recovery = true;
1340 if (!ctdb_op_begin(rec->recovery)) {
1344 if (rec->election_timeout) {
1345 /* an election is in progress */
1346 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1350 ban_misbehaving_nodes(rec, &self_ban);
1352 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
/* Recovery lock is optional; NULL means it is not configured. */
1356 if (ctdb->recovery_lock != NULL) {
1357 if (ctdb_recovery_have_lock(rec)) {
1358 D_NOTICE("Already holding recovery lock\n");
1362 D_NOTICE("Attempting to take recovery lock (%s)\n",
1363 ctdb->recovery_lock);
1365 ok = ctdb_recovery_lock(rec);
1367 D_ERR("Unable to take recovery lock\n");
/* Lost mastership while contending for the lock — not an error. */
1369 if (pnn != rec->recmaster) {
1370 D_NOTICE("Recovery master changed to %u,"
1371 " aborting recovery\n",
1373 rec->need_recovery = false;
1377 if (ctdb->runstate ==
1378 CTDB_RUNSTATE_FIRST_RECOVERY) {
1380 * First recovery? Perhaps
1381 * current node does not yet
1382 * know who the recmaster is.
1384 D_ERR("Retrying recovery\n");
/* Persistent contention: ban ourselves to break the livelock. */
1388 D_ERR("Abort recovery, "
1389 "ban this node for %u seconds\n",
1390 ctdb->tunable.recovery_ban_period);
1393 ctdb->tunable.recovery_ban_period);
1396 D_NOTICE("Recovery lock taken successfully\n");
1400 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1402 /* get a list of all databases */
1403 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1405 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1409 /* we do the db creation before we set the recovery mode, so the freeze happens
1410 on all databases we will be dealing with. */
1412 /* verify that we have all the databases any other node has */
1413 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1415 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1419 /* verify that all other nodes have all our databases */
1420 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1422 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1425 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1428 /* Retrieve capabilities from all connected nodes */
1429 ret = update_capabilities(rec, nodemap);
1431 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1436 update all nodes to have the same flags that we have
1438 for (i=0;i<nodemap->num;i++) {
1439 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
/* NOTE(review): the loop index `i` is passed where other call sites
 * use a PNN — correct only if index == PNN here; verify against the
 * update_flags_on_all_nodes signature. */
1443 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1445 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1446 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1448 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1454 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1456 ret = db_recovery_parallel(rec, mem_ctx);
1461 do_takeover_run(rec, nodemap);
1463 /* send a message to all clients telling them that the cluster
1464 has been reconfigured */
1465 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1466 CTDB_SRVID_RECONFIGURE, tdb_null);
1468 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1472 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1474 rec->need_recovery = false;
1475 ctdb_op_end(rec->recovery);
1477 /* we managed to complete a full recovery, make sure to forgive
1478 any past sins by the nodes that could now participate in the
1481 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1482 for (i=0;i<nodemap->num;i++) {
1483 struct ctdb_banning_state *ban_state;
1485 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1489 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1490 if (ban_state == NULL) {
1494 ban_state->count = 0;
1497 /* We just finished a recovery successfully.
1498 We now wait for rerecovery_timeout before we allow
1499 another recovery to take place.
1501 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1502 ctdb_op_disable(rec->recovery, ctdb->ev,
1503 ctdb->tunable.rerecovery_timeout);
/* Failure path (elided label above): end the op without clearing
 * need_recovery, so recovery is attempted again. */
1507 ctdb_op_end(rec->recovery);
1513 elections are won by first checking the number of connected nodes, then
1514 the priority time, then the pnn
1516 struct election_message {
/* number of connected nodes seen by the sender; more connectivity wins */
1517 	uint32_t num_connected;
/* when the sender's recovery daemon started; earlier (longer-running) wins */
1518 	struct timeval priority_time;
/* sender's node flags (banned/stopped nodes automatically lose) */
1520 	uint32_t node_flags;
1524 form this nodes election data
/* Fill *em with this node's election credentials: pnn, daemon start time,
 * count of connected nodes and our node flags. */
1526 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1529 	struct ctdb_node_map_old *nodemap;
1530 	struct ctdb_context *ctdb = rec->ctdb;
1534 	em->pnn = rec->ctdb->pnn;
1535 	em->priority_time = rec->priority_time;
1537 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1539 		DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
/* cache our own flags so the win/lose checks below can use them */
1543 	rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1544 	em->node_flags = rec->node_flags;
/* count every node that is not disconnected from us */
1546 	for (i=0;i<nodemap->num;i++) {
1547 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1548 			em->num_connected++;
1552 	/* we shouldnt try to win this election if we cant be a recmaster */
1553 	if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
/* zero connectivity and a "just started" time guarantee we lose */
1554 		em->num_connected = 0;
1555 		em->priority_time = timeval_current();
1558 	talloc_free(nodemap);
1562 see if the given election data wins
/* Return true if the remote election data in *em beats our own credentials
 * (built fresh via ctdb_election_data). Comparison order: capability and
 * banned/stopped status first, then connectivity, then priority time, then pnn. */
1564 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1566 	struct election_message myem;
1569 	ctdb_election_data(rec, &myem);
1571 	/* we cant win if we don't have the recmaster capability */
1572 	if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1576 	/* we cant win if we are banned */
1577 	if (rec->node_flags & NODE_FLAGS_BANNED) {
1581 	/* we cant win if we are stopped */
1582 	if (rec->node_flags & NODE_FLAGS_STOPPED) {
1586 	/* we will automatically win if the other node is banned */
1587 	if (em->node_flags & NODE_FLAGS_BANNED) {
1591 	/* we will automatically win if the other node is stopped */
1592 	if (em->node_flags & NODE_FLAGS_STOPPED) {
1596 	/* then the longest running node */
1598 	cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break: lower pnn wins */
1602 	cmp = (int)myem.pnn - (int)em->pnn;
1609 send out an election request
/* Optimistically claim the recmaster role locally, then broadcast our
 * election credentials to all nodes. Returns the result of the broadcast. */
1611 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1614 	TDB_DATA election_data;
1615 	struct election_message emsg;
1617 	struct ctdb_context *ctdb = rec->ctdb;
1619 	srvid = CTDB_SRVID_ELECTION;
1621 	ctdb_election_data(rec, &emsg);
/* emsg lives on the stack; the message payload is copied on send */
1623 	election_data.dsize = sizeof(struct election_message);
1624 	election_data.dptr = (unsigned char *)&emsg;
1627 	/* first we assume we will win the election and set
1628 	   recoverymaster to be ourself on the current node
1630 	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1631 				     CTDB_CURRENT_NODE, pnn);
1633 		DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1636 	rec->recmaster = pnn;
1638 	/* send an election message to all active nodes */
1639 	DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1640 	return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1644 we think we are winning the election - send a broadcast election request
/* Timer callback: re-broadcast our election request while we believe we are
 * winning, then clear the timer handle so it can be re-armed. */
1646 static void election_send_request(struct tevent_context *ev,
1647 				  struct tevent_timer *te,
1648 				  struct timeval t, void *p)
1650 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1653 	ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1655 		DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1658 	TALLOC_FREE(rec->send_election_te);
1662 handler for memory dumps
/* SRVID handler: produce a talloc memory-usage dump of this daemon and send
 * it back to the requester identified by the ctdb_srvid_message payload. */
1664 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1666 	struct ctdb_recoverd *rec = talloc_get_type(
1667 		private_data, struct ctdb_recoverd);
1668 	struct ctdb_context *ctdb = rec->ctdb;
1669 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1672 	struct ctdb_srvid_message *rd;
/* the payload must be exactly a return-address structure */
1674 	if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1675 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1676 		talloc_free(tmp_ctx);
1679 	rd = (struct ctdb_srvid_message *)data.dptr;
1681 	dump = talloc_zero(tmp_ctx, TDB_DATA);
1683 		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1684 		talloc_free(tmp_ctx);
1687 	ret = ctdb_dump_memory(ctdb, dump);
1689 		DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1690 		talloc_free(tmp_ctx);
1694 	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* reply directly to the requester's pnn/srvid with the dump blob */
1696 	ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1698 		DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1699 		talloc_free(tmp_ctx);
1703 	talloc_free(tmp_ctx);
1707 handler for reload_nodes
/* SRVID handler: re-read the nodes file; the payload is unused. */
1709 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1712 	struct ctdb_recoverd *rec = talloc_get_type(
1713 		private_data, struct ctdb_recoverd);
1715 	DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1717 	ctdb_load_nodes_file(rec->ctdb);
/* SRVID handler: record a node pnn whose IPs should be force-rebalanced on
 * the next takeover run. Only acted upon when we are the recovery master.
 * NOTE(review): the visible fragment does not show the timer that expires
 * this list — presumably freeing the old array cancels it (see comment at
 * 1744-1747); confirm against the full source. */
1721 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1724 	struct ctdb_recoverd *rec = talloc_get_type(
1725 		private_data, struct ctdb_recoverd);
1726 	struct ctdb_context *ctdb = rec->ctdb;
/* ignore unless we are the recmaster */
1731 	if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1735 	if (data.dsize != sizeof(uint32_t)) {
1736 		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1740 	pnn = *(uint32_t *)&data.dptr[0];
1742 	DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1744 	/* Copy any existing list of nodes. There's probably some
1745 	 * sort of realloc variant that will do this but we need to
1746 	 * make sure that freeing the old array also cancels the timer
1747 	 * event for the timeout... not sure if realloc will do that.
1749 	len = (rec->force_rebalance_nodes != NULL) ?
1750 		talloc_array_length(rec->force_rebalance_nodes) :
1753 	/* This allows duplicates to be added but they don't cause
1754 	 * harm.  A call to add a duplicate PNN arguably means that
1755 	 * the timeout should be reset, so this is the simplest
/* grow-by-one copy; old array is freed after the copy below */
1758 	t = talloc_zero_array(rec, uint32_t, len+1);
1759 	CTDB_NO_MEMORY_VOID(ctdb, t);
1761 	memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1765 	talloc_free(rec->force_rebalance_nodes);
1767 	rec->force_rebalance_nodes = t;
/* Validate a ctdb_disable_message payload, disable the given operation
 * (takeover runs or recoveries) for the requested timeout, and reply to the
 * sender with our PNN on success.
 * Fix: the size-mismatch diagnostic previously reported
 * sizeof(struct ctdb_srvid_message) as the expected size, while the check
 * itself compares against sizeof(struct ctdb_disable_message) — the log
 * message now matches the actual validation. */
1772 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1774 				    struct ctdb_op_state *op_state)
1776 	struct ctdb_disable_message *r;
1781 	/* Validate input data */
1782 	if (data.dsize != sizeof(struct ctdb_disable_message)) {
1783 		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1784 				 "expecting %lu\n", (long unsigned)data.dsize,
1785 				 (long unsigned)sizeof(struct ctdb_disable_message)));
1788 	if (data.dptr == NULL) {
1789 		DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1793 	r = (struct ctdb_disable_message *)data.dptr;
1794 	timeout = r->timeout;
1796 	ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1801 	/* Returning our PNN tells the caller that we succeeded */
1802 	ret = ctdb_get_pnn(ctdb);
1804 	result.dsize = sizeof(int32_t);
1805 	result.dptr  = (uint8_t *)&ret;
1806 	srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
/* SRVID handler: disable IP takeover runs for the requested timeout. */
1809 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1812 	struct ctdb_recoverd *rec = talloc_get_type(
1813 		private_data, struct ctdb_recoverd);
1815 	srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1818 /* Backward compatibility for this SRVID */
/* Legacy variant of disable_takeover_runs: the payload is a bare uint32_t
 * timeout instead of a ctdb_disable_message, and no reply is sent. */
1819 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1822 	struct ctdb_recoverd *rec = talloc_get_type(
1823 		private_data, struct ctdb_recoverd);
1826 	if (data.dsize != sizeof(uint32_t)) {
1827 		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1828 				 "expecting %lu\n", (long unsigned)data.dsize,
1829 				 (long unsigned)sizeof(uint32_t)));
1832 	if (data.dptr == NULL) {
1833 		DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1837 	timeout = *((uint32_t *)data.dptr);
1839 	ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
/* SRVID handler: disable recoveries for the requested timeout. */
1842 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1845 	struct ctdb_recoverd *rec = talloc_get_type(
1846 		private_data, struct ctdb_recoverd);
1848 	srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1852 handler for ip reallocate, just add it to the list of requests and
1853 handle this later in the monitor_cluster loop so we do not recurse
1854 with other requests to takeover_run()
1856 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1859 	struct ctdb_srvid_message *request;
1860 	struct ctdb_recoverd *rec = talloc_get_type(
1861 		private_data, struct ctdb_recoverd);
1863 	if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1864 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1868 	request = (struct ctdb_srvid_message *)data.dptr;
/* queue for later processing by process_ipreallocate_requests() */
1870 	srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
/* Run a takeover run for the IP-reallocation requests queued so far and
 * reply to each requester. The pending list is detached first so that
 * requests arriving during the run are processed in a later iteration.
 * Fix: the call to srvid_requests_reply() contained a mis-encoded
 * "&current" (rendered as the HTML entity residue "¤t"), which does
 * not compile; restored the address-of expression. */
1873 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1874 					  struct ctdb_recoverd *rec)
1878 	struct srvid_requests *current;
1880 	/* Only process requests that are currently pending.  More
1881 	 * might come in while the takeover run is in progress and
1882 	 * they will need to be processed later since they might
1883 	 * be in response flag changes.
1885 	current = rec->reallocate_requests;
1886 	rec->reallocate_requests = NULL;
1888 	if (do_takeover_run(rec, rec->nodemap)) {
/* our PNN in the reply signals success to the requesters */
1889 		ret = ctdb_get_pnn(ctdb);
1894 	result.dsize = sizeof(int32_t);
1895 	result.dptr  = (uint8_t *)&ret;
1897 	srvid_requests_reply(ctdb, &current, result);
1901  * handler for assigning banning credits
/* SRVID handler: assign enough culprit credits to a node to get it banned.
 * Only the recovery master acts on this message. */
1903 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1905 	struct ctdb_recoverd *rec = talloc_get_type(
1906 		private_data, struct ctdb_recoverd);
1909 	/* Ignore if we are not recmaster */
1910 	if (rec->ctdb->pnn != rec->recmaster) {
1914 	if (data.dsize != sizeof(uint32_t)) {
1915 		DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1920 	ban_pnn = *(uint32_t *)data.dptr;
/* nodemap->num credits at once pushes the node over the ban threshold */
1922 	ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1926 handler for recovery master elections
/* SRVID handler for election packets: refresh the election timeout, then
 * either contest the election (if our credentials win) or concede and
 * record the sender as the new recovery master. */
1928 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1930 	struct ctdb_recoverd *rec = talloc_get_type(
1931 		private_data, struct ctdb_recoverd);
1932 	struct ctdb_context *ctdb = rec->ctdb;
1934 	struct election_message *em = (struct election_message *)data.dptr;
1936 	/* Ignore election packets from ourself */
1937 	if (ctdb->pnn == em->pnn) {
1941 	/* we got an election packet - update the timeout for the election */
1942 	talloc_free(rec->election_timeout);
1943 	rec->election_timeout = tevent_add_timer(
1946 			timeval_current_ofs(0, 500000) :
1947 			timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1948 			ctdb_election_timeout, rec);
1950 	/* someone called an election. check their election data
1951 	   and if we disagree and we would rather be the elected node,
1952 	   send a new election message to all other nodes
1954 	if (ctdb_election_win(rec, em)) {
/* delay our counter-broadcast slightly; only arm the timer once */
1955 		if (!rec->send_election_te) {
1956 			rec->send_election_te = tevent_add_timer(
1958 					timeval_current_ofs(0, 500000),
1959 					election_send_request, rec);
/* we lost: stop contesting and give up the recovery lock if we hold it */
1965 	TALLOC_FREE(rec->send_election_te);
1967 	/* Release the recovery lock file */
1968 	if (ctdb_recovery_have_lock(rec)) {
1969 		ctdb_recovery_unlock(rec);
1972 	/* ok, let that guy become recmaster then */
1973 	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1974 				     CTDB_CURRENT_NODE, em->pnn);
1976 		DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1979 	rec->recmaster = em->pnn;
1986 force the start of the election process
/* Put the cluster into recovery mode, arm the election timeout, broadcast
 * our election request, and then wait for the election to settle. */
1988 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1989 			   struct ctdb_node_map_old *nodemap)
1992 	struct ctdb_context *ctdb = rec->ctdb;
1994 	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1996 	/* set all nodes to recovery mode to stop all internode traffic */
1997 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1999 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (re)arm the election timeout before sending our request */
2003 	talloc_free(rec->election_timeout);
2004 	rec->election_timeout = tevent_add_timer(
2007 			timeval_current_ofs(0, 500000) :
2008 			timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2009 			ctdb_election_timeout, rec);
2011 	ret = send_election_request(rec, pnn);
2013 		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2017 	/* wait for a few seconds to collect all responses */
2018 	ctdb_wait_election(rec);
2024 handler for when a node changes its flags
/* SRVID handler: a node's flags changed — fetch the local nodemap, find the
 * node, log the change, and update our cached copy of its flags. */
2026 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2028 	struct ctdb_recoverd *rec = talloc_get_type(
2029 		private_data, struct ctdb_recoverd);
2030 	struct ctdb_context *ctdb = rec->ctdb;
2032 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2033 	struct ctdb_node_map_old *nodemap=NULL;
2034 	TALLOC_CTX *tmp_ctx;
2037 	if (data.dsize != sizeof(*c)) {
2038 		DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2042 	tmp_ctx = talloc_new(ctdb);
2043 	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2045 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2047 		DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2048 		talloc_free(tmp_ctx);
/* locate the slot for the reporting node's pnn */
2053 	for (i=0;i<nodemap->num;i++) {
2054 		if (nodemap->nodes[i].pnn == c->pnn) break;
2057 	if (i == nodemap->num) {
2058 		DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2059 		talloc_free(tmp_ctx);
2063 	if (c->old_flags != c->new_flags) {
2064 		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2067 	nodemap->nodes[i].flags = c->new_flags;
2069 	talloc_free(tmp_ctx);
2073 handler for when we need to push out flag changes ot all other nodes
/* SRVID handler: fetch authoritative flags from the recmaster and push the
 * flag update for node c->pnn to every connected node. */
2075 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2078 	struct ctdb_recoverd *rec = talloc_get_type(
2079 		private_data, struct ctdb_recoverd);
2080 	struct ctdb_context *ctdb = rec->ctdb;
2082 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2083 	struct ctdb_node_map_old *nodemap=NULL;
2084 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2087 	/* read the node flags from the recmaster */
2088 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2091 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2092 		talloc_free(tmp_ctx);
/* guard against indexing past the recmaster's nodemap */
2095 	if (c->pnn >= nodemap->num) {
2096 		DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2097 		talloc_free(tmp_ctx);
2101 	/* send the flags update to all connected nodes */
2102 	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2104 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2105 				      nodes, 0, CONTROL_TIMEOUT(),
2109 		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2111 		talloc_free(tmp_ctx);
2115 	talloc_free(tmp_ctx);
/* Shared state for the async recmode verification fan-out. */
2119 struct verify_recmode_normal_data {
/* aggregated outcome across all nodes; starts MONITOR_OK */
2121 	enum monitor_result status;
/* Completion callback for one GETRECMODE control: record a failure or a
 * node found in recovery mode into the shared rmdata->status. */
2124 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2126 	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2129 	/* one more node has responded with recmode data*/
2132 	/* if we failed to get the recmode, then return an error and let
2133 	   the main loop try again.
2135 	if (state->state != CTDB_CONTROL_DONE) {
/* don't overwrite a more specific status already recorded */
2136 		if (rmdata->status == MONITOR_OK) {
2137 			rmdata->status = MONITOR_FAILED;
2142 	/* if we got a response, then the recmode will be stored in the
2145 	if (state->status != CTDB_RECOVERY_NORMAL) {
2146 		DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2147 		rmdata->status = MONITOR_RECOVERY_NEEDED;
2154 /* verify that all nodes are in normal recovery mode */
/* Fan out async GETRECMODE controls to every active node, pump the event
 * loop until all have answered, and return the aggregated monitor result. */
2155 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2157 	struct verify_recmode_normal_data *rmdata;
2158 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2159 	struct ctdb_client_control_state *state;
2160 	enum monitor_result status;
2163 	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2164 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2166 	rmdata->status = MONITOR_OK;
2168 	/* loop over all active nodes and send an async getrecmode call to
2170 	for (j=0; j<nodemap->num; j++) {
/* skip inactive (banned/stopped/disconnected) nodes */
2171 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2174 		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2176 					nodemap->nodes[j].pnn);
2177 		if (state == NULL) {
2178 			/* we failed to send the control, treat this as
2179 			   an error and try again next iteration
2181 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2182 			talloc_free(mem_ctx);
2183 			return MONITOR_FAILED;
2186 		/* set up the callback functions */
2187 		state->async.fn = verify_recmode_normal_callback;
2188 		state->async.private_data = rmdata;
2190 		/* one more control to wait for to complete */
2195 	/* now wait for up to the maximum number of seconds allowed
2196 	   or until all nodes we expect a response from has replied
2198 	while (rmdata->count > 0) {
2199 		tevent_loop_once(ctdb->ev);
2202 	status = rmdata->status;
2203 	talloc_free(mem_ctx);
/* Shared state for the async recmaster verification fan-out. */
2208 struct verify_recmaster_data {
2209 	struct ctdb_recoverd *rec;
/* aggregated outcome across all nodes; starts MONITOR_OK */
2212 	enum monitor_result status;
/* Completion callback for one GETRECMASTER control: flag a failure, or —
 * if a node disagrees about who the recmaster is — mark it as a culprit
 * and request a new election. */
2215 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2217 	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2220 	/* one more node has responded with recmaster data*/
2223 	/* if we failed to get the recmaster, then return an error and let
2224 	   the main loop try again.
2226 	if (state->state != CTDB_CONTROL_DONE) {
/* don't overwrite a more specific status already recorded */
2227 		if (rmdata->status == MONITOR_OK) {
2228 			rmdata->status = MONITOR_FAILED;
2233 	/* if we got a response, then the recmaster will be stored in the
2236 	if (state->status != rmdata->pnn) {
2237 		DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2238 		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2239 		rmdata->status = MONITOR_ELECTION_NEEDED;
2246 /* verify that all nodes agree that we are the recmaster */
/* Fan out async GETRECMASTER controls to every other active node, pump the
 * event loop until all have answered, and return the aggregated result. */
2247 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2249 	struct ctdb_context *ctdb = rec->ctdb;
2250 	struct verify_recmaster_data *rmdata;
2251 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2252 	struct ctdb_client_control_state *state;
2253 	enum monitor_result status;
2256 	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2257 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2261 	rmdata->status = MONITOR_OK;
2263 	/* loop over all active nodes and send an async getrecmaster call to
2265 	for (j=0; j<nodemap->num; j++) {
/* no need to ask the recmaster itself */
2266 		if (nodemap->nodes[j].pnn == rec->recmaster) {
2269 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2272 		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2274 					nodemap->nodes[j].pnn);
2275 		if (state == NULL) {
2276 			/* we failed to send the control, treat this as
2277 			   an error and try again next iteration
2279 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2280 			talloc_free(mem_ctx);
2281 			return MONITOR_FAILED;
2284 		/* set up the callback functions */
2285 		state->async.fn = verify_recmaster_callback;
2286 		state->async.private_data = rmdata;
2288 		/* one more control to wait for to complete */
2293 	/* now wait for up to the maximum number of seconds allowed
2294 	   or until all nodes we expect a response from has replied
2296 	while (rmdata->count > 0) {
2297 		tevent_loop_once(ctdb->ev);
2300 	status = rmdata->status;
2301 	talloc_free(mem_ctx);
/* Fetch the local interface list and compare it against the cached copy in
 * rec->ifaces. Returns true on the first fetch, on a count mismatch, or when
 * any interface's name or link state differs; the cache is refreshed before
 * returning. On fetch failure it conservatively reports "changed". */
2305 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2306 				    struct ctdb_recoverd *rec)
2308 	struct ctdb_iface_list_old *ifaces = NULL;
2309 	TALLOC_CTX *mem_ctx;
2312 	mem_ctx = talloc_new(NULL);
2314 	/* Read the interfaces from the local node */
2315 	if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2316 				 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2317 		DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2318 		/* We could return an error.  However, this will be
2319 		 * rare so we'll decide that the interfaces have
2320 		 * actually changed, just in case.
2322 		talloc_free(mem_ctx);
2327 		/* We haven't been here before so things have changed */
2328 		DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2330 	} else if (rec->ifaces->num != ifaces->num) {
2331 		/* Number of interfaces has changed */
2332 		DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2333 				     rec->ifaces->num, ifaces->num));
2336 		/* See if interface names or link states have changed */
2338 		for (i = 0; i < rec->ifaces->num; i++) {
2339 			struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
/* positional comparison: a reorder of interfaces also counts as a change */
2340 			if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2342 				       ("Interface in slot %d changed: %s => %s\n",
2343 					i, iface->name, ifaces->ifaces[i].name));
2347 			if (iface->link_state != ifaces->ifaces[i].link_state) {
2349 				       ("Interface %s changed state: %d => %d\n",
2350 					iface->name, iface->link_state,
2351 					ifaces->ifaces[i].link_state));
/* refresh the cached list; rec takes ownership of the new copy */
2358 	talloc_free(rec->ifaces);
2359 	rec->ifaces = talloc_steal(rec, ifaces);
2361 	talloc_free(mem_ctx);
2365 /* Check that the local allocation of public IP addresses is correct
2366  * and do some house-keeping */
/* Periodic check: detect interface changes, unhosted-but-hostable IPs, and
 * mismatches between CTDB's idea of hosted IPs and the interfaces' actual
 * addresses; if anything is off, ask the recmaster for a takeover run. */
2367 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2368 				      struct ctdb_recoverd *rec,
2370 				      struct ctdb_node_map_old *nodemap)
2372 	TALLOC_CTX *mem_ctx = talloc_new(NULL);
2374 	bool need_takeover_run = false;
2375 	struct ctdb_public_ip_list_old *ips = NULL;
2377 	/* If we are not the recmaster then do some housekeeping */
2378 	if (rec->recmaster != pnn) {
2379 		/* Ignore any IP reallocate requests - only recmaster
2382 		TALLOC_FREE(rec->reallocate_requests);
2383 		/* Clear any nodes that should be force rebalanced in
2384 		 * the next takeover run.  If the recovery master role
2385 		 * has moved then we don't want to process these some
2386 		 * time in the future.
2388 		TALLOC_FREE(rec->force_rebalance_nodes);
2391 	/* Return early if disabled... */
2392 	if (ctdb_config.failover_disabled ||
2393 	    ctdb_op_is_disabled(rec->takeover_run)) {
2397 	if (interfaces_have_changed(ctdb, rec)) {
2398 		need_takeover_run = true;
2401 	/* If there are unhosted IPs but this node can host them then
2402 	 * trigger an IP reallocation */
2404 	/* Read *available* IPs from local node */
2405 	ret = ctdb_ctrl_get_public_ips_flags(
2406 		ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2407 		CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2409 		DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2410 		talloc_free(mem_ctx);
/* pnn == -1 means the IP is currently unassigned; flags == 0 means this
 * node is fully healthy and could host it */
2414 	for (j=0; j<ips->num; j++) {
2415 		if (ips->ips[j].pnn == -1 &&
2416 		    nodemap->nodes[pnn].flags == 0) {
2417 			DEBUG(DEBUG_WARNING,
2418 			      ("Unassigned IP %s can be served by this node\n",
2419 			       ctdb_addr_to_str(&ips->ips[j].addr)));
2420 			need_takeover_run = true;
2426 	if (!ctdb->do_checkpublicip) {
2430 	/* Validate the IP addresses that this node has on network
2431 	 * interfaces.  If there is an inconsistency between reality
2432 	 * and the state expected by CTDB then try to fix it by
2433 	 * triggering an IP reallocation or releasing extraneous IP
2436 	/* Read *known* IPs from local node */
2437 	ret = ctdb_ctrl_get_public_ips_flags(
2438 		ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2440 		DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2441 		talloc_free(mem_ctx);
2445 	for (j=0; j<ips->num; j++) {
/* IP assigned to us but missing from an interface, or present on an
 * interface while assigned elsewhere — either way a run is needed */
2446 		if (ips->ips[j].pnn == pnn) {
2447 			if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2449 				       ("Assigned IP %s not on an interface\n",
2450 					ctdb_addr_to_str(&ips->ips[j].addr)));
2451 				need_takeover_run = true;
2454 			if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2456 				       ("IP %s incorrectly on an interface\n",
2457 					ctdb_addr_to_str(&ips->ips[j].addr)));
2458 				need_takeover_run = true;
2464 	if (need_takeover_run) {
2465 		struct ctdb_srvid_message rd;
2468 		DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2473 		data.dptr = (uint8_t *)&rd;
2474 		data.dsize = sizeof(rd);
2476 		ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2479 			      ("Failed to send takeover run request\n"));
2482 	talloc_free(mem_ctx);
/* Async GET_NODEMAP callback: stash the returned nodemap in the
 * remote_nodemaps array slot for the responding node, after a bounds check
 * on the pnn. The array takes talloc ownership of the reply buffer. */
2487 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2489 	struct ctdb_node_map_old **remote_nodemaps = callback_data;
2491 	if (node_pnn >= ctdb->num_nodes) {
2492 		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2496 	remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Collect the nodemap from every active node via an async broadcast;
 * results land in remote_nodemaps[] through async_getnodemap_callback. */
2500 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2501 			       struct ctdb_node_map_old *nodemap,
2502 			       struct ctdb_node_map_old **remote_nodemaps)
2506 	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2507 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2509 					CONTROL_TIMEOUT(), false, tdb_null,
2510 					async_getnodemap_callback,
2512 					remote_nodemaps) != 0) {
2513 		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Sanity-check the current recovery master and force a new election when it
 * is unknown, lacks CAP_RECMASTER (while we have it), has been deleted,
 * is disconnected, or reports itself inactive. Returns whether the caller
 * may proceed with this iteration (per the visible early-error path). */
2521 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2522 				     TALLOC_CTX *mem_ctx)
2524 	struct ctdb_context *ctdb = rec->ctdb;
2525 	uint32_t pnn = ctdb_get_pnn(ctdb);
2526 	struct ctdb_node_map_old *nodemap = rec->nodemap;
2527 	struct ctdb_node_map_old *recmaster_nodemap = NULL;
2530 	/* When recovery daemon is started, recmaster is set to
2531 	 * "unknown" so it knows to start an election.
2533 	if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2535 		      ("Initial recovery master set - forcing election\n"));
2536 		force_election(rec, pnn, nodemap);
2541 	 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2542 	 * but we have, then force an election and try to become the new
2545 	if (!ctdb_node_has_capabilities(rec->caps,
2547 					CTDB_CAP_RECMASTER) &&
2548 	    (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2549 	    !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2551 		      (" Current recmaster node %u does not have CAP_RECMASTER,"
2552 		       " but we (node %u) have - force an election\n",
2553 		       rec->recmaster, pnn));
2554 		force_election(rec, pnn, nodemap);
2558 	/* Verify that the master node has not been deleted.  This
2559 	 * should not happen because a node should always be shutdown
2560 	 * before being deleted, causing a new master to be elected
2561 	 * before now.  However, if something strange has happened
2562 	 * then checking here will ensure we don't index beyond the
2563 	 * end of the nodemap array. */
2564 	if (rec->recmaster >= nodemap->num) {
2566 		      ("Recmaster node %u has been deleted. Force election\n",
2568 		force_election(rec, pnn, nodemap);
2572 	/* if recovery master is disconnected/deleted we must elect a new recmaster */
2573 	if (nodemap->nodes[rec->recmaster].flags &
2574 	    (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2576 		      ("Recmaster node %u is disconnected/deleted. Force election\n",
2578 		force_election(rec, pnn, nodemap);
2582 	/* get nodemap from the recovery master to check if it is inactive */
2583 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2584 				   mem_ctx, &recmaster_nodemap);
2588 		      " Unable to get nodemap from recovery master %u\n",
2590 		/* No election, just error */
/* the recmaster's own view of its flags is authoritative for "inactive";
 * only act when we ourselves are active */
2595 	if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2596 	    (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2598 		      ("Recmaster node %u is inactive. Force election\n",
2601 		 * update our nodemap to carry the recmaster's notion of
2602 		 * its own flags, so that we don't keep freezing the
2603 		 * inactive recmaster node...
2605 		nodemap->nodes[rec->recmaster].flags =
2606 			recmaster_nodemap->nodes[rec->recmaster].flags;
2607 		force_election(rec, pnn, nodemap);
2614 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2615 TALLOC_CTX *mem_ctx)
2618 struct ctdb_node_map_old *nodemap=NULL;
2619 struct ctdb_node_map_old **remote_nodemaps=NULL;
2620 struct ctdb_vnn_map *vnnmap=NULL;
2621 struct ctdb_vnn_map *remote_vnnmap=NULL;
2622 uint32_t num_lmasters;
2623 int32_t debug_level;
2628 /* verify that the main daemon is still running */
2629 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2630 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2634 /* ping the local daemon to tell it we are alive */
2635 ctdb_ctrl_recd_ping(ctdb);
2637 if (rec->election_timeout) {
2638 /* an election is in progress */
2642 /* read the debug level from the parent and update locally */
2643 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2645 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2648 debuglevel_set(debug_level);
2650 /* get relevant tunables */
2651 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2653 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2658 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2659 CTDB_CURRENT_NODE, &ctdb->runstate);
2661 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2665 pnn = ctdb_get_pnn(ctdb);
2668 TALLOC_FREE(rec->nodemap);
2669 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2671 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2674 nodemap = rec->nodemap;
2676 /* remember our own node flags */
2677 rec->node_flags = nodemap->nodes[pnn].flags;
2679 ban_misbehaving_nodes(rec, &self_ban);
2681 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2685 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2686 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2688 D_ERR("Failed to read recmode from local node\n");
2692 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2693 also frozen and that the recmode is set to active.
2695 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2696 /* If this node has become inactive then we want to
2697 * reduce the chances of it taking over the recovery
2698 * master role when it becomes active again. This
2699 * helps to stabilise the recovery master role so that
2700 * it stays on the most stable node.
2702 rec->priority_time = timeval_current();
2704 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2705 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2707 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2709 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2714 if (! rec->frozen_on_inactive) {
2715 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2719 (__location__ " Failed to freeze node "
2720 "in STOPPED or BANNED state\n"));
2724 rec->frozen_on_inactive = true;
2727 /* If this node is stopped or banned then it is not the recovery
2728 * master, so don't do anything. This prevents stopped or banned
2729 * node from starting election and sending unnecessary controls.
2734 rec->frozen_on_inactive = false;
2736 /* Retrieve capabilities from all connected nodes */
2737 ret = update_capabilities(rec, nodemap);
2739 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2743 if (! validate_recovery_master(rec, mem_ctx)) {
2747 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2748 /* Check if an IP takeover run is needed and trigger one if
2750 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2753 /* if we are not the recmaster then we do not need to check
2754 if recovery is needed
2756 if (pnn != rec->recmaster) {
2761 /* ensure our local copies of flags are right */
2762 ret = update_local_flags(rec, nodemap);
2764 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2768 if (ctdb->num_nodes != nodemap->num) {
2769 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2770 ctdb_load_nodes_file(ctdb);
2774 /* verify that all active nodes agree that we are the recmaster */
2775 switch (verify_recmaster(rec, nodemap, pnn)) {
2776 case MONITOR_RECOVERY_NEEDED:
2777 /* can not happen */
2779 case MONITOR_ELECTION_NEEDED:
2780 force_election(rec, pnn, nodemap);
2784 case MONITOR_FAILED:
2789 /* get the vnnmap */
2790 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2792 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2796 if (rec->need_recovery) {
2797 /* a previous recovery didn't finish */
2798 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2802 /* verify that all active nodes are in normal mode
2803 and not in recovery mode
2805 switch (verify_recmode(ctdb, nodemap)) {
2806 case MONITOR_RECOVERY_NEEDED:
2807 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2809 case MONITOR_FAILED:
2811 case MONITOR_ELECTION_NEEDED:
2812 /* can not happen */
2818 if (ctdb->recovery_lock != NULL) {
2819 /* We must already hold the recovery lock */
2820 if (!ctdb_recovery_have_lock(rec)) {
2821 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2822 ctdb_set_culprit(rec, ctdb->pnn);
2823 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2829 /* If recoveries are disabled then there is no use doing any
2830 * nodemap or flags checks. Recoveries might be disabled due
2831 * to "reloadnodes", so doing these checks might cause an
2832 * unnecessary recovery. */
2833 if (ctdb_op_is_disabled(rec->recovery)) {
2834 goto takeover_run_checks;
2837 /* get the nodemap for all active remote nodes
2839 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2840 if (remote_nodemaps == NULL) {
2841 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2844 for(i=0; i<nodemap->num; i++) {
2845 remote_nodemaps[i] = NULL;
2847 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2848 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2852 /* verify that all other nodes have the same nodemap as we have
2854 for (j=0; j<nodemap->num; j++) {
2855 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2859 if (remote_nodemaps[j] == NULL) {
2860 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2861 ctdb_set_culprit(rec, j);
2866 /* if the nodes disagree on how many nodes there are
2867 then this is a good reason to try recovery
2869 if (remote_nodemaps[j]->num != nodemap->num) {
2870 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2871 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2872 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2873 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2877 /* if the nodes disagree on which nodes exist and are
2878 active, then that is also a good reason to do recovery
2880 for (i=0;i<nodemap->num;i++) {
2881 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2882 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2883 nodemap->nodes[j].pnn, i,
2884 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2885 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2886 do_recovery(rec, mem_ctx, pnn, nodemap,
2894 * Update node flags obtained from each active node. This ensure we have
2895 * up-to-date information for all the nodes.
2897 for (j=0; j<nodemap->num; j++) {
2898 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2901 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2904 for (j=0; j<nodemap->num; j++) {
2905 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2909 /* verify the flags are consistent
2911 for (i=0; i<nodemap->num; i++) {
2912 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2916 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2917 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2918 nodemap->nodes[j].pnn,
2919 nodemap->nodes[i].pnn,
2920 remote_nodemaps[j]->nodes[i].flags,
2921 nodemap->nodes[i].flags));
2923 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2924 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2925 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2926 do_recovery(rec, mem_ctx, pnn, nodemap,
2930 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2931 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2932 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2933 do_recovery(rec, mem_ctx, pnn, nodemap,
2942 /* count how many active nodes there are */
2944 for (i=0; i<nodemap->num; i++) {
2945 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2946 if (ctdb_node_has_capabilities(rec->caps,
2947 ctdb->nodes[i]->pnn,
2948 CTDB_CAP_LMASTER)) {
2955 /* There must be the same number of lmasters in the vnn map as
2956 * there are active nodes with the lmaster capability... or
2959 if (vnnmap->size != num_lmasters) {
2960 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2961 vnnmap->size, num_lmasters));
2962 ctdb_set_culprit(rec, ctdb->pnn);
2963 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2967 /* verify that all active nodes in the nodemap also exist in
2970 for (j=0; j<nodemap->num; j++) {
2971 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2974 if (nodemap->nodes[j].pnn == pnn) {
2978 for (i=0; i<vnnmap->size; i++) {
2979 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2983 if (i == vnnmap->size) {
2984 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2985 nodemap->nodes[j].pnn));
2986 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2987 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2993 /* verify that all other nodes have the same vnnmap
2994 and are from the same generation
2996 for (j=0; j<nodemap->num; j++) {
2997 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3000 if (nodemap->nodes[j].pnn == pnn) {
3004 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3005 mem_ctx, &remote_vnnmap);
3007 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3008 nodemap->nodes[j].pnn));
3012 /* verify the vnnmap generation is the same */
3013 if (vnnmap->generation != remote_vnnmap->generation) {
3014 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3015 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3016 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3017 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3021 /* verify the vnnmap size is the same */
3022 if (vnnmap->size != remote_vnnmap->size) {
3023 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3024 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3025 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3026 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3030 /* verify the vnnmap is the same */
3031 for (i=0;i<vnnmap->size;i++) {
3032 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3033 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3034 nodemap->nodes[j].pnn));
3035 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3036 do_recovery(rec, mem_ctx, pnn, nodemap,
3043 /* FIXME: Add remote public IP checking to ensure that nodes
3044 * have the IP addresses that are allocated to them. */
3046 takeover_run_checks:
3048 /* If there are IP takeover runs requested or the previous one
3049 * failed then perform one and notify the waiters */
3050 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3051 (rec->reallocate_requests || rec->need_takeover_run)) {
3052 process_ipreallocate_requests(ctdb, rec);
3056 static void recd_sig_term_handler(struct tevent_context *ev,
3057 struct tevent_signal *se, int signum,
3058 int count, void *dont_care,
3061 struct ctdb_recoverd *rec = talloc_get_type_abort(
3062 private_data, struct ctdb_recoverd);
3064 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
3065 ctdb_recovery_unlock(rec);
3071 the main monitoring loop
3073 static void monitor_cluster(struct ctdb_context *ctdb)
3075 struct tevent_signal *se;
3076 struct ctdb_recoverd *rec;
3078 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3080 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3081 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3084 rec->recmaster = CTDB_UNKNOWN_PNN;
3085 rec->recovery_lock_handle = NULL;
3087 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3088 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3090 rec->recovery = ctdb_op_init(rec, "recoveries");
3091 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3093 rec->priority_time = timeval_current();
3094 rec->frozen_on_inactive = false;
3096 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3097 recd_sig_term_handler, rec);
3099 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3103 /* register a message port for sending memory dumps */
3104 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3106 /* when a node is assigned banning credits */
3107 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3108 banning_handler, rec);
3110 /* register a message port for recovery elections */
3111 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3113 /* when nodes are disabled/enabled */
3114 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3116 /* when we are asked to puch out a flag change */
3117 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3119 /* register a message port for vacuum fetch */
3120 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3122 /* register a message port for reloadnodes */
3123 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3125 /* register a message port for performing a takeover run */
3126 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3128 /* register a message port for disabling the ip check for a short while */
3129 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3131 /* register a message port for forcing a rebalance of a node next
3133 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3135 /* Register a message port for disabling takeover runs */
3136 ctdb_client_set_message_handler(ctdb,
3137 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3138 disable_takeover_runs_handler, rec);
3140 /* Register a message port for disabling recoveries */
3141 ctdb_client_set_message_handler(ctdb,
3142 CTDB_SRVID_DISABLE_RECOVERIES,
3143 disable_recoveries_handler, rec);
3145 /* register a message port for detaching database */
3146 ctdb_client_set_message_handler(ctdb,
3147 CTDB_SRVID_DETACH_DATABASE,
3148 detach_database_handler, rec);
3151 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3152 struct timeval start;
3156 DEBUG(DEBUG_CRIT,(__location__
3157 " Failed to create temp context\n"));
3161 start = timeval_current();
3162 main_loop(ctdb, rec, mem_ctx);
3163 talloc_free(mem_ctx);
3165 /* we only check for recovery once every second */
3166 elapsed = timeval_elapsed(&start);
3167 if (elapsed < ctdb->tunable.recover_interval) {
3168 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3175 event handler for when the main ctdbd dies
3177 static void ctdb_recoverd_parent(struct tevent_context *ev,
3178 struct tevent_fd *fde,
3179 uint16_t flags, void *private_data)
3181 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3186 called regularly to verify that the recovery daemon is still running
3188 static void ctdb_check_recd(struct tevent_context *ev,
3189 struct tevent_timer *te,
3190 struct timeval yt, void *p)
3192 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3194 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3195 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3197 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3198 ctdb_restart_recd, ctdb);
3203 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3204 timeval_current_ofs(30, 0),
3205 ctdb_check_recd, ctdb);
3208 static void recd_sig_child_handler(struct tevent_context *ev,
3209 struct tevent_signal *se, int signum,
3210 int count, void *dont_care,
3213 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3218 pid = waitpid(-1, &status, WNOHANG);
3220 if (errno != ECHILD) {
3221 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3226 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3232 startup the recovery daemon as a child of the main ctdb daemon
3234 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3237 struct tevent_signal *se;
3238 struct tevent_fd *fde;
3241 if (pipe(fd) != 0) {
3245 ctdb->recoverd_pid = ctdb_fork(ctdb);
3246 if (ctdb->recoverd_pid == -1) {
3250 if (ctdb->recoverd_pid != 0) {
3251 talloc_free(ctdb->recd_ctx);
3252 ctdb->recd_ctx = talloc_new(ctdb);
3253 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3256 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3257 timeval_current_ofs(30, 0),
3258 ctdb_check_recd, ctdb);
3264 srandom(getpid() ^ time(NULL));
3266 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3271 prctl_set_comment("ctdb_recoverd");
3272 if (switch_from_server_to_client(ctdb) != 0) {
3273 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3277 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3279 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3280 ctdb_recoverd_parent, &fd[0]);
3281 tevent_fd_set_auto_close(fde);
3283 /* set up a handler to pick up sigchld */
3284 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3285 recd_sig_child_handler, ctdb);
3287 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3291 monitor_cluster(ctdb);
3293 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3298 shutdown the recovery daemon
3300 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3302 if (ctdb->recoverd_pid == 0) {
3306 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3307 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3309 TALLOC_FREE(ctdb->recd_ctx);
3310 TALLOC_FREE(ctdb->recd_ping_count);
3313 static void ctdb_restart_recd(struct tevent_context *ev,
3314 struct tevent_timer *te,
3315 struct timeval t, void *private_data)
3317 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3319 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3320 ctdb_stop_recoverd(ctdb);
3321 ctdb_start_recoverd(ctdb);