4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
/* List of SRVID requests that need to be processed */
struct srvid_list {
	struct srvid_list *next, *prev; /* DLIST linkage */
	struct ctdb_srvid_message *request;
};

struct srvid_requests {
	struct srvid_list *requests;
};
59 static void srvid_request_reply(struct ctdb_context *ctdb,
60 struct ctdb_srvid_message *request,
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request->srvid == 0) {
69 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
71 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
75 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request->pnn,
77 (unsigned long long)request->srvid));
83 static void srvid_requests_reply(struct ctdb_context *ctdb,
84 struct srvid_requests **requests,
89 if (*requests == NULL) {
93 for (r = (*requests)->requests; r != NULL; r = r->next) {
94 srvid_request_reply(ctdb, r->request, result);
97 /* Free the list structure... */
98 TALLOC_FREE(*requests);
101 static void srvid_request_add(struct ctdb_context *ctdb,
102 struct srvid_requests **requests,
103 struct ctdb_srvid_message *request)
105 struct srvid_list *t;
109 if (*requests == NULL) {
110 *requests = talloc_zero(ctdb, struct srvid_requests);
111 if (*requests == NULL) {
116 t = talloc_zero(*requests, struct srvid_list);
118 /* If *requests was just allocated above then free it */
119 if ((*requests)->requests == NULL) {
120 TALLOC_FREE(*requests);
125 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
126 DLIST_ADD((*requests)->requests, t);
131 /* Failed to add the request to the list. Send a fail. */
132 DEBUG(DEBUG_ERR, (__location__
133 " Out of memory, failed to queue SRVID request\n"));
135 result.dsize = sizeof(ret);
136 result.dptr = (uint8_t *)&ret;
137 srvid_request_reply(ctdb, request, result);
140 /* An abstraction to allow an operation (takeover runs, recoveries,
141 * ...) to be disabled for a given timeout */
142 struct ctdb_op_state {
143 struct tevent_timer *timer;
148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
150 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
153 state->in_progress = false;
160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
162 return state->timer != NULL;
165 static bool ctdb_op_begin(struct ctdb_op_state *state)
167 if (ctdb_op_is_disabled(state)) {
169 ("Unable to begin - %s are disabled\n", state->name));
173 state->in_progress = true;
177 static bool ctdb_op_end(struct ctdb_op_state *state)
179 return state->in_progress = false;
182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
184 return state->in_progress;
187 static void ctdb_op_enable(struct ctdb_op_state *state)
189 TALLOC_FREE(state->timer);
192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
193 struct tevent_timer *te,
194 struct timeval yt, void *p)
196 struct ctdb_op_state *state =
197 talloc_get_type(p, struct ctdb_op_state);
199 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
200 ctdb_op_enable(state);
203 static int ctdb_op_disable(struct ctdb_op_state *state,
204 struct tevent_context *ev,
208 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
209 ctdb_op_enable(state);
213 if (state->in_progress) {
215 ("Unable to disable %s - in progress\n", state->name));
219 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
220 state->name, timeout));
222 /* Clear any old timers */
223 talloc_free(state->timer);
225 /* Arrange for the timeout to occur */
226 state->timer = tevent_add_timer(ev, state,
227 timeval_current_ofs(timeout, 0),
228 ctdb_op_timeout_handler, state);
229 if (state->timer == NULL) {
230 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
237 struct ctdb_banning_state {
239 struct timeval last_reported_time;
242 struct ctdb_recovery_lock_handle;
245 private state of recovery daemon
247 struct ctdb_recoverd {
248 struct ctdb_context *ctdb;
250 uint32_t last_culprit_node;
251 struct ctdb_node_map_old *nodemap;
252 struct timeval priority_time;
253 bool need_takeover_run;
256 struct tevent_timer *send_election_te;
257 struct tevent_timer *election_timeout;
258 struct srvid_requests *reallocate_requests;
259 struct ctdb_op_state *takeover_run;
260 struct ctdb_op_state *recovery;
261 struct ctdb_iface_list_old *ifaces;
262 uint32_t *force_rebalance_nodes;
263 struct ctdb_node_capabilities *caps;
264 bool frozen_on_inactive;
265 struct ctdb_recovery_lock_handle *recovery_lock_handle;
268 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
269 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration: timer callback that restarts the recovery daemon. */
static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te, struct timeval t,
			      void *private_data);
276 ban a node for a period of time
278 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
281 struct ctdb_context *ctdb = rec->ctdb;
282 struct ctdb_ban_state bantime;
284 if (!ctdb_validate_pnn(ctdb, pnn)) {
285 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
289 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
292 bantime.time = ban_time;
294 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
296 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Outcome of a cluster-monitoring pass by the recovery daemon. */
enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
306 remember the trouble maker
308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
310 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
311 struct ctdb_banning_state *ban_state;
313 if (culprit > ctdb->num_nodes) {
314 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
318 /* If we are banned or stopped, do not set other nodes as culprits */
319 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
320 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
324 if (ctdb->nodes[culprit]->ban_state == NULL) {
325 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
326 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
330 ban_state = ctdb->nodes[culprit]->ban_state;
331 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
332 /* this was the first time in a long while this node
333 misbehaved so we will forgive any old transgressions.
335 ban_state->count = 0;
338 ban_state->count += count;
339 ban_state->last_reported_time = timeval_current();
340 rec->last_culprit_node = culprit;
/*
  remember the trouble maker - convenience wrapper adding one credit
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
352 Retrieve capabilities from all connected nodes
354 static int update_capabilities(struct ctdb_recoverd *rec,
355 struct ctdb_node_map_old *nodemap)
359 struct ctdb_node_capabilities *caps;
360 struct ctdb_context *ctdb = rec->ctdb;
362 tmp_ctx = talloc_new(rec);
363 CTDB_NO_MEMORY(ctdb, tmp_ctx);
365 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
366 CONTROL_TIMEOUT(), nodemap);
370 (__location__ " Failed to get node capabilities\n"));
371 talloc_free(tmp_ctx);
375 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
379 " Capabilities don't include current node.\n"));
380 talloc_free(tmp_ctx);
383 ctdb->capabilities = *capp;
385 TALLOC_FREE(rec->caps);
386 rec->caps = talloc_steal(rec, caps);
388 talloc_free(tmp_ctx);
393 change recovery mode on all nodes
395 static int set_recovery_mode(struct ctdb_context *ctdb,
396 struct ctdb_recoverd *rec,
397 struct ctdb_node_map_old *nodemap,
404 tmp_ctx = talloc_new(ctdb);
405 CTDB_NO_MEMORY(ctdb, tmp_ctx);
407 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
409 data.dsize = sizeof(uint32_t);
410 data.dptr = (unsigned char *)&rec_mode;
412 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
418 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
419 talloc_free(tmp_ctx);
423 talloc_free(tmp_ctx);
/*
  ensure all other nodes have attached to any databases that we have

  NOTE(review): the extracted text below is missing interior lines
  (braces, declarations, continue/break statements); tokens are kept
  exactly as found and only comments have been added.
 */
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
struct ctdb_dbid_map_old *remote_dbmap;
/* verify that all other nodes have all our databases */
for (j=0; j<nodemap->num; j++) {
/* we don't need to ourself ourselves */
if (nodemap->nodes[j].pnn == pnn) {
/* don't check nodes that are unavailable */
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* fetch the remote node's view of attached databases */
ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
mem_ctx, &remote_dbmap);
DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
/* step through all local databases */
for (db=0; db<dbmap->num;db++) {
/* search for this local db_id in the remote dbmap */
for (i=0;i<remote_dbmap->num;i++) {
if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
/* the remote node already have this database */
if (i!=remote_dbmap->num) {
/* ok so we need to create this database */
ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
dbmap->dbs[db].db_id, mem_ctx,
DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
/* attach the remote node to the database by name/flags */
ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
nodemap->nodes[j].pnn,
dbmap->dbs[db].flags, NULL);
DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
/*
  ensure we are attached to any databases that anyone else is attached to

  NOTE(review): the extracted text below is missing interior lines
  (braces, declarations, continue/break statements); tokens are kept
  exactly as found and only comments have been added.
 */
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
struct ctdb_dbid_map_old *remote_dbmap;
/* verify that we have all database any other node has */
for (j=0; j<nodemap->num; j++) {
/* we don't need to ourself ourselves */
if (nodemap->nodes[j].pnn == pnn) {
/* don't check nodes that are unavailable */
if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* fetch the remote node's attached databases */
ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
mem_ctx, &remote_dbmap);
DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
/* step through all databases on the remote node */
for (db=0; db<remote_dbmap->num;db++) {
/* search for this remote db_id in our local dbmap */
for (i=0;i<(*dbmap)->num;i++) {
if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
/* we already have this db locally */
if (i!=(*dbmap)->num) {
/* ok so we need to create this database and
   attach locally under the remote node's name/flags */
ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
remote_dbmap->dbs[db].db_id, mem_ctx, &name);
DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
nodemap->nodes[j].pnn));
ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn,
remote_dbmap->dbs[db].flags, NULL);
DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh *dbmap so callers see the newly attached database */
ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
560 update flags on all active nodes
562 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
566 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
568 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	talloc_free(state);
}
585 * Process one elements of the vacuum fetch list:
586 * Migrate it over to us with the special flag
587 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
589 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
591 struct ctdb_rec_data_old *r)
593 struct ctdb_client_call_state *state;
595 struct ctdb_ltdb_header *hdr;
596 struct ctdb_call call;
599 call.call_id = CTDB_NULL_FUNC;
600 call.flags = CTDB_IMMEDIATE_MIGRATION;
601 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
603 call.key.dptr = &r->data[0];
604 call.key.dsize = r->keylen;
606 /* ensure we don't block this daemon - just skip a record if we can't get
608 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
612 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
613 if (data.dptr == NULL) {
614 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
618 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
620 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
624 hdr = (struct ctdb_ltdb_header *)data.dptr;
625 if (hdr->dmaster == pnn) {
626 /* its already local */
628 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
634 state = ctdb_call_send(ctdb_db, &call);
635 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
637 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
640 state->async.fn = vacuum_fetch_callback;
641 state->async.private_data = NULL;
648 handler for vacuum fetch
650 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
653 struct ctdb_recoverd *rec = talloc_get_type(
654 private_data, struct ctdb_recoverd);
655 struct ctdb_context *ctdb = rec->ctdb;
656 struct ctdb_marshall_buffer *recs;
658 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
660 struct ctdb_dbid_map_old *dbmap=NULL;
661 uint8_t db_flags = 0;
662 struct ctdb_db_context *ctdb_db;
663 struct ctdb_rec_data_old *r;
665 recs = (struct ctdb_marshall_buffer *)data.dptr;
667 if (recs->count == 0) {
671 /* work out if the database is persistent */
672 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
674 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
678 for (i=0;i<dbmap->num;i++) {
679 if (dbmap->dbs[i].db_id == recs->db_id) {
680 db_flags = dbmap->dbs[i].flags;
684 if (i == dbmap->num) {
685 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
689 /* find the name of this database */
690 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
691 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
696 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, db_flags);
697 if (ctdb_db == NULL) {
698 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
702 r = (struct ctdb_rec_data_old *)&recs->data[0];
703 while (recs->count) {
706 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
711 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
716 talloc_free(tmp_ctx);
721 * handler for database detach
723 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
726 struct ctdb_recoverd *rec = talloc_get_type(
727 private_data, struct ctdb_recoverd);
728 struct ctdb_context *ctdb = rec->ctdb;
730 struct ctdb_db_context *ctdb_db;
732 if (data.dsize != sizeof(db_id)) {
735 db_id = *(uint32_t *)data.dptr;
737 ctdb_db = find_ctdb_db(ctdb, db_id);
738 if (ctdb_db == NULL) {
739 /* database is not attached */
743 DLIST_REMOVE(ctdb->db_list, ctdb_db);
745 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
747 talloc_free(ctdb_db);
751 called when ctdb_wait_timeout should finish
753 static void ctdb_wait_handler(struct tevent_context *ev,
754 struct tevent_timer *te,
755 struct timeval yt, void *p)
757 uint32_t *timed_out = (uint32_t *)p;
762 wait for a given number of seconds
764 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
766 uint32_t timed_out = 0;
767 time_t usecs = (secs - (time_t)secs) * 1000000;
768 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
769 ctdb_wait_handler, &timed_out);
771 tevent_loop_once(ctdb->ev);
/*
  called when an election times out (ends)

  NOTE(review): extraction dropped this function's braces and possibly
  additional statements between the assignment and the DEBUG call;
  code tokens below are kept exactly as found.
 */
static void ctdb_election_timeout(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval t, void *p)
struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
/* clearing this pointer is what lets ctdb_wait_election() return */
rec->election_timeout = NULL;
DEBUG(DEBUG_WARNING,("Election period ended\n"));
791 wait for an election to finish. It finished election_timeout seconds after
792 the last election packet is received
794 static void ctdb_wait_election(struct ctdb_recoverd *rec)
796 struct ctdb_context *ctdb = rec->ctdb;
797 while (rec->election_timeout) {
798 tevent_loop_once(ctdb->ev);
/*
  Update our local flags from all remote connected nodes.
  This is only run when we are or we belive we are the recovery master

  NOTE(review): extraction dropped interior lines (braces, declarations,
  continue statements, return paths); code tokens below are kept exactly
  as found and only comments have been added.
 */
static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
struct ctdb_context *ctdb = rec->ctdb;
TALLOC_CTX *mem_ctx = talloc_new(ctdb);
/* get the nodemap for all active remote nodes and verify
   they are the same as for this node
 */
for (j=0; j<nodemap->num; j++) {
struct ctdb_node_map_old *remote_nodemap=NULL;
/* skip disconnected nodes and ourselves */
if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
if (nodemap->nodes[j].pnn == ctdb->pnn) {
ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
mem_ctx, &remote_nodemap);
DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
nodemap->nodes[j].pnn));
/* a node that cannot answer is marked as the culprit */
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
talloc_free(mem_ctx);
if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
/* We should tell our daemon about this so it
   updates its flags or else we will log the same
   message again in the next iteration of recovery.
   Since we are the recovery master we can just as
   well update the flags on all nodes.
*/
ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Update our local copy of the flags in the recovery
   daemon's nodemap */
DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
nodemap->nodes[j].flags));
nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
talloc_free(remote_nodemap);
talloc_free(mem_ctx);
863 /* Create a new random generation id.
864 The generation id can not be the INVALID_GENERATION id
866 static uint32_t new_generation(void)
871 generation = random();
873 if (generation != INVALID_GENERATION) {
881 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
883 return (rec->recovery_lock_handle != NULL);
/* State tracked while taking/holding the recovery lock via the
 * cluster mutex helper. */
struct ctdb_recovery_lock_handle {
	bool done;    /* helper has reported a result */
	bool locked;  /* lock was successfully taken */
	double latency;
	struct ctdb_cluster_mutex_handle *h;
};
893 static void take_reclock_handler(char status,
897 struct ctdb_recovery_lock_handle *s =
898 (struct ctdb_recovery_lock_handle *) private_data;
902 s->latency = latency;
907 ("Unable to take recovery lock - contention\n"));
911 DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
915 s->locked = (status == '0') ;
/* Forward declaration - defined later in this file. */
static void force_election(struct ctdb_recoverd *rec,
			   uint32_t pnn,
			   struct ctdb_node_map_old *nodemap);
922 static void lost_reclock_handler(void *private_data)
924 struct ctdb_recoverd *rec = talloc_get_type_abort(
925 private_data, struct ctdb_recoverd);
927 D_ERR("Recovery lock helper terminated, triggering an election\n");
928 TALLOC_FREE(rec->recovery_lock_handle);
930 force_election(rec, ctdb_get_pnn(rec->ctdb), rec->nodemap);
933 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
935 struct ctdb_context *ctdb = rec->ctdb;
936 struct ctdb_cluster_mutex_handle *h;
937 struct ctdb_recovery_lock_handle *s;
939 s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
941 DBG_ERR("Memory allocation error\n");
945 h = ctdb_cluster_mutex(s,
949 take_reclock_handler,
951 lost_reclock_handler,
958 rec->recovery_lock_handle = s;
962 tevent_loop_once(ctdb->ev);
966 TALLOC_FREE(rec->recovery_lock_handle);
970 ctdb_ctrl_report_recd_lock_latency(ctdb,
977 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
979 if (rec->recovery_lock_handle == NULL) {
983 if (! rec->recovery_lock_handle->done) {
985 * Taking of recovery lock still in progress. Free
986 * the cluster mutex handle to release it but leave
987 * the recovery lock handle in place to allow taking
988 * of the lock to fail.
990 D_NOTICE("Cancelling recovery lock\n");
991 TALLOC_FREE(rec->recovery_lock_handle->h);
992 rec->recovery_lock_handle->done = true;
993 rec->recovery_lock_handle->locked = false;
997 D_NOTICE("Releasing recovery lock\n");
998 TALLOC_FREE(rec->recovery_lock_handle);
1001 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1003 struct ctdb_context *ctdb = rec->ctdb;
1005 struct ctdb_banning_state *ban_state;
1008 for (i=0; i<ctdb->num_nodes; i++) {
1009 if (ctdb->nodes[i]->ban_state == NULL) {
1012 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1013 if (ban_state->count < 2*ctdb->num_nodes) {
1017 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1018 ctdb->nodes[i]->pnn, ban_state->count,
1019 ctdb->tunable.recovery_ban_period));
1020 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1021 ban_state->count = 0;
1023 /* Banning ourself? */
1024 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1030 struct helper_state {
1037 static void helper_handler(struct tevent_context *ev,
1038 struct tevent_fd *fde,
1039 uint16_t flags, void *private_data)
1041 struct helper_state *state = talloc_get_type_abort(
1042 private_data, struct helper_state);
1045 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1046 if (ret != sizeof(state->result)) {
1047 state->result = EPIPE;
1053 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1054 const char *prog, const char *arg, const char *type)
1056 struct helper_state *state;
1057 struct tevent_fd *fde;
1060 uint32_t recmaster = rec->recmaster;
1062 state = talloc_zero(mem_ctx, struct helper_state);
1063 if (state == NULL) {
1064 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1070 ret = pipe(state->fd);
1073 ("Failed to create pipe for %s helper\n", type));
1077 set_close_on_exec(state->fd[0]);
1080 args = talloc_array(state, const char *, nargs);
1082 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1086 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1087 if (args[0] == NULL) {
1088 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1091 args[1] = rec->ctdb->daemon.name;
1095 if (args[2] == NULL) {
1099 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1100 if (state->pid == -1) {
1102 ("Failed to create child for %s helper\n", type));
1106 close(state->fd[1]);
1109 state->done = false;
1111 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1112 TEVENT_FD_READ, helper_handler, state);
1116 tevent_fd_set_auto_close(fde);
1118 while (!state->done) {
1119 tevent_loop_once(rec->ctdb->ev);
1121 /* If recmaster changes, we have lost election */
1122 if (recmaster != rec->recmaster) {
1123 D_ERR("Recmaster changed to %u, aborting %s\n",
1124 rec->recmaster, type);
1130 close(state->fd[0]);
1133 if (state->result != 0) {
1137 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1142 if (state->fd[0] != -1) {
1143 close(state->fd[0]);
1145 if (state->fd[1] != -1) {
1146 close(state->fd[1]);
1148 if (state->pid != -1) {
1149 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1156 static int ctdb_takeover(struct ctdb_recoverd *rec,
1157 uint32_t *force_rebalance_nodes)
1159 static char prog[PATH_MAX+1] = "";
1163 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
1164 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
1165 "ctdb_takeover_helper")) {
1166 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
1170 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1171 uint32_t pnn = force_rebalance_nodes[i];
1173 arg = talloc_asprintf(rec, "%u", pnn);
1175 arg = talloc_asprintf_append(arg, ",%u", pnn);
1178 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1183 if (ctdb_config.failover_disabled) {
1184 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
1186 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
1191 return helper_run(rec, rec, prog, arg, "takeover");
1194 static bool do_takeover_run(struct ctdb_recoverd *rec,
1195 struct ctdb_node_map_old *nodemap)
1197 uint32_t *nodes = NULL;
1198 struct ctdb_disable_message dtr;
1201 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1205 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1207 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1208 DEBUG(DEBUG_ERR, (__location__
1209 " takeover run already in progress \n"));
1214 if (!ctdb_op_begin(rec->takeover_run)) {
1219 /* Disable IP checks (takeover runs, really) on other nodes
1220 * while doing this takeover run. This will stop those other
1221 * nodes from triggering takeover runs when think they should
1222 * be hosting an IP but it isn't yet on an interface. Don't
1223 * wait for replies since a failure here might cause some
1224 * noise in the logs but will not actually cause a problem.
1227 dtr.srvid = 0; /* No reply */
1230 data.dptr = (uint8_t*)&dtr;
1231 data.dsize = sizeof(dtr);
1233 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1235 /* Disable for 60 seconds. This can be a tunable later if
1239 for (i = 0; i < talloc_array_length(nodes); i++) {
1240 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1241 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1243 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1247 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1249 /* Reenable takeover runs and IP checks on other nodes */
1251 for (i = 0; i < talloc_array_length(nodes); i++) {
1252 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1253 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1255 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1260 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1266 /* Takeover run was successful so clear force rebalance targets */
1267 if (rebalance_nodes == rec->force_rebalance_nodes) {
1268 TALLOC_FREE(rec->force_rebalance_nodes);
1270 DEBUG(DEBUG_WARNING,
1271 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1274 rec->need_takeover_run = !ok;
1276 ctdb_op_end(rec->takeover_run);
1278 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1282 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1284 static char prog[PATH_MAX+1] = "";
1287 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1288 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1289 "ctdb_recovery_helper")) {
1290 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1293 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1295 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1299 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1301 return helper_run(rec, mem_ctx, prog, arg, "recovery");
/*
  we are the recmaster, and recovery is needed - start a recovery run

  NOTE(review): extraction dropped many interior lines of this large
  function (braces, declarations, return/goto statements, labels);
  code tokens below are kept exactly as found and only comments have
  been added.
 */
static int do_recovery(struct ctdb_recoverd *rec,
TALLOC_CTX *mem_ctx, uint32_t pnn,
struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
struct ctdb_context *ctdb = rec->ctdb;
struct ctdb_dbid_map_old *dbmap;
DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
/* Check if the current node is still the recmaster.  It's possible that
   re-election has changed the recmaster.
 */
if (pnn != rec->recmaster) {
("Recovery master changed to %u, aborting recovery\n",
/* if recovery fails, force it again */
rec->need_recovery = true;
if (!ctdb_op_begin(rec->recovery)) {
if (rec->election_timeout) {
/* an election is in progress */
DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
ban_misbehaving_nodes(rec, &self_ban);
DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
/* take the recovery lock if one is configured */
if (ctdb->recovery_lock != NULL) {
if (ctdb_recovery_have_lock(rec)) {
D_NOTICE("Already holding recovery lock\n");
D_NOTICE("Attempting to take recovery lock (%s)\n",
ctdb->recovery_lock);
ok = ctdb_recovery_lock(rec);
D_ERR("Unable to take recovery lock\n");
if (pnn != rec->recmaster) {
D_NOTICE("Recovery master changed to %u,"
" aborting recovery\n",
rec->need_recovery = false;
if (ctdb->runstate ==
CTDB_RUNSTATE_FIRST_RECOVERY) {
/*
 * First recovery?  Perhaps
 * current node does not yet
 * know who the recmaster is.
 */
D_ERR("Retrying recovery\n");
D_ERR("Abort recovery, "
"ban this node for %u seconds\n",
ctdb->tunable.recovery_ban_period);
ctdb->tunable.recovery_ban_period);
D_NOTICE("Recovery lock taken successfully\n");
DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
/* get a list of all databases */
ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
/* we do the db creation before we set the recovery mode, so the freeze happens
   on all databases we will be dealing with. */
/* verify that we have all the databases any other node has */
ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
/* verify that all other nodes have all our databases */
ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
/* Retrieve capabilities from all connected nodes */
ret = update_capabilities(rec, nodemap);
DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
/*
  update all nodes to have the same flags that we have
 */
for (i=0;i<nodemap->num;i++) {
if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
/* run the actual database recovery via the helper */
ret = db_recovery_parallel(rec, mem_ctx);
do_takeover_run(rec, nodemap);
/* send a message to all clients telling them that the cluster
   has been reconfigured */
ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
CTDB_SRVID_RECONFIGURE, tdb_null);
DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
rec->need_recovery = false;
ctdb_op_end(rec->recovery);
/* we managed to complete a full recovery, make sure to forgive
   any past sins by the nodes that could now participate in the
   recovery. */
DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
for (i=0;i<nodemap->num;i++) {
struct ctdb_banning_state *ban_state;
if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
if (ban_state == NULL) {
ban_state->count = 0;
/* We just finished a recovery successfully.
   We now wait for rerecovery_timeout before we allow
   another recovery to take place.
 */
DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
ctdb_op_disable(rec->recovery, ctdb->ev,
ctdb->tunable.rerecovery_timeout);
ctdb_op_end(rec->recovery);
1504 elections are won by first checking the number of connected nodes, then
1505 the priority time, then the pnn
/*
 * Election payload exchanged between recovery daemons and compared
 * field-by-field in ctdb_election_win().
 * NOTE(review): this listing is line-sampled; the struct's pnn member
 * (original line 1510) and closing brace are not visible here.
 */
1507 struct election_message {
1508 uint32_t num_connected;
1509 struct timeval priority_time;
1511 uint32_t node_flags;
1515 form this nodes election data
/*
 * Fill *em with this node's election data: our pnn, the time we became
 * a candidate (rec->priority_time), our node flags, and a count of
 * connected nodes taken from a freshly fetched nodemap.  If this node
 * lacks CTDB_CAP_RECMASTER the data is deliberately made unwinnable
 * (num_connected = 0, priority_time reset to "now").
 * NOTE(review): listing is line-sampled; the error-return branch after
 * the getnodemap call is not visible here.
 */
1517 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1520 struct ctdb_node_map_old *nodemap;
1521 struct ctdb_context *ctdb = rec->ctdb;
1525 em->pnn = rec->ctdb->pnn;
1526 em->priority_time = rec->priority_time;
1528 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1530 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
/* cache our own flags for later election decisions */
1534 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1535 em->node_flags = rec->node_flags;
1537 for (i=0;i<nodemap->num;i++) {
1538 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1539 em->num_connected++;
1543 /* we shouldnt try to win this election if we cant be a recmaster */
1544 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1545 em->num_connected = 0;
1546 em->priority_time = timeval_current();
1549 talloc_free(nodemap);
1553 see if the given election data wins
/*
 * Decide whether the remote candidate described by *em beats this node.
 * Our own data is gathered via ctdb_election_data().  Hard rules first
 * (capability, banned, stopped on either side), then priority_time, and
 * the pnn as the final tie-breaker.
 * NOTE(review): listing is line-sampled; the return statements inside
 * each branch are not visible here.
 */
1555 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1557 struct election_message myem;
1560 ctdb_election_data(rec, &myem);
1562 /* we cant win if we don't have the recmaster capability */
1563 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1567 /* we cant win if we are banned */
1568 if (rec->node_flags & NODE_FLAGS_BANNED) {
1572 /* we cant win if we are stopped */
1573 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1577 /* we will automatically win if the other node is banned */
1578 if (em->node_flags & NODE_FLAGS_BANNED) {
1582 /* we will automatically win if the other node is stopped */
1583 if (em->node_flags & NODE_FLAGS_STOPPED) {
1587 /* then the longest running node */
1589 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* finally fall back to comparing pnns */
1593 cmp = (int)myem.pnn - (int)em->pnn;
1600 send out an election request
/*
 * Broadcast an election request carrying this node's election data.
 * Optimistically sets ourselves (pnn) as recmaster on the local node
 * first, then sends CTDB_SRVID_ELECTION to all nodes.  Returns the
 * result of the broadcast send.
 * NOTE(review): listing is line-sampled; declarations of ret/srvid and
 * the error-return after setrecmaster are not visible here.
 */
1602 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1605 TDB_DATA election_data;
1606 struct election_message emsg;
1608 struct ctdb_context *ctdb = rec->ctdb;
1610 srvid = CTDB_SRVID_ELECTION;
1612 ctdb_election_data(rec, &emsg);
1614 election_data.dsize = sizeof(struct election_message);
1615 election_data.dptr = (unsigned char *)&emsg;
1618 /* first we assume we will win the election and set
1619 recoverymaster to be ourself on the current node
1621 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1622 CTDB_CURRENT_NODE, pnn);
1624 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1627 rec->recmaster = pnn;
1629 /* send an election message to all active nodes */
1630 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1631 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1635 we think we are winning the election - send a broadcast election request
/*
 * tevent timer callback: fire (or re-fire) our election request while we
 * believe we are winning, then clear the timer handle so a new one can
 * be scheduled by election_handler().
 */
1637 static void election_send_request(struct tevent_context *ev,
1638 struct tevent_timer *te,
1639 struct timeval t, void *p)
1641 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1644 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1646 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1649 TALLOC_FREE(rec->send_election_te);
1653 handler for memory dumps
/*
 * SRVID message handler: dump this daemon's talloc memory usage and send
 * it back to the requester identified by the ctdb_srvid_message payload.
 * All temporary allocations hang off tmp_ctx, which is freed on every
 * exit path.
 * NOTE(review): listing is line-sampled; early returns after each
 * error DEBUG/talloc_free pair are not visible here.
 */
1655 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1657 struct ctdb_recoverd *rec = talloc_get_type(
1658 private_data, struct ctdb_recoverd);
1659 struct ctdb_context *ctdb = rec->ctdb;
1660 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1663 struct ctdb_srvid_message *rd;
/* validate that the payload really is a return-address structure */
1665 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1666 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1667 talloc_free(tmp_ctx);
1670 rd = (struct ctdb_srvid_message *)data.dptr;
1672 dump = talloc_zero(tmp_ctx, TDB_DATA);
1674 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1675 talloc_free(tmp_ctx);
1678 ret = ctdb_dump_memory(ctdb, dump);
1680 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1681 talloc_free(tmp_ctx);
1685 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* reply to the pnn/srvid the requester embedded in the message */
1687 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1689 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1690 talloc_free(tmp_ctx);
1694 talloc_free(tmp_ctx);
1698 handler for reload_nodes
/*
 * SRVID message handler: re-read the nodes file on this node.  The
 * message payload is unused.
 */
1700 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1703 struct ctdb_recoverd *rec = talloc_get_type(
1704 private_data, struct ctdb_recoverd);
1706 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1708 ctdb_load_nodes_file(rec->ctdb);
/*
 * SRVID message handler: queue a node (pnn in the 4-byte payload) for a
 * forced IP rebalance on the next takeover run.  Only acted upon when
 * this node is the recovery master.  The pnn is appended to
 * rec->force_rebalance_nodes by allocating a new array and copying the
 * old entries, because freeing the old array also cancels the timeout
 * timer attached to it (see comment below).
 * NOTE(review): listing is line-sampled; early returns for the
 * non-recmaster and wrong-size cases are not visible here.
 */
1712 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1715 struct ctdb_recoverd *rec = talloc_get_type(
1716 private_data, struct ctdb_recoverd);
1717 struct ctdb_context *ctdb = rec->ctdb;
1722 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1726 if (data.dsize != sizeof(uint32_t)) {
1727 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1731 pnn = *(uint32_t *)&data.dptr[0];
1733 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1735 /* Copy any existing list of nodes. There's probably some
1736 * sort of realloc variant that will do this but we need to
1737 * make sure that freeing the old array also cancels the timer
1738 * event for the timeout... not sure if realloc will do that.
1740 len = (rec->force_rebalance_nodes != NULL) ?
1741 talloc_array_length(rec->force_rebalance_nodes) :
1744 /* This allows duplicates to be added but they don't cause
1745 * harm. A call to add a duplicate PNN arguably means that
1746 * the timeout should be reset, so this is the simplest
1749 t = talloc_zero_array(rec, uint32_t, len+1);
1750 CTDB_NO_MEMORY_VOID(ctdb, t);
1752 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1756 talloc_free(rec->force_rebalance_nodes);
1758 rec->force_rebalance_nodes = t;
/*
 * Common helper for the "disable takeover runs" / "disable recoveries"
 * SRVID handlers: validate the ctdb_disable_message payload, disable the
 * given operation (op_state) for the requested timeout, and reply to the
 * sender with this node's PNN packed as an int32_t to signal success.
 *
 * FIX(review): the wrong-size diagnostic previously printed
 * sizeof(struct ctdb_srvid_message) although the check compares against
 * sizeof(struct ctdb_disable_message); the message now reports the size
 * that is actually expected.
 *
 * NOTE(review): listing is line-sampled; early returns after the
 * validation failures and the ctdb_op_disable() error branch are not
 * visible here.
 */
1763 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1765 struct ctdb_op_state *op_state)
1767 struct ctdb_disable_message *r;
1772 /* Validate input data */
1773 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1774 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1775 "expecting %lu\n", (long unsigned)data.dsize,
1776 (long unsigned)sizeof(struct ctdb_disable_message)));
1779 if (data.dptr == NULL) {
1780 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1784 r = (struct ctdb_disable_message *)data.dptr;
1785 timeout = r->timeout;
1787 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1792 /* Returning our PNN tells the caller that we succeeded */
1793 ret = ctdb_get_pnn(ctdb);
1795 result.dsize = sizeof(int32_t);
1796 result.dptr = (uint8_t *)&ret;
1797 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
/*
 * SRVID message handler: disable takeover runs for the timeout carried
 * in the message; thin wrapper around srvid_disable_and_reply().
 */
1800 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1803 struct ctdb_recoverd *rec = talloc_get_type(
1804 private_data, struct ctdb_recoverd);
1806 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1809 /* Backward compatibility for this SRVID */
/*
 * Legacy variant of disable_takeover_runs_handler: the payload is a raw
 * uint32_t timeout instead of a ctdb_disable_message, and no reply is
 * sent to the requester.
 * NOTE(review): listing is line-sampled; early returns after the
 * validation failures are not visible here.
 */
1810 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1813 struct ctdb_recoverd *rec = talloc_get_type(
1814 private_data, struct ctdb_recoverd);
1817 if (data.dsize != sizeof(uint32_t)) {
1818 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1819 "expecting %lu\n", (long unsigned)data.dsize,
1820 (long unsigned)sizeof(uint32_t)));
1823 if (data.dptr == NULL) {
1824 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1828 timeout = *((uint32_t *)data.dptr);
1830 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
/*
 * SRVID message handler: disable recoveries for the timeout carried in
 * the message; thin wrapper around srvid_disable_and_reply().
 */
1833 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1836 struct ctdb_recoverd *rec = talloc_get_type(
1837 private_data, struct ctdb_recoverd);
1839 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1843 handler for ip reallocate, just add it to the list of requests and
1844 handle this later in the monitor_cluster loop so we do not recurse
1845 with other requests to takeover_run()
/*
 * SRVID message handler: validate the return-address payload and queue
 * the request on rec->reallocate_requests; serviced later by
 * process_ipreallocate_requests().
 */
1847 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1850 struct ctdb_srvid_message *request;
1851 struct ctdb_recoverd *rec = talloc_get_type(
1852 private_data, struct ctdb_recoverd);
1854 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1855 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1859 request = (struct ctdb_srvid_message *)data.dptr;
1861 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
/*
 * Drain the queued IP-reallocation requests: detach the current queue
 * (new requests arriving during the takeover run are left for the next
 * pass), perform a takeover run, and reply to every queued requester
 * with this node's PNN on success.
 *
 * FIX(review): the srvid_requests_reply() call contained the mojibake
 * sequence "¤t" — an HTML-entity mangling of "&current" — which is
 * restored here to match the srvid_requests_reply(ctdb,
 * struct srvid_requests **requests, ...) signature.
 *
 * NOTE(review): listing is line-sampled; the failure branch of
 * do_takeover_run() is not visible here.
 */
1864 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1865 struct ctdb_recoverd *rec)
1869 struct srvid_requests *current;
1871 /* Only process requests that are currently pending. More
1872 * might come in while the takeover run is in progress and
1873 * they will need to be processed later since they might
1874 * be in response flag changes.
1876 current = rec->reallocate_requests;
1877 rec->reallocate_requests = NULL;
1879 if (do_takeover_run(rec, rec->nodemap)) {
1880 ret = ctdb_get_pnn(ctdb);
1885 result.dsize = sizeof(int32_t);
1886 result.dptr = (uint8_t *)&ret;
1888 srvid_requests_reply(ctdb, &current, result);
1892 * handler for assigning banning credits
/*
 * SRVID message handler (recmaster only): immediately push the node
 * named by the 4-byte payload over the ban threshold by setting its
 * culprit count to nodemap->num.
 * NOTE(review): listing is line-sampled; early returns for the
 * non-recmaster and wrong-size cases are not visible here.
 */
1894 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1896 struct ctdb_recoverd *rec = talloc_get_type(
1897 private_data, struct ctdb_recoverd);
1900 /* Ignore if we are not recmaster */
1901 if (rec->ctdb->pnn != rec->recmaster) {
1905 if (data.dsize != sizeof(uint32_t)) {
1906 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1911 ban_pnn = *(uint32_t *)data.dptr;
1913 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1917 handler for recovery master elections
/*
 * SRVID message handler for incoming election packets.  Restarts the
 * election timeout, then compares the sender's election_message against
 * our own: if we would win, schedule (once) a delayed broadcast of our
 * own election request; otherwise concede — cancel any pending request,
 * release the recovery lock if held, and record the sender as recmaster
 * locally and in rec->recmaster.
 * NOTE(review): listing is line-sampled; the ignore-self return and the
 * returns/braces between the win and concede paths are not visible here.
 */
1919 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1921 struct ctdb_recoverd *rec = talloc_get_type(
1922 private_data, struct ctdb_recoverd);
1923 struct ctdb_context *ctdb = rec->ctdb;
1925 struct election_message *em = (struct election_message *)data.dptr;
1927 /* Ignore election packets from ourself */
1928 if (ctdb->pnn == em->pnn) {
1932 /* we got an election packet - update the timeout for the election */
1933 talloc_free(rec->election_timeout);
1934 rec->election_timeout = tevent_add_timer(
1937 timeval_current_ofs(0, 500000) :
1938 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1939 ctdb_election_timeout, rec);
1941 /* someone called an election. check their election data
1942 and if we disagree and we would rather be the elected node,
1943 send a new election message to all other nodes
1945 if (ctdb_election_win(rec, em)) {
1946 if (!rec->send_election_te) {
1947 rec->send_election_te = tevent_add_timer(
1949 timeval_current_ofs(0, 500000),
1950 election_send_request, rec);
1956 TALLOC_FREE(rec->send_election_te);
1958 /* Release the recovery lock file */
1959 if (ctdb_recovery_have_lock(rec)) {
1960 ctdb_recovery_unlock(rec);
1963 /* ok, let that guy become recmaster then */
1964 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1965 CTDB_CURRENT_NODE, em->pnn);
1967 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1970 rec->recmaster = em->pnn;
1977 force the start of the election process
/*
 * Kick off a recmaster election: put the whole cluster into recovery
 * mode to quiesce internode traffic, arm the election timeout, send our
 * election request, and block until the election window closes
 * (ctdb_wait_election).
 * NOTE(review): listing is line-sampled; returns after the error
 * branches are not visible here.
 */
1979 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1980 struct ctdb_node_map_old *nodemap)
1983 struct ctdb_context *ctdb = rec->ctdb;
1985 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1987 /* set all nodes to recovery mode to stop all internode traffic */
1988 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1990 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1994 talloc_free(rec->election_timeout);
1995 rec->election_timeout = tevent_add_timer(
1998 timeval_current_ofs(0, 500000) :
1999 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2000 ctdb_election_timeout, rec);
2002 ret = send_election_request(rec, pnn);
2004 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2008 /* wait for a few seconds to collect all responses */
2009 ctdb_wait_election(rec);
2015 handler for when a node changes its flags
/*
 * SRVID message handler for node flag-change notifications: fetch the
 * local nodemap, locate the node named in the ctdb_node_flag_change
 * payload, log the change and record the new flags in the local nodemap
 * copy.
 * NOTE(review): listing is line-sampled; returns after the early error
 * branches are not visible here.
 */
2017 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2019 struct ctdb_recoverd *rec = talloc_get_type(
2020 private_data, struct ctdb_recoverd);
2021 struct ctdb_context *ctdb = rec->ctdb;
2023 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2024 struct ctdb_node_map_old *nodemap=NULL;
2025 TALLOC_CTX *tmp_ctx;
2028 if (data.dsize != sizeof(*c)) {
2029 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2033 tmp_ctx = talloc_new(ctdb);
2034 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2036 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2038 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2039 talloc_free(tmp_ctx);
/* find the slot for the node whose flags changed */
2044 for (i=0;i<nodemap->num;i++) {
2045 if (nodemap->nodes[i].pnn == c->pnn) break;
2048 if (i == nodemap->num) {
2049 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2050 talloc_free(tmp_ctx);
2054 if (c->old_flags != c->new_flags) {
2055 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2058 nodemap->nodes[i].flags = c->new_flags;
2060 talloc_free(tmp_ctx);
2064 handler for when we need to push out flag changes ot all other nodes
/*
 * SRVID message handler: fetch the authoritative nodemap from the
 * recmaster, sanity-check the pnn from the flag-change payload against
 * it, and broadcast a MODIFY_FLAGS control to every connected node.
 * NOTE(review): listing is line-sampled; returns after the error
 * branches are not visible here.
 */
2066 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2069 struct ctdb_recoverd *rec = talloc_get_type(
2070 private_data, struct ctdb_recoverd);
2071 struct ctdb_context *ctdb = rec->ctdb;
2073 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2074 struct ctdb_node_map_old *nodemap=NULL;
2075 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2078 /* read the node flags from the recmaster */
2079 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2082 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2083 talloc_free(tmp_ctx);
2086 if (c->pnn >= nodemap->num) {
2087 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2088 talloc_free(tmp_ctx);
2092 /* send the flags update to all connected nodes */
2093 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2095 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2096 nodes, 0, CONTROL_TIMEOUT(),
2100 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2102 talloc_free(tmp_ctx);
2106 talloc_free(tmp_ctx);
/*
 * Shared state for the async recmode verification in verify_recmode():
 * the callback folds each node's answer into `status`.
 * NOTE(review): listing is line-sampled; the `count` member decremented
 * by the callback and the closing brace are not visible here.
 */
2110 struct verify_recmode_normal_data {
2112 enum monitor_result status;
/*
 * Completion callback for one async getrecmode control: marks the run
 * MONITOR_FAILED when the control itself failed (without overwriting an
 * earlier non-OK status), or MONITOR_RECOVERY_NEEDED when the node
 * reports it is in recovery mode.
 */
2115 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2117 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2120 /* one more node has responded with recmode data*/
2123 /* if we failed to get the recmode, then return an error and let
2124 the main loop try again.
2126 if (state->state != CTDB_CONTROL_DONE) {
2127 if (rmdata->status == MONITOR_OK) {
2128 rmdata->status = MONITOR_FAILED;
2133 /* if we got a response, then the recmode will be stored in the
2136 if (state->status != CTDB_RECOVERY_NORMAL) {
2137 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2138 rmdata->status = MONITOR_RECOVERY_NEEDED;
2145 /* verify that all nodes are in normal recovery mode */
/*
 * Fan out an async getrecmode control to every active node, pump the
 * event loop until all replies (or timeouts) have been accounted for,
 * and return the aggregate monitor_result computed by
 * verify_recmode_normal_callback().
 * NOTE(review): listing is line-sampled; the count increment per sent
 * control and loop-closing braces are not visible here.
 */
2146 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2148 struct verify_recmode_normal_data *rmdata;
2149 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2150 struct ctdb_client_control_state *state;
2151 enum monitor_result status;
2154 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2155 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2157 rmdata->status = MONITOR_OK;
2159 /* loop over all active nodes and send an async getrecmode call to
2161 for (j=0; j<nodemap->num; j++) {
2162 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2165 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2167 nodemap->nodes[j].pnn);
2168 if (state == NULL) {
2169 /* we failed to send the control, treat this as
2170 an error and try again next iteration
2172 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2173 talloc_free(mem_ctx);
2174 return MONITOR_FAILED;
2177 /* set up the callback functions */
2178 state->async.fn = verify_recmode_normal_callback;
2179 state->async.private_data = rmdata;
2181 /* one more control to wait for to complete */
2186 /* now wait for up to the maximum number of seconds allowed
2187 or until all nodes we expect a response from has replied
2189 while (rmdata->count > 0) {
2190 tevent_loop_once(ctdb->ev);
2193 status = rmdata->status;
2194 talloc_free(mem_ctx);
/*
 * Shared state for the async recmaster verification in
 * verify_recmaster(); the callback compares each node's answer against
 * `pnn` and folds the result into `status`.
 * NOTE(review): listing is line-sampled; the `count` and `pnn` members
 * and the closing brace are not visible here.
 */
2199 struct verify_recmaster_data {
2200 struct ctdb_recoverd *rec;
2203 enum monitor_result status;
/*
 * Completion callback for one async getrecmaster control: marks the run
 * MONITOR_FAILED when the control itself failed, or
 * MONITOR_ELECTION_NEEDED (and blames the disagreeing node as culprit)
 * when a node reports a recmaster other than rmdata->pnn.
 */
2206 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2208 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2211 /* one more node has responded with recmaster data*/
2214 /* if we failed to get the recmaster, then return an error and let
2215 the main loop try again.
2217 if (state->state != CTDB_CONTROL_DONE) {
2218 if (rmdata->status == MONITOR_OK) {
2219 rmdata->status = MONITOR_FAILED;
2224 /* if we got a response, then the recmaster will be stored in the
2227 if (state->status != rmdata->pnn) {
2228 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2229 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2230 rmdata->status = MONITOR_ELECTION_NEEDED;
2237 /* verify that all nodes agree that we are the recmaster */
/*
 * Fan out an async getrecmaster control to every active node except the
 * recmaster itself, pump the event loop until all replies have been
 * accounted for, and return the aggregate monitor_result computed by
 * verify_recmaster_callback().
 * NOTE(review): listing is line-sampled; the rmdata->pnn/count
 * initialisation and per-control count increment are not visible here.
 */
2238 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2240 struct ctdb_context *ctdb = rec->ctdb;
2241 struct verify_recmaster_data *rmdata;
2242 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2243 struct ctdb_client_control_state *state;
2244 enum monitor_result status;
2247 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2248 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2252 rmdata->status = MONITOR_OK;
2254 /* loop over all active nodes and send an async getrecmaster call to
2256 for (j=0; j<nodemap->num; j++) {
/* skip the recmaster itself - its own opinion is not in question */
2257 if (nodemap->nodes[j].pnn == rec->recmaster) {
2260 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2263 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2265 nodemap->nodes[j].pnn);
2266 if (state == NULL) {
2267 /* we failed to send the control, treat this as
2268 an error and try again next iteration
2270 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2271 talloc_free(mem_ctx);
2272 return MONITOR_FAILED;
2275 /* set up the callback functions */
2276 state->async.fn = verify_recmaster_callback;
2277 state->async.private_data = rmdata;
2279 /* one more control to wait for to complete */
2284 /* now wait for up to the maximum number of seconds allowed
2285 or until all nodes we expect a response from has replied
2287 while (rmdata->count > 0) {
2288 tevent_loop_once(ctdb->ev);
2291 status = rmdata->status;
2292 talloc_free(mem_ctx);
/*
 * Compare the node's current interface list against the snapshot cached
 * in rec->ifaces and report whether anything changed (count, names in
 * each slot, or link state).  The new list always replaces the cached
 * snapshot before returning.  Failure to fetch interfaces is treated as
 * "changed" to stay on the safe side.
 * NOTE(review): listing is line-sampled; the `ret` flag assignments and
 * the first-run / count-changed branch structure are not fully visible
 * here.
 */
2296 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2297 struct ctdb_recoverd *rec)
2299 struct ctdb_iface_list_old *ifaces = NULL;
2300 TALLOC_CTX *mem_ctx;
2303 mem_ctx = talloc_new(NULL);
2305 /* Read the interfaces from the local node */
2306 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2307 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2308 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2309 /* We could return an error. However, this will be
2310 * rare so we'll decide that the interfaces have
2311 * actually changed, just in case.
2313 talloc_free(mem_ctx);
2318 /* We haven't been here before so things have changed */
2319 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2321 } else if (rec->ifaces->num != ifaces->num) {
2322 /* Number of interfaces has changed */
2323 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2324 rec->ifaces->num, ifaces->num));
2327 /* See if interface names or link states have changed */
2329 for (i = 0; i < rec->ifaces->num; i++) {
2330 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2331 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2333 ("Interface in slot %d changed: %s => %s\n",
2334 i, iface->name, ifaces->ifaces[i].name));
2338 if (iface->link_state != ifaces->ifaces[i].link_state) {
2340 ("Interface %s changed state: %d => %d\n",
2341 iface->name, iface->link_state,
2342 ifaces->ifaces[i].link_state));
/* replace the cached snapshot with the freshly fetched list */
2349 talloc_free(rec->ifaces);
2350 rec->ifaces = talloc_steal(rec, ifaces);
2352 talloc_free(mem_ctx);
2356 /* Check that the local allocation of public IP addresses is correct
2357 * and do some house-keeping */
/*
 * Periodic housekeeping for public IPs on this node:
 *  - on non-recmaster nodes, drop queued reallocate requests and any
 *    forced-rebalance list;
 *  - skip entirely when failover is disabled;
 *  - flag a takeover run if interfaces changed, if an unassigned IP
 *    could be hosted here, or (when do_checkpublicip is set) if the
 *    kernel's view of held IPs disagrees with CTDB's assignment;
 *  - when flagged, ask the recmaster for a takeover run via
 *    CTDB_SRVID_TAKEOVER_RUN.
 * NOTE(review): listing is line-sampled; early returns, the rd srvid/pnn
 * initialisation and several closing braces are not visible here.
 */
2358 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2359 struct ctdb_recoverd *rec,
2361 struct ctdb_node_map_old *nodemap)
2363 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2365 bool need_takeover_run = false;
2366 struct ctdb_public_ip_list_old *ips = NULL;
2368 /* If we are not the recmaster then do some housekeeping */
2369 if (rec->recmaster != pnn) {
2370 /* Ignore any IP reallocate requests - only recmaster
2373 TALLOC_FREE(rec->reallocate_requests);
2374 /* Clear any nodes that should be force rebalanced in
2375 * the next takeover run. If the recovery master role
2376 * has moved then we don't want to process these some
2377 * time in the future.
2379 TALLOC_FREE(rec->force_rebalance_nodes);
2382 /* Return early if disabled... */
2383 if (ctdb_config.failover_disabled ||
2384 ctdb_op_is_disabled(rec->takeover_run)) {
2388 if (interfaces_have_changed(ctdb, rec)) {
2389 need_takeover_run = true;
2392 /* If there are unhosted IPs but this node can host them then
2393 * trigger an IP reallocation */
2395 /* Read *available* IPs from local node */
2396 ret = ctdb_ctrl_get_public_ips_flags(
2397 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2398 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2400 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2401 talloc_free(mem_ctx);
2405 for (j=0; j<ips->num; j++) {
/* pnn == -1 means the address is currently unassigned; flags == 0
 * means this node is fully healthy and could take it */
2406 if (ips->ips[j].pnn == -1 &&
2407 nodemap->nodes[pnn].flags == 0) {
2408 DEBUG(DEBUG_WARNING,
2409 ("Unassigned IP %s can be served by this node\n",
2410 ctdb_addr_to_str(&ips->ips[j].addr)));
2411 need_takeover_run = true;
2417 if (!ctdb->do_checkpublicip) {
2421 /* Validate the IP addresses that this node has on network
2422 * interfaces. If there is an inconsistency between reality
2423 * and the state expected by CTDB then try to fix it by
2424 * triggering an IP reallocation or releasing extraneous IP
2427 /* Read *known* IPs from local node */
2428 ret = ctdb_ctrl_get_public_ips_flags(
2429 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2431 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2432 talloc_free(mem_ctx);
2436 for (j=0; j<ips->num; j++) {
2437 if (ips->ips[j].pnn == pnn) {
/* CTDB says we hold this IP - verify the kernel agrees */
2438 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2440 ("Assigned IP %s not on an interface\n",
2441 ctdb_addr_to_str(&ips->ips[j].addr)));
2442 need_takeover_run = true;
2445 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2447 ("IP %s incorrectly on an interface\n",
2448 ctdb_addr_to_str(&ips->ips[j].addr)));
2449 need_takeover_run = true;
2455 if (need_takeover_run) {
2456 struct ctdb_srvid_message rd;
2459 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2464 data.dptr = (uint8_t *)&rd;
2465 data.dsize = sizeof(rd);
2467 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2470 ("Failed to send takeover run request\n"));
2473 talloc_free(mem_ctx);
/*
 * Per-node completion callback for the async GET_NODEMAP control fan-out
 * in get_remote_nodemaps(): validates the responding pnn and steals the
 * returned nodemap blob into the remote_nodemaps array (indexed by pnn).
 */
2478 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2480 struct ctdb_node_map_old **remote_nodemaps = callback_data;
2482 if (node_pnn >= ctdb->num_nodes) {
2483 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2487 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Pull the nodemap from every active node via a broadcast GET_NODEMAP
 * control; results are filled into remote_nodemaps[] by
 * async_getnodemap_callback().  Returns non-zero on failure.
 * NOTE(review): listing is line-sampled; the error/success return values
 * are not visible here.
 */
2491 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2492 struct ctdb_node_map_old *nodemap,
2493 struct ctdb_node_map_old **remote_nodemaps)
2497 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2498 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2500 CONTROL_TIMEOUT(), false, tdb_null,
2501 async_getnodemap_callback,
2503 remote_nodemaps) != 0) {
2504 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/*
 * Sanity-check the currently known recovery master and force a new
 * election when it is invalid:
 *  - recmaster still unknown (daemon just started);
 *  - recmaster lacks CTDB_CAP_RECMASTER while we have it and are active;
 *  - recmaster pnn is beyond the nodemap (node deleted);
 *  - recmaster is disconnected/deleted;
 *  - recmaster reports itself inactive while we are active (its own
 *    flags are copied into our nodemap so we stop re-freezing it).
 * NOTE(review): listing is line-sampled; the boolean returns after each
 * branch are not visible here — presumably false after forcing an
 * election / on error and true when the recmaster is valid; confirm
 * against the full source.
 */
2512 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2513 TALLOC_CTX *mem_ctx)
2515 struct ctdb_context *ctdb = rec->ctdb;
2516 uint32_t pnn = ctdb_get_pnn(ctdb);
2517 struct ctdb_node_map_old *nodemap = rec->nodemap;
2518 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2521 /* When recovery daemon is started, recmaster is set to
2522 * "unknown" so it knows to start an election.
2524 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2526 ("Initial recovery master set - forcing election\n"));
2527 force_election(rec, pnn, nodemap);
2532 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2533 * but we have, then force an election and try to become the new
2536 if (!ctdb_node_has_capabilities(rec->caps,
2538 CTDB_CAP_RECMASTER) &&
2539 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2540 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2542 (" Current recmaster node %u does not have CAP_RECMASTER,"
2543 " but we (node %u) have - force an election\n",
2544 rec->recmaster, pnn));
2545 force_election(rec, pnn, nodemap);
2549 /* Verify that the master node has not been deleted. This
2550 * should not happen because a node should always be shutdown
2551 * before being deleted, causing a new master to be elected
2552 * before now. However, if something strange has happened
2553 * then checking here will ensure we don't index beyond the
2554 * end of the nodemap array. */
2555 if (rec->recmaster >= nodemap->num) {
2557 ("Recmaster node %u has been deleted. Force election\n",
2559 force_election(rec, pnn, nodemap);
2563 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2564 if (nodemap->nodes[rec->recmaster].flags &
2565 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2567 ("Recmaster node %u is disconnected/deleted. Force election\n",
2569 force_election(rec, pnn, nodemap);
2573 /* get nodemap from the recovery master to check if it is inactive */
2574 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2575 mem_ctx, &recmaster_nodemap);
2579 " Unable to get nodemap from recovery master %u\n",
2581 /* No election, just error */
2586 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2587 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2589 ("Recmaster node %u is inactive. Force election\n",
2592 * update our nodemap to carry the recmaster's notion of
2593 * its own flags, so that we don't keep freezing the
2594 * inactive recmaster node...
2596 nodemap->nodes[rec->recmaster].flags =
2597 recmaster_nodemap->nodes[rec->recmaster].flags;
2598 force_election(rec, pnn, nodemap);
2605 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2606 TALLOC_CTX *mem_ctx)
2609 struct ctdb_node_map_old *nodemap=NULL;
2610 struct ctdb_node_map_old **remote_nodemaps=NULL;
2611 struct ctdb_vnn_map *vnnmap=NULL;
2612 struct ctdb_vnn_map *remote_vnnmap=NULL;
2613 uint32_t num_lmasters;
2614 int32_t debug_level;
2619 /* verify that the main daemon is still running */
2620 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2621 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2625 /* ping the local daemon to tell it we are alive */
2626 ctdb_ctrl_recd_ping(ctdb);
2628 if (rec->election_timeout) {
2629 /* an election is in progress */
2633 /* read the debug level from the parent and update locally */
2634 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2636 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2639 debuglevel_set(debug_level);
2641 /* get relevant tunables */
2642 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2644 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2649 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2650 CTDB_CURRENT_NODE, &ctdb->runstate);
2652 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2656 pnn = ctdb_get_pnn(ctdb);
2659 TALLOC_FREE(rec->nodemap);
2660 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2662 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2665 nodemap = rec->nodemap;
2667 /* remember our own node flags */
2668 rec->node_flags = nodemap->nodes[pnn].flags;
2670 ban_misbehaving_nodes(rec, &self_ban);
2672 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2676 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2677 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2679 D_ERR("Failed to read recmode from local node\n");
2683 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2684 also frozen and that the recmode is set to active.
2686 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2687 /* If this node has become inactive then we want to
2688 * reduce the chances of it taking over the recovery
2689 * master role when it becomes active again. This
2690 * helps to stabilise the recovery master role so that
2691 * it stays on the most stable node.
2693 rec->priority_time = timeval_current();
2695 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2696 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2698 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2700 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2705 if (! rec->frozen_on_inactive) {
2706 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2710 (__location__ " Failed to freeze node "
2711 "in STOPPED or BANNED state\n"));
2715 rec->frozen_on_inactive = true;
2718 /* If this node is stopped or banned then it is not the recovery
2719 * master, so don't do anything. This prevents stopped or banned
2720 * node from starting election and sending unnecessary controls.
2725 rec->frozen_on_inactive = false;
2727 /* Retrieve capabilities from all connected nodes */
2728 ret = update_capabilities(rec, nodemap);
2730 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2734 if (! validate_recovery_master(rec, mem_ctx)) {
2738 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2739 /* Check if an IP takeover run is needed and trigger one if
2741 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2744 /* if we are not the recmaster then we do not need to check
2745 if recovery is needed
2747 if (pnn != rec->recmaster) {
2752 /* ensure our local copies of flags are right */
2753 ret = update_local_flags(rec, nodemap);
2755 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2759 if (ctdb->num_nodes != nodemap->num) {
2760 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2761 ctdb_load_nodes_file(ctdb);
2765 /* verify that all active nodes agree that we are the recmaster */
2766 switch (verify_recmaster(rec, nodemap, pnn)) {
2767 case MONITOR_RECOVERY_NEEDED:
2768 /* can not happen */
2770 case MONITOR_ELECTION_NEEDED:
2771 force_election(rec, pnn, nodemap);
2775 case MONITOR_FAILED:
2780 /* get the vnnmap */
2781 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2783 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2787 if (rec->need_recovery) {
2788 /* a previous recovery didn't finish */
2789 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2793 /* verify that all active nodes are in normal mode
2794 and not in recovery mode
2796 switch (verify_recmode(ctdb, nodemap)) {
2797 case MONITOR_RECOVERY_NEEDED:
2798 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2800 case MONITOR_FAILED:
2802 case MONITOR_ELECTION_NEEDED:
2803 /* can not happen */
2809 if (ctdb->recovery_lock != NULL) {
2810 /* We must already hold the recovery lock */
2811 if (!ctdb_recovery_have_lock(rec)) {
2812 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2813 ctdb_set_culprit(rec, ctdb->pnn);
2814 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2820 /* If recoveries are disabled then there is no use doing any
2821 * nodemap or flags checks. Recoveries might be disabled due
2822 * to "reloadnodes", so doing these checks might cause an
2823 * unnecessary recovery. */
2824 if (ctdb_op_is_disabled(rec->recovery)) {
2825 goto takeover_run_checks;
2828 /* get the nodemap for all active remote nodes
2830 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2831 if (remote_nodemaps == NULL) {
2832 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2835 for(i=0; i<nodemap->num; i++) {
2836 remote_nodemaps[i] = NULL;
2838 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2839 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2843 /* verify that all other nodes have the same nodemap as we have
2845 for (j=0; j<nodemap->num; j++) {
2846 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2850 if (remote_nodemaps[j] == NULL) {
2851 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2852 ctdb_set_culprit(rec, j);
2857 /* if the nodes disagree on how many nodes there are
2858 then this is a good reason to try recovery
2860 if (remote_nodemaps[j]->num != nodemap->num) {
2861 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2862 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2863 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2864 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2868 /* if the nodes disagree on which nodes exist and are
2869 active, then that is also a good reason to do recovery
2871 for (i=0;i<nodemap->num;i++) {
2872 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2873 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2874 nodemap->nodes[j].pnn, i,
2875 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2876 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2877 do_recovery(rec, mem_ctx, pnn, nodemap,
2885 * Update node flags obtained from each active node. This ensure we have
2886 * up-to-date information for all the nodes.
2888 for (j=0; j<nodemap->num; j++) {
2889 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2892 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2895 for (j=0; j<nodemap->num; j++) {
2896 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2900 /* verify the flags are consistent
2902 for (i=0; i<nodemap->num; i++) {
2903 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2907 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2908 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2909 nodemap->nodes[j].pnn,
2910 nodemap->nodes[i].pnn,
2911 remote_nodemaps[j]->nodes[i].flags,
2912 nodemap->nodes[i].flags));
2914 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2915 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2916 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2917 do_recovery(rec, mem_ctx, pnn, nodemap,
2921 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2922 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2923 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2924 do_recovery(rec, mem_ctx, pnn, nodemap,
2933 /* count how many active nodes there are */
2935 for (i=0; i<nodemap->num; i++) {
2936 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2937 if (ctdb_node_has_capabilities(rec->caps,
2938 ctdb->nodes[i]->pnn,
2939 CTDB_CAP_LMASTER)) {
2946 /* There must be the same number of lmasters in the vnn map as
2947 * there are active nodes with the lmaster capability... or
2950 if (vnnmap->size != num_lmasters) {
2951 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2952 vnnmap->size, num_lmasters));
2953 ctdb_set_culprit(rec, ctdb->pnn);
2954 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2958 /* verify that all active nodes in the nodemap also exist in
2961 for (j=0; j<nodemap->num; j++) {
2962 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2965 if (nodemap->nodes[j].pnn == pnn) {
2969 for (i=0; i<vnnmap->size; i++) {
2970 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2974 if (i == vnnmap->size) {
2975 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2976 nodemap->nodes[j].pnn));
2977 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2978 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2984 /* verify that all other nodes have the same vnnmap
2985 and are from the same generation
2987 for (j=0; j<nodemap->num; j++) {
2988 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2991 if (nodemap->nodes[j].pnn == pnn) {
2995 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2996 mem_ctx, &remote_vnnmap);
2998 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2999 nodemap->nodes[j].pnn));
3003 /* verify the vnnmap generation is the same */
3004 if (vnnmap->generation != remote_vnnmap->generation) {
3005 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3006 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3007 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3008 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3012 /* verify the vnnmap size is the same */
3013 if (vnnmap->size != remote_vnnmap->size) {
3014 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3015 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3016 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3017 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3021 /* verify the vnnmap is the same */
3022 for (i=0;i<vnnmap->size;i++) {
3023 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3024 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3025 nodemap->nodes[j].pnn));
3026 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3027 do_recovery(rec, mem_ctx, pnn, nodemap,
3034 /* FIXME: Add remote public IP checking to ensure that nodes
3035 * have the IP addresses that are allocated to them. */
3037 takeover_run_checks:
3039 /* If there are IP takeover runs requested or the previous one
3040 * failed then perform one and notify the waiters */
3041 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3042 (rec->reallocate_requests || rec->need_takeover_run)) {
3043 process_ipreallocate_requests(ctdb, rec);
/*
 * SIGTERM handler for the recovery daemon process.
 *
 * Releases the recovery (cluster mutex) lock via ctdb_recovery_unlock()
 * before terminating, so another node can take it over promptly.
 *
 * NOTE(review): this listing elides original lines 3050-3051 (the
 * "void *private_data" parameter and opening brace) and 3057 onward
 * (presumably the _exit() call) - confirm against the full source.
 */
3047 static void recd_sig_term_handler(struct tevent_context *ev,
3048 struct tevent_signal *se, int signum,
3049 int count, void *dont_care,
3052 struct ctdb_recoverd *rec = talloc_get_type_abort(
3053 private_data, struct ctdb_recoverd);
3055 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
3056 ctdb_recovery_unlock(rec);
3062 the main monitoring loop
/*
 * The recovery daemon's top-level entry point: set up the ctdb_recoverd
 * state, register all SRVID message handlers, then repeatedly run
 * main_loop(), throttled to at most once per recover_interval seconds.
 *
 * Never returns under normal operation (the surrounding loop header is
 * elided from this listing).
 *
 * NOTE(review): this is a sparse listing; several original lines are
 * missing (e.g. 3065 opening brace, 3089-3093 the NULL check on `se`,
 * 3140-3151 the loop header / mem_ctx NULL handling, 3160-3162 the
 * loop tail and closing brace) - confirm against the full source.
 */
3064 static void monitor_cluster(struct ctdb_context *ctdb)
3066 struct tevent_signal *se;
3067 struct ctdb_recoverd *rec;
3069 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3071 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3072 CTDB_NO_MEMORY_FATAL(ctdb, rec);
/* No recmaster is known yet; an election will establish one. */
3075 rec->recmaster = CTDB_UNKNOWN_PNN;
3076 rec->recovery_lock_handle = NULL;
/* Counters/state used to throttle takeover runs and recoveries. */
3078 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3079 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3081 rec->recovery = ctdb_op_init(rec, "recoveries");
3082 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3084 rec->priority_time = timeval_current();
3085 rec->frozen_on_inactive = false;
/* Release the recovery lock cleanly on SIGTERM. */
3087 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3088 recd_sig_term_handler, rec);
3090 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3094 /* register a message port for sending memory dumps */
3095 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3097 /* when a node is assigned banning credits */
3098 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3099 banning_handler, rec);
3101 /* register a message port for recovery elections */
3102 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3104 /* when nodes are disabled/enabled */
3105 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3107 /* when we are asked to push out a flag change */
3108 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3110 /* register a message port for vacuum fetch */
3111 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3113 /* register a message port for reloadnodes */
3114 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3116 /* register a message port for performing a takeover run */
3117 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3119 /* register a message port for disabling the ip check for a short while */
3120 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3122 /* register a message port for forcing a rebalance of a node next
3124 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3126 /* Register a message port for disabling takeover runs */
3127 ctdb_client_set_message_handler(ctdb,
3128 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3129 disable_takeover_runs_handler, rec);
3131 /* Register a message port for disabling recoveries */
3132 ctdb_client_set_message_handler(ctdb,
3133 CTDB_SRVID_DISABLE_RECOVERIES,
3134 disable_recoveries_handler, rec);
3136 /* register a message port for detaching database */
3137 ctdb_client_set_message_handler(ctdb,
3138 CTDB_SRVID_DETACH_DATABASE,
3139 detach_database_handler, rec);
/*
 * Main loop body (loop construct itself elided from this listing):
 * a fresh talloc context per iteration keeps main_loop()'s temporary
 * allocations from accumulating.
 */
3142 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3143 struct timeval start;
3147 DEBUG(DEBUG_CRIT,(__location__
3148 " Failed to create temp context\n"));
3152 start = timeval_current();
3153 main_loop(ctdb, rec, mem_ctx);
3154 talloc_free(mem_ctx);
3156 /* we only check for recovery once every second */
3157 elapsed = timeval_elapsed(&start);
3158 if (elapsed < ctdb->tunable.recover_interval) {
/* Sleep for the remainder of the interval (call continues on an elided line). */
3159 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3166 event handler for when the main ctdbd dies
/*
 * tevent fd handler armed on the read end of the pipe shared with the
 * main ctdbd process.  It fires when the pipe becomes readable/EOF,
 * i.e. the parent daemon has died, so the recovery daemon must exit too.
 *
 * NOTE(review): the termination call (presumably _exit()) on the line
 * after 3172 is elided from this listing - confirm against full source.
 */
3168 static void ctdb_recoverd_parent(struct tevent_context *ev,
3169 struct tevent_fd *fde,
3170 uint16_t flags, void *private_data)
3172 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3177 called regularly to verify that the recovery daemon is still running
/*
 * Periodic watchdog run in the *main* ctdbd process: verify the recovery
 * daemon child is still alive and schedule a restart if it is not.
 * Re-arms itself on ctdb->recd_ctx every 30 seconds.
 */
3179 static void ctdb_check_recd(struct tevent_context *ev,
3180 struct tevent_timer *te,
3181 struct timeval yt, void *p)
3183 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* Signal 0 probes process existence without delivering a signal. */
3185 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3186 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
/* Restart asynchronously via a zero-delay timer event. */
3188 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3189 ctdb_restart_recd, ctdb);
/* Re-arm: check again in 30 seconds (parented on recd_ctx so it is
 * cancelled when the recovery daemon context is torn down). */
3194 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3195 timeval_current_ofs(30, 0),
3196 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler in the recovery daemon: reap exited children with a
 * non-blocking waitpid(-1, ..., WNOHANG) so no zombies accumulate.
 *
 * NOTE(review): the loop construct around waitpid, the declarations of
 * `pid`/`status`, and the pid<=0 break conditions are elided from this
 * listing (gaps at 3205-3208, 3210, 3213-3216) - confirm against the
 * full source.
 */
3199 static void recd_sig_child_handler(struct tevent_context *ev,
3200 struct tevent_signal *se, int signum,
3201 int count, void *dont_care,
3204 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3209 pid = waitpid(-1, &status, WNOHANG);
/* ECHILD simply means there is nothing left to reap; anything else
 * is a genuine waitpid failure worth logging. */
3211 if (errno != ECHILD) {
3212 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3217 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3223 startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon as a child of the main ctdb daemon.
 *
 * Parent path: records the child pid, (re)creates ctdb->recd_ctx and
 * arms the ctdb_check_recd() 30-second liveness watchdog, then returns.
 *
 * Child path: seeds the PRNG, initialises logging, switches from server
 * to client mode, watches the pipe read-end so it exits when the parent
 * dies (ctdb_recoverd_parent), installs a SIGCHLD reaper, and enters
 * monitor_cluster() - which should never return.
 *
 * Returns 0 in the parent on success; error-return lines (e.g. after
 * the pipe/fork failures at 3233-3235, 3238-3240, and the parent's
 * "return 0" around 3250) are elided from this listing - confirm
 * against the full source.
 */
3225 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3228 struct tevent_signal *se;
3229 struct tevent_fd *fde;
/* The pipe lets the child detect parent death: the child holds the
 * read end, and EOF there means the parent is gone. */
3232 if (pipe(fd) != 0) {
3236 ctdb->recoverd_pid = ctdb_fork(ctdb);
3237 if (ctdb->recoverd_pid == -1) {
/* ---- parent process ---- */
3241 if (ctdb->recoverd_pid != 0) {
3242 talloc_free(ctdb->recd_ctx);
3243 ctdb->recd_ctx = talloc_new(ctdb);
3244 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3247 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3248 timeval_current_ofs(30, 0),
3249 ctdb_check_recd, ctdb);
/* ---- child process (recovery daemon) from here on ---- */
/* Mix pid and time so each recovery daemon gets a distinct seed. */
3255 srandom(getpid() ^ time(NULL));
3257 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3262 prctl_set_comment("ctdb_recoverd");
3263 if (switch_from_server_to_client(ctdb) != 0) {
3264 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3268 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3270 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3271 ctdb_recoverd_parent, &fd[0]);
3272 tevent_fd_set_auto_close(fde);
3274 /* set up a handler to pick up sigchld */
3275 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3276 recd_sig_child_handler, ctdb);
3278 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3282 monitor_cluster(ctdb);
/* monitor_cluster() should loop forever; reaching here is a bug. */
3284 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3289 shutdown the recovery daemon
/*
 * Shut down the recovery daemon child: send it SIGTERM (its handler
 * releases the recovery lock before exiting) and tear down the parent's
 * watchdog context and ping counter.  No-op if no child was started
 * (recoverd_pid == 0).
 */
3291 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3293 if (ctdb->recoverd_pid == 0) {
3297 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3298 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
/* Freeing recd_ctx also cancels the pending ctdb_check_recd timer. */
3300 TALLOC_FREE(ctdb->recd_ctx);
3301 TALLOC_FREE(ctdb->recd_ping_count);
3304 static void ctdb_restart_recd(struct tevent_context *ev,
3305 struct tevent_timer *te,
3306 struct timeval t, void *private_data)
3308 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3310 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3311 ctdb_stop_recoverd(ctdb);
3312 ctdb_start_recoverd(ctdb);