4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
49 /* List of SRVID requests that need to be processed */
/* One queued SRVID request; linked via DLIST (see srvid_request_add) */
51 struct srvid_list *next, *prev;
/* The queued message; talloc-stolen onto this list node when queued */
52 struct ctdb_srvid_message *request;
/* Container owning a list of queued requests, answered in one pass */
55 struct srvid_requests {
56 struct srvid_list *requests;
/*
 * Send the result of a completed operation back to the requester as a
 * SRVID message.  NOTE(review): some lines are elided in this view;
 * the result TDB_DATA parameter is presumably declared on an elided
 * signature line - verify against the full file.
 */
59 static void srvid_request_reply(struct ctdb_context *ctdb,
60 struct ctdb_srvid_message *request,
/* srvid == 0 is the documented "fire and forget" convention */
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request->srvid == 0) {
/* Best-effort send: log at INFO on success, ERR on failure */
69 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
71 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
75 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request->pnn,
77 (unsigned long long)request->srvid));
/*
 * Send the same result to every queued SRVID request, then free the
 * whole queue and NULL out the caller's pointer via TALLOC_FREE.
 */
83 static void srvid_requests_reply(struct ctdb_context *ctdb,
84 struct srvid_requests **requests,
/* Nothing queued - nothing to do */
89 if (*requests == NULL) {
93 for (r = (*requests)->requests; r != NULL; r = r->next) {
94 srvid_request_reply(ctdb, r->request, result);
/* Frees all list nodes too, since they are talloc children */
97 /* Free the list structure... */
98 TALLOC_FREE(*requests);
/*
 * Queue a SRVID request for a later bulk reply.  Allocates the
 * container on first use; on any allocation failure falls through to
 * the error path below and immediately replies with a failure result.
 */
101 static void srvid_request_add(struct ctdb_context *ctdb,
102 struct srvid_requests **requests,
103 struct ctdb_srvid_message *request)
105 struct srvid_list *t;
/* Lazily allocate the container the first time a request is queued */
109 if (*requests == NULL) {
110 *requests = talloc_zero(ctdb, struct srvid_requests);
111 if (*requests == NULL) {
116 t = talloc_zero(*requests, struct srvid_list);
/* Node allocation failed: don't leak a freshly-created empty container */
118 /* If *requests was just allocated above then free it */
119 if ((*requests)->requests == NULL) {
120 TALLOC_FREE(*requests);
/* Take ownership of the request so it lives as long as the queue entry */
125 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
126 DLIST_ADD((*requests)->requests, t);
/* Error path: reply immediately with a failure code (ret/result are
 * presumably declared on elided lines - verify against full file) */
131 /* Failed to add the request to the list. Send a fail. */
132 DEBUG(DEBUG_ERR, (__location__
133 " Out of memory, failed to queue SRVID request\n"));
135 result.dsize = sizeof(ret);
136 result.dptr = (uint8_t *)&ret;
137 srvid_request_reply(ctdb, request, result);
140 /* An abstraction to allow an operation (takeover runs, recoveries,
141 * ...) to be disabled for a given timeout */
142 struct ctdb_op_state {
/* Non-NULL exactly while the operation is disabled; the timer fires
 * to re-enable it (see ctdb_op_is_disabled / ctdb_op_timeout_handler) */
143 struct tevent_timer *timer;
/*
 * Allocate and initialise operation-tracking state.  "name" is used in
 * log messages ("recoveries", "takeover runs", ...).  Returns NULL on
 * allocation failure (error handling lines elided in this view).
 */
148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
150 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
153 state->in_progress = false;
160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
162 return state->timer != NULL;
/*
 * Try to start the tracked operation.  Refuses (returning false on an
 * elided line, presumably) while the operation is disabled; otherwise
 * marks it in-progress.
 */
165 static bool ctdb_op_begin(struct ctdb_op_state *state)
167 if (ctdb_op_is_disabled(state)) {
169 ("Unable to begin - %s are disabled\n", state->name));
173 state->in_progress = true;
177 static bool ctdb_op_end(struct ctdb_op_state *state)
179 return state->in_progress = false;
182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
184 return state->in_progress;
187 static void ctdb_op_enable(struct ctdb_op_state *state)
189 TALLOC_FREE(state->timer);
192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
193 struct tevent_timer *te,
194 struct timeval yt, void *p)
196 struct ctdb_op_state *state =
197 talloc_get_type(p, struct ctdb_op_state);
199 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
200 ctdb_op_enable(state);
/*
 * Disable the tracked operation for "timeout" seconds (timeout
 * parameter presumably declared on an elided signature line).  A
 * timeout of 0 appears to mean "re-enable now" (first branch below).
 * Fails while the operation is in progress.  A timer is armed to
 * re-enable automatically when the period expires.
 */
203 static int ctdb_op_disable(struct ctdb_op_state *state,
204 struct tevent_context *ev,
/* Presumably the timeout == 0 branch: immediate re-enable */
208 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
209 ctdb_op_enable(state);
/* Refuse to disable while the operation is running */
213 if (state->in_progress) {
215 ("Unable to disable %s - in progress\n", state->name));
219 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
220 state->name, timeout));
/* Replace any previously-armed re-enable timer */
222 /* Clear any old timers */
223 talloc_free(state->timer);
225 /* Arrange for the timeout to occur */
226 state->timer = tevent_add_timer(ev, state,
227 timeval_current_ofs(timeout, 0),
228 ctdb_op_timeout_handler, state);
229 if (state->timer == NULL) {
230 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
/* Per-node misbehaviour accounting; a count field is presumably also
 * declared here on an elided line (ban_state->count is used below) */
237 struct ctdb_banning_state {
/* When this node last earned a banning credit; used to expire old sins */
239 struct timeval last_reported_time;
/* Opaque handle for the recovery lock; defined further down */
242 struct ctdb_recovery_lock_handle;
/*
245 private state of recovery daemon
 */
247 struct ctdb_recoverd {
248 struct ctdb_context *ctdb;
/* Most recent node blamed for triggering a recovery */
250 uint32_t last_culprit_node;
251 struct ctdb_node_map_old *nodemap;
/* Used as an election tie-breaker (see struct election_message) */
252 struct timeval priority_time;
/* Set when IP takeover must be (re)run */
253 bool need_takeover_run;
256 struct tevent_timer *send_election_te;
/* Non-NULL while an election is in progress (see ctdb_wait_election) */
257 struct tevent_timer *election_timeout;
/* Queued SRVID requests waiting for the next IP reallocation */
258 struct srvid_requests *reallocate_requests;
/* Disable/in-progress tracking for the two big operations */
259 struct ctdb_op_state *takeover_run;
260 struct ctdb_op_state *recovery;
261 struct ctdb_iface_list_old *ifaces;
/* Nodes to force-rebalance on the next takeover run */
262 uint32_t *force_rebalance_nodes;
263 struct ctdb_node_capabilities *caps;
264 bool frozen_on_inactive;
/* Non-NULL while we hold (or are taking) the recovery lock */
265 struct ctdb_recovery_lock_handle *recovery_lock_handle;
/* Standard timeouts for controls and monitoring, driven by tunables */
268 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
269 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration: restarts the recovery daemon (defined later) */
271 static void ctdb_restart_recd(struct tevent_context *ev,
272 struct tevent_timer *te, struct timeval t,
/*
276 ban a node for a period of time
 */
278 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
281 struct ctdb_context *ctdb = rec->ctdb;
282 struct ctdb_ban_state bantime;
/* Reject obviously invalid node numbers before issuing the control */
284 if (!ctdb_validate_pnn(ctdb, pnn)) {
285 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
289 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
292 bantime.time = ban_time;
/* Failure is logged but not propagated - this is best-effort */
294 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
296 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Outcome of a monitoring pass over the cluster */
302 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/*
306 remember the trouble maker
 */
308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
310 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
311 struct ctdb_banning_state *ban_state;
/* NOTE(review): if ctdb->nodes[] holds num_nodes entries then this
 * check should arguably be ">=" - culprit == num_nodes would pass and
 * index one past the end below.  Verify against the nodes[] allocation. */
313 if (culprit > ctdb->num_nodes) {
314 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
318 /* If we are banned or stopped, do not set other nodes as culprits */
319 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
320 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
/* Lazily allocate per-node banning state on first blame */
324 if (ctdb->nodes[culprit]->ban_state == NULL) {
325 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
326 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
330 ban_state = ctdb->nodes[culprit]->ban_state;
/* Credits decay: after a quiet grace period the slate is wiped clean */
331 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
332 /* this was the first time in a long while this node
333 misbehaved so we will forgive any old transgressions.
335 ban_state->count = 0;
338 ban_state->count += count;
339 ban_state->last_reported_time = timeval_current();
340 rec->last_culprit_node = culprit;
344 remember the trouble maker
346 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
348 ctdb_set_culprit_count(rec, culprit, 1);
/*
352 Retrieve capabilities from all connected nodes
 */
354 static int update_capabilities(struct ctdb_recoverd *rec,
355 struct ctdb_node_map_old *nodemap)
359 struct ctdb_node_capabilities *caps;
360 struct ctdb_context *ctdb = rec->ctdb;
362 tmp_ctx = talloc_new(rec);
363 CTDB_NO_MEMORY(ctdb, tmp_ctx);
/* Query every node in the map for its capability flags */
365 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
366 CONTROL_TIMEOUT(), nodemap);
370 (__location__ " Failed to get node capabilities\n"));
371 talloc_free(tmp_ctx);
/* Our own entry must be present for the result to be usable */
375 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
379 " Capabilities don't include current node.\n"));
380 talloc_free(tmp_ctx);
/* Cache our own capabilities and take ownership of the full set */
383 ctdb->capabilities = *capp;
385 TALLOC_FREE(rec->caps);
386 rec->caps = talloc_steal(rec, caps);
388 talloc_free(tmp_ctx);
/*
393 change recovery mode on all nodes
 */
395 static int set_recovery_mode(struct ctdb_context *ctdb,
396 struct ctdb_recoverd *rec,
397 struct ctdb_node_map_old *nodemap,
404 tmp_ctx = talloc_new(ctdb);
405 CTDB_NO_MEMORY(ctdb, tmp_ctx);
/* Broadcast SET_RECMODE to all active nodes (self included) */
407 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
409 data.dsize = sizeof(uint32_t);
410 data.dptr = (unsigned char *)&rec_mode;
412 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
418 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
419 talloc_free(tmp_ctx);
423 talloc_free(tmp_ctx);
/*
428 ensure all other nodes have attached to any databases that we have
 */
430 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
431 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
434 struct ctdb_dbid_map_old *remote_dbmap;
436 /* verify that all other nodes have all our databases */
437 for (j=0; j<nodemap->num; j++) {
/* we don't need to check ourselves */
439 if (nodemap->nodes[j].pnn == pnn) {
442 /* don't check nodes that are unavailable */
443 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* Fetch the remote node's current database map */
447 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
448 mem_ctx, &remote_dbmap);
450 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
454 /* step through all local databases */
455 for (db=0; db<dbmap->num;db++) {
/* Linear scan of remote map for a matching db_id */
459 for (i=0;i<remote_dbmap->num;i++) {
460 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
464 /* the remote node already have this database */
465 if (i!=remote_dbmap->num) {
468 /* ok so we need to create this database */
469 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
470 dbmap->dbs[db].db_id, mem_ctx,
473 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
/* Create it remotely with the same flags as our local copy */
476 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
477 nodemap->nodes[j].pnn,
479 dbmap->dbs[db].flags, NULL);
481 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
/*
492 ensure we are attached to any databases that anyone else is attached to
 */
494 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
495 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
498 struct ctdb_dbid_map_old *remote_dbmap;
500 /* verify that we have all database any other node has */
501 for (j=0; j<nodemap->num; j++) {
/* we don't need to check ourselves */
503 if (nodemap->nodes[j].pnn == pnn) {
506 /* don't check nodes that are unavailable */
507 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* Fetch the remote node's database map */
511 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
512 mem_ctx, &remote_dbmap);
514 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
518 /* step through all databases on the remote node */
519 for (db=0; db<remote_dbmap->num;db++) {
/* Linear scan of our own map for a matching db_id */
522 for (i=0;i<(*dbmap)->num;i++) {
523 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
527 /* we already have this db locally */
528 if (i!=(*dbmap)->num) {
531 /* ok so we need to create this database and
534 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
535 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
537 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
538 nodemap->nodes[j].pnn));
/* Attach locally with the remote database's flags */
541 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn,
543 remote_dbmap->dbs[db].flags, NULL);
545 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* Refresh *dbmap so the caller sees the newly-attached database */
548 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
550 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
/*
560 update flags on all active nodes
 */
562 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
/* Single MODFLAGS control: set "flags", clear everything else */
566 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
568 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/*
576 called when a vacuum fetch has completed - just free it and do the next one
 */
/* Body elided in this view - presumably frees the call state */
578 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
/*
 * Process one element of the vacuum fetch list:
586 * Migrate it over to us with the special flag
587 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
 */
589 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
591 struct ctdb_rec_data_old *r)
593 struct ctdb_client_call_state *state;
595 struct ctdb_ltdb_header *hdr;
596 struct ctdb_call call;
/* NULL_FUNC: we only want the migration side effect, not a real call */
599 call.call_id = CTDB_NULL_FUNC;
600 call.flags = CTDB_IMMEDIATE_MIGRATION;
601 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
/* Key bytes live at the start of the packed record's data area */
603 call.key.dptr = &r->data[0];
604 call.key.dsize = r->keylen;
606 /* ensure we don't block this daemon - just skip a record if we can't get
/* Non-blocking chainlock: contention means skip, not stall */
608 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
612 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
613 if (data.dptr == NULL) {
614 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
/* Record too small to even hold the ltdb header - skip it */
618 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
620 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
624 hdr = (struct ctdb_ltdb_header *)data.dptr;
625 if (hdr->dmaster == pnn) {
626 /* its already local */
628 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
/* Fire off the migration; unlock before checking the result */
634 state = ctdb_call_send(ctdb_db, &call);
635 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
637 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
/* Completion just frees the state; no payload is needed back */
640 state->async.fn = vacuum_fetch_callback;
641 state->async.private_data = NULL;
/*
648 handler for vacuum fetch
 */
650 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
653 struct ctdb_recoverd *rec = talloc_get_type(
654 private_data, struct ctdb_recoverd);
655 struct ctdb_context *ctdb = rec->ctdb;
656 struct ctdb_marshall_buffer *recs;
658 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
660 struct ctdb_dbid_map_old *dbmap=NULL;
661 uint8_t db_flags = 0;
662 struct ctdb_db_context *ctdb_db;
663 struct ctdb_rec_data_old *r;
/* The message payload is a marshalled buffer of records to migrate */
665 recs = (struct ctdb_marshall_buffer *)data.dptr;
667 if (recs->count == 0) {
671 /* work out if the database is persistent */
672 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
674 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
/* Look up this db_id in the local map to recover its flags */
678 for (i=0;i<dbmap->num;i++) {
679 if (dbmap->dbs[i].db_id == recs->db_id) {
680 db_flags = dbmap->dbs[i].flags;
684 if (i == dbmap->num) {
685 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
689 /* find the name of this database */
690 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
691 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
/* Attach so we have a ctdb_db context to migrate records into */
696 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, db_flags);
697 if (ctdb_db == NULL) {
698 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
/* Walk the packed records; each entry is advanced by r->length bytes */
702 r = (struct ctdb_rec_data_old *)&recs->data[0];
703 while (recs->count) {
706 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
711 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
716 talloc_free(tmp_ctx);
/*
 * handler for database detach
 */
723 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
726 struct ctdb_recoverd *rec = talloc_get_type(
727 private_data, struct ctdb_recoverd);
728 struct ctdb_context *ctdb = rec->ctdb;
730 struct ctdb_db_context *ctdb_db;
/* Payload must be exactly one 32-bit database id */
732 if (data.dsize != sizeof(db_id)) {
735 db_id = *(uint32_t *)data.dptr;
737 ctdb_db = find_ctdb_db(ctdb, db_id);
738 if (ctdb_db == NULL) {
739 /* database is not attached */
/* Unlink from the daemon's db list, then free the context */
743 DLIST_REMOVE(ctdb->db_list, ctdb_db);
745 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
747 talloc_free(ctdb_db);
/*
751 called when ctdb_wait_timeout should finish
 */
753 static void ctdb_wait_handler(struct tevent_context *ev,
754 struct tevent_timer *te,
755 struct timeval yt, void *p)
/* Presumably sets *timed_out = 1 on an elided line, releasing the
 * ctdb_wait_timeout() event loop below - verify against full file */
757 uint32_t *timed_out = (uint32_t *)p;
/*
762 wait for a given number of seconds
 */
764 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
766 uint32_t timed_out = 0;
/* Split the fractional part of "secs" into microseconds for the timer */
767 time_t usecs = (secs - (time_t)secs) * 1000000;
768 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
769 ctdb_wait_handler, &timed_out);
/* Pump the event loop (presumably until timed_out is set; the loop
 * condition line is elided in this view) */
771 tevent_loop_once(ctdb->ev);
/*
776 called when an election times out (ends)
 */
778 static void ctdb_election_timeout(struct tevent_context *ev,
779 struct tevent_timer *te,
780 struct timeval t, void *p)
782 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
/* Clearing this pointer is what lets ctdb_wait_election() return */
783 rec->election_timeout = NULL;
786 DEBUG(DEBUG_WARNING,("Election period ended\n"));
791 wait for an election to finish. It finished election_timeout seconds after
792 the last election packet is received
794 static void ctdb_wait_election(struct ctdb_recoverd *rec)
796 struct ctdb_context *ctdb = rec->ctdb;
797 while (rec->election_timeout) {
798 tevent_loop_once(ctdb->ev);
/*
803 Update our local flags from all remote connected nodes.
 This is only run when we are, or believe we are, the recovery master
 */
806 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
809 struct ctdb_context *ctdb = rec->ctdb;
810 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
812 /* get the nodemap for all active remote nodes and verify
813 they are the same as for this node
815 for (j=0; j<nodemap->num; j++) {
816 struct ctdb_node_map_old *remote_nodemap=NULL;
/* Skip disconnected nodes and ourselves */
819 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
822 if (nodemap->nodes[j].pnn == ctdb->pnn) {
826 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
827 mem_ctx, &remote_nodemap);
829 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
830 nodemap->nodes[j].pnn));
/* A node we cannot query earns a banning credit */
831 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
832 talloc_free(mem_ctx);
/* Divergence: the remote node's own flags win */
835 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
836 /* We should tell our daemon about this so it
837 updates its flags or else we will log the same
838 message again in the next iteration of recovery.
839 Since we are the recovery master we can just as
840 well update the flags on all nodes.
842 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
844 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
848 /* Update our local copy of the flags in the recovery
851 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
852 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
853 nodemap->nodes[j].flags));
854 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
856 talloc_free(remote_nodemap);
858 talloc_free(mem_ctx);
863 /* Create a new random generation id.
864 The generation id can not be the INVALID_GENERATION id
 */
866 static uint32_t new_generation(void)
/* Presumably inside a retry loop (elided): keep drawing until the
 * value differs from INVALID_GENERATION - verify against full file */
871 generation = random();
873 if (generation != INVALID_GENERATION) {
881 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
883 return (rec->recovery_lock_handle != NULL);
/* State for an in-flight or held recovery lock (locked/done/latency
 * fields are presumably declared on elided lines - see usage below) */
886 struct ctdb_recovery_lock_handle {
/* The underlying cluster mutex; freed to release or cancel the lock */
890 struct ctdb_cluster_mutex_handle *h;
/*
 * Callback from the cluster mutex helper reporting the outcome of the
 * recovery lock attempt.  "status" is a single character: '0' means
 * the lock was taken; other values are mapped to errors below.
 */
893 static void take_reclock_handler(char status,
897 struct ctdb_recovery_lock_handle *s =
898 (struct ctdb_recovery_lock_handle *) private_data;
900 s->locked = (status == '0') ;
903 * If unsuccessful then ensure the process has exited and that
904 * the file descriptor event handler has been cancelled
/* Record how long the lock acquisition took, for latency reporting */
912 s->latency = latency;
916 D_ERR("Unable to take recovery lock - contention\n");
920 D_ERR("Unable to take recovery lock - timeout\n");
924 D_ERR("Unable to take recover lock - unknown error\n");
/* Forward declaration (a pnn parameter line appears to be elided here -
 * the call below passes three arguments) */
930 static void force_election(struct ctdb_recoverd *rec,
932 struct ctdb_node_map_old *nodemap);
/*
 * Called when the recovery lock helper process dies unexpectedly: we
 * have lost the lock, so drop our handle and force a new election.
 */
934 static void lost_reclock_handler(void *private_data)
936 struct ctdb_recoverd *rec = talloc_get_type_abort(
937 private_data, struct ctdb_recoverd);
939 D_ERR("Recovery lock helper terminated, triggering an election\n");
940 TALLOC_FREE(rec->recovery_lock_handle);
942 force_election(rec, ctdb_get_pnn(rec->ctdb), rec->nodemap);
/*
 * Attempt to take the recovery lock via the cluster mutex helper.
 * Blocks on the event loop until the attempt completes; reports the
 * acquisition latency to the main daemon afterwards.
 */
945 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
947 struct ctdb_context *ctdb = rec->ctdb;
948 struct ctdb_cluster_mutex_handle *h;
949 struct ctdb_recovery_lock_handle *s;
951 s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
953 DBG_ERR("Memory allocation error\n");
/* Spawn the mutex helper; callbacks fill in s->locked / s->done */
957 h = ctdb_cluster_mutex(s,
961 take_reclock_handler,
963 lost_reclock_handler,
/* Publish the handle so ctdb_recovery_have_lock()/unlock() see it */
970 rec->recovery_lock_handle = s;
/* Pump events until the take attempt finishes (condition elided) */
974 tevent_loop_once(ctdb->ev);
/* Attempt failed: drop the handle again */
978 TALLOC_FREE(rec->recovery_lock_handle);
982 ctdb_ctrl_report_recd_lock_latency(ctdb,
/*
 * Release the recovery lock, or cancel an in-progress attempt to take
 * it.  Safe to call when the lock is not held.
 */
989 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
991 if (rec->recovery_lock_handle == NULL) {
995 if (! rec->recovery_lock_handle->done) {
997 * Taking of recovery lock still in progress. Free
998 * the cluster mutex handle to release it but leave
999 * the recovery lock handle in place to allow taking
1000 * of the lock to fail.
1002 D_NOTICE("Cancelling recovery lock\n");
1003 TALLOC_FREE(rec->recovery_lock_handle->h);
1004 rec->recovery_lock_handle->done = true;
1005 rec->recovery_lock_handle->locked = false;
/* Held lock: freeing the handle releases the underlying mutex */
1009 D_NOTICE("Releasing recovery lock\n");
1010 TALLOC_FREE(rec->recovery_lock_handle);
/*
 * Ban any node whose accumulated banning credits reach the threshold
 * (2 * num_nodes).  Sets *self_ban (on an elided line, presumably)
 * when the banned node turns out to be this node itself.
 */
1013 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1015 struct ctdb_context *ctdb = rec->ctdb;
1017 struct ctdb_banning_state *ban_state;
1020 for (i=0; i<ctdb->num_nodes; i++) {
/* No banning state means the node has never misbehaved */
1021 if (ctdb->nodes[i]->ban_state == NULL) {
1024 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* Below threshold: leave the node alone */
1025 if (ban_state->count < 2*ctdb->num_nodes) {
1029 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1030 ctdb->nodes[i]->pnn, ban_state->count,
1031 ctdb->tunable.recovery_ban_period));
1032 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
/* Reset credits once the ban has been issued */
1033 ban_state->count = 0;
1035 /* Banning ourself? */
1036 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/* State for a spawned helper process (fields elided in this view:
 * fd pipe, pid, done flag and result are all used by helper_run) */
1042 struct helper_state {
/*
 * fd event callback: the helper wrote its exit result to the pipe.
 * A short or failed read is treated as a broken pipe (EPIPE).
 */
1049 static void helper_handler(struct tevent_context *ev,
1050 struct tevent_fd *fde,
1051 uint16_t flags, void *private_data)
1053 struct helper_state *state = talloc_get_type_abort(
1054 private_data, struct helper_state);
1057 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1058 if (ret != sizeof(state->result)) {
1059 state->result = EPIPE;
/*
 * Spawn an external helper ("prog") and block on the event loop until
 * it reports a result over a pipe.  Aborts early if the recmaster
 * changes while waiting (we lost the election) or on helper failure,
 * killing the child in the cleanup path.  "type" is only used in log
 * messages.
 */
1065 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1066 const char *prog, const char *arg, const char *type)
1068 struct helper_state *state;
1069 struct tevent_fd *fde;
/* Snapshot the current recmaster to detect a lost election later */
1072 uint32_t recmaster = rec->recmaster;
1074 state = talloc_zero(mem_ctx, struct helper_state);
1075 if (state == NULL) {
1076 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* Pipe over which the helper reports its integer result */
1082 ret = pipe(state->fd);
1085 ("Failed to create pipe for %s helper\n", type));
1089 set_close_on_exec(state->fd[0]);
/* argv: [0]=write-fd, [1]=daemon socket, [2]=extra arg (if any) */
1092 args = talloc_array(state, const char *, nargs);
1094 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1098 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1099 if (args[0] == NULL) {
1100 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1103 args[1] = rec->ctdb->daemon.name;
1107 if (args[2] == NULL) {
1111 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1112 if (state->pid == -1) {
1114 ("Failed to create child for %s helper\n", type));
/* Parent does not need the write end once the child holds it */
1118 close(state->fd[1]);
1121 state->done = false;
1123 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1124 TEVENT_FD_READ, helper_handler, state);
1128 tevent_fd_set_auto_close(fde);
/* Block here until helper_handler marks the run complete */
1130 while (!state->done) {
1131 tevent_loop_once(rec->ctdb->ev);
1133 /* If recmaster changes, we have lost election */
1134 if (recmaster != rec->recmaster) {
1135 D_ERR("Recmaster changed to %u, aborting %s\n",
1136 rec->recmaster, type);
1142 close(state->fd[0]);
1145 if (state->result != 0) {
/* Error/abort cleanup: make sure the child does not outlive us */
1149 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1154 if (state->fd[0] != -1) {
1155 close(state->fd[0]);
1157 if (state->fd[1] != -1) {
1158 close(state->fd[1]);
1160 if (state->pid != -1) {
1161 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/*
 * Run the IP takeover helper.  The optional force-rebalance node list
 * is passed as a comma-separated argument.  The helper path is
 * resolved once and cached in the function-static "prog" buffer.
 */
1168 static int ctdb_takeover(struct ctdb_recoverd *rec,
1169 uint32_t *force_rebalance_nodes)
1171 static char prog[PATH_MAX+1] = "";
1175 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
1176 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
1177 "ctdb_takeover_helper")) {
1178 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
/* Build "pnn,pnn,..." - first element starts the string, the rest append */
1182 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1183 uint32_t pnn = force_rebalance_nodes[i];
1185 arg = talloc_asprintf(rec, "%u", pnn);
1187 arg = talloc_asprintf_append(arg, ",%u", pnn);
1190 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* Propagate the config-file setting to the helper via environment */
1195 if (ctdb_config.failover_disabled) {
1196 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
1198 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
1203 return helper_run(rec, rec, prog, arg, "takeover");
/*
 * Perform a full IP takeover run: temporarily disable takeover runs on
 * all other connected nodes, run the takeover helper, then re-enable.
 * Returns success/failure; on failure need_takeover_run stays set so
 * the run is retried.
 */
1206 static bool do_takeover_run(struct ctdb_recoverd *rec,
1207 struct ctdb_node_map_old *nodemap)
1209 uint32_t *nodes = NULL;
1210 struct ctdb_disable_message dtr;
1213 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1217 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
/* Guard against re-entrancy via the op-state tracker */
1219 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1220 DEBUG(DEBUG_ERR, (__location__
1221 " takeover run already in progress \n"));
1226 if (!ctdb_op_begin(rec->takeover_run)) {
1231 /* Disable IP checks (takeover runs, really) on other nodes
1232 * while doing this takeover run. This will stop those other
1233 * nodes from triggering takeover runs when think they should
1234 * be hosting an IP but it isn't yet on an interface. Don't
1235 * wait for replies since a failure here might cause some
1236 * noise in the logs but will not actually cause a problem.
1239 dtr.srvid = 0; /* No reply */
1242 data.dptr = (uint8_t*)&dtr;
1243 data.dsize = sizeof(dtr);
1245 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1247 /* Disable for 60 seconds. This can be a tunable later if
/* Best-effort broadcast: failures only logged at INFO */
1251 for (i = 0; i < talloc_array_length(nodes); i++) {
1252 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1253 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1255 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
/* The actual work: spawn the takeover helper */
1259 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1261 /* Reenable takeover runs and IP checks on other nodes */
1263 for (i = 0; i < talloc_array_length(nodes); i++) {
1264 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1265 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1267 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1272 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1278 /* Takeover run was successful so clear force rebalance targets */
/* Only clear if nobody swapped in a new list while we were running */
1279 if (rebalance_nodes == rec->force_rebalance_nodes) {
1280 TALLOC_FREE(rec->force_rebalance_nodes);
1282 DEBUG(DEBUG_WARNING,
1283 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1286 rec->need_takeover_run = !ok;
1288 ctdb_op_end(rec->takeover_run);
1290 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
/*
 * Run the parallel database recovery helper with a freshly generated
 * generation id as its argument.
 */
1294 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1296 static char prog[PATH_MAX+1] = "";
1299 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1300 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1301 "ctdb_recovery_helper")) {
1302 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1305 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1307 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* NOTE(review): setenv's return value is not checked here, unlike the
 * CTDB_DISABLE_IP_FAILOVER path in ctdb_takeover() */
1311 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1313 return helper_run(rec, mem_ctx, prog, arg, "recovery");
/*
1317 we are the recmaster, and recovery is needed - start a recovery run
 *
 * High-level sequence (per the visible code): verify we are still the
 * recmaster, begin the recovery op, ban misbehaving nodes, take the
 * recovery lock if configured, create missing databases locally and
 * remotely, sync node flags, run the parallel db recovery helper, do
 * a takeover run, broadcast RECONFIGURE, forgive old banning credits
 * and finally suppress re-recovery for rerecovery_timeout seconds.
 */
1319 static int do_recovery(struct ctdb_recoverd *rec,
1320 TALLOC_CTX *mem_ctx, uint32_t pnn,
1321 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1323 struct ctdb_context *ctdb = rec->ctdb;
1325 struct ctdb_dbid_map_old *dbmap;
1328 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1330 /* Check if the current node is still the recmaster. It's possible that
1331 * re-election has changed the recmaster.
1333 if (pnn != rec->recmaster) {
1335 ("Recovery master changed to %u, aborting recovery\n",
1340 /* if recovery fails, force it again */
1341 rec->need_recovery = true;
1343 if (!ctdb_op_begin(rec->recovery)) {
1347 if (rec->election_timeout) {
1348 /* an election is in progress */
1349 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1353 ban_misbehaving_nodes(rec, &self_ban);
1355 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
/* Recovery lock is optional; only attempt when configured */
1359 if (ctdb->recovery_lock != NULL) {
1360 if (ctdb_recovery_have_lock(rec)) {
1361 D_NOTICE("Already holding recovery lock\n");
1365 D_NOTICE("Attempting to take recovery lock (%s)\n",
1366 ctdb->recovery_lock);
1368 ok = ctdb_recovery_lock(rec);
1370 D_ERR("Unable to take recovery lock\n");
/* Lost the election while trying for the lock: just back out */
1372 if (pnn != rec->recmaster) {
1373 D_NOTICE("Recovery master changed to %u,"
1374 " aborting recovery\n",
1376 rec->need_recovery = false;
/* During first recovery, lock contention may just mean the
 * other node has not learned the recmaster yet - retry */
1380 if (ctdb->runstate ==
1381 CTDB_RUNSTATE_FIRST_RECOVERY) {
1383 * First recovery? Perhaps
1384 * current node does not yet
1385 * know who the recmaster is.
1387 D_ERR("Retrying recovery\n");
1391 D_ERR("Abort recovery, "
1392 "ban this node for %u seconds\n",
1393 ctdb->tunable.recovery_ban_period);
1396 ctdb->tunable.recovery_ban_period);
1399 D_NOTICE("Recovery lock taken successfully\n");
1403 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1405 /* get a list of all databases */
1406 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1408 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1412 /* we do the db creation before we set the recovery mode, so the freeze happens
1413 on all databases we will be dealing with. */
1415 /* verify that we have all the databases any other node has */
1416 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1418 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1422 /* verify that all other nodes have all our databases */
1423 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1425 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1428 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1431 /* Retrieve capabilities from all connected nodes */
1432 ret = update_capabilities(rec, nodemap);
1434 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
/*
1439 update all nodes to have the same flags that we have
 */
1441 for (i=0;i<nodemap->num;i++) {
1442 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1446 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
/* Flag-update failure on an inactive node is only a warning */
1448 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1449 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1451 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1457 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1459 ret = db_recovery_parallel(rec, mem_ctx);
1464 do_takeover_run(rec, nodemap);
1466 /* send a message to all clients telling them that the cluster
1467 has been reconfigured */
1468 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1469 CTDB_SRVID_RECONFIGURE, tdb_null);
1471 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1475 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1477 rec->need_recovery = false;
1478 ctdb_op_end(rec->recovery);
1480 /* we managed to complete a full recovery, make sure to forgive
1481 any past sins by the nodes that could now participate in the
1484 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1485 for (i=0;i<nodemap->num;i++) {
1486 struct ctdb_banning_state *ban_state;
1488 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1492 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1493 if (ban_state == NULL) {
1497 ban_state->count = 0;
1500 /* We just finished a recovery successfully.
1501 We now wait for rerecovery_timeout before we allow
1502 another recovery to take place.
1504 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1505 ctdb_op_disable(rec->recovery, ctdb->ev,
1506 ctdb->tunable.rerecovery_timeout);
/* Error path: end the op without clearing need_recovery (elided) */
1510 ctdb_op_end(rec->recovery);
1516 elections are won by first checking the number of connected nodes, then
1517 the priority time, then the pnn
/* Ballot exchanged between recovery daemons during a recmaster
 * election (sent on CTDB_SRVID_ELECTION).
 * NOTE(review): this chunk is a sparse extract - the embedded original
 * line numbers are not C, and some members (the sender's pnn) and the
 * closing brace are not visible here.
 */
1519 struct election_message {
/* number of connected nodes seen by the sender; compared first */
1520 uint32_t num_connected;
/* time the sender's recovery daemon started; used as tie-breaker */
1521 struct timeval priority_time;
/* sender's node flags (banned/stopped nodes lose the election) */
1523 uint32_t node_flags;
1527 form this nodes election data
/*
 * Fill *em with this node's election ballot: our pnn, the time this
 * recovery daemon started (priority_time), our node flags and the
 * number of connected nodes from a freshly fetched local nodemap.
 * NOTE(review): several lines (declarations, braces, error return)
 * are missing from this extract.
 */
1529 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1532 struct ctdb_node_map_old *nodemap;
1533 struct ctdb_context *ctdb = rec->ctdb;
1537 em->pnn = rec->ctdb->pnn;
1538 em->priority_time = rec->priority_time;
1540 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1542 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
/* cache our own flags on rec for later election decisions */
1546 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1547 em->node_flags = rec->node_flags;
1549 for (i=0;i<nodemap->num;i++) {
1550 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1551 em->num_connected++;
1555 /* we shouldnt try to win this election if we cant be a recmaster */
1556 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
/* degrade our ballot so any recmaster-capable node beats us */
1557 em->num_connected = 0;
1558 em->priority_time = timeval_current();
1561 talloc_free(nodemap);
1565 see if the given election data wins
/*
 * Compare the remote ballot *em against our own freshly built ballot
 * and decide whether we should contest the election.  Visible ordering
 * of checks: our capability, our banned/stopped flags, the other
 * node's banned/stopped flags, then priority time and pnn as
 * tie-breakers.  NOTE(review): the return statements between the guard
 * clauses are missing from this extract.
 */
1567 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1569 struct election_message myem;
1572 ctdb_election_data(rec, &myem);
1574 /* we cant win if we don't have the recmaster capability */
1575 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1579 /* we cant win if we are banned */
1580 if (rec->node_flags & NODE_FLAGS_BANNED) {
1584 /* we cant win if we are stopped */
1585 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1589 /* we will automatically win if the other node is banned */
1590 if (em->node_flags & NODE_FLAGS_BANNED) {
/* NOTE(review): the comment below says "banned" but the check is
 * for NODE_FLAGS_STOPPED - the comment looks stale */
1594 /* we will automatically win if the other node is banned */
1595 if (em->node_flags & NODE_FLAGS_STOPPED) {
1599 /* then the longest running node */
1601 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break on pnn */
1605 cmp = (int)myem.pnn - (int)em->pnn;
1612 send out an election request
/*
 * Broadcast our election ballot to all nodes.  We first optimistically
 * set ourself as recmaster on the local node, then send the ballot on
 * CTDB_SRVID_ELECTION via CTDB_BROADCAST_ALL.  Returns the result of
 * the broadcast send.  NOTE(review): the error-return path after the
 * setrecmaster failure DEBUG is missing from this extract.
 */
1614 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1617 TDB_DATA election_data;
1618 struct election_message emsg;
1620 struct ctdb_context *ctdb = rec->ctdb;
1622 srvid = CTDB_SRVID_ELECTION;
1624 ctdb_election_data(rec, &emsg);
/* ballot is sent as the raw struct - sender and receiver must agree
 * on layout */
1626 election_data.dsize = sizeof(struct election_message);
1627 election_data.dptr = (unsigned char *)&emsg;
1630 /* first we assume we will win the election and set
1631 recoverymaster to be ourself on the current node
1633 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1634 CTDB_CURRENT_NODE, pnn);
1636 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1639 rec->recmaster = pnn;
1641 /* send an election message to all active nodes */
1642 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1643 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1647 we think we are winning the election - send a broadcast election request
/*
 * tevent timer callback: while we believe we are winning the election,
 * (re)broadcast our election request, then drop the one-shot timer
 * reference so a new broadcast can be scheduled later.
 */
1649 static void election_send_request(struct tevent_context *ev,
1650 struct tevent_timer *te,
1651 struct timeval t, void *p)
1653 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1656 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1658 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
/* one-shot timer: clear our handle (frees the tevent_timer) */
1661 TALLOC_FREE(rec->send_election_te);
1665 handler for memory dumps
/*
 * SRVID handler: produce a talloc memory dump of this recovery daemon
 * and send it back to the requester identified by the
 * ctdb_srvid_message carried in `data` (pnn + reply srvid).
 * All temporaries hang off tmp_ctx, which is freed on every visible
 * exit path.  NOTE(review): several lines (returns, `if (ret != 0)`
 * headers, `dump` declaration) are missing from this extract.
 */
1667 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1669 struct ctdb_recoverd *rec = talloc_get_type(
1670 private_data, struct ctdb_recoverd);
1671 struct ctdb_context *ctdb = rec->ctdb;
1672 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1675 struct ctdb_srvid_message *rd;
/* validate the return-address payload before trusting it */
1677 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1678 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1679 talloc_free(tmp_ctx);
1682 rd = (struct ctdb_srvid_message *)data.dptr;
1684 dump = talloc_zero(tmp_ctx, TDB_DATA);
1686 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1687 talloc_free(tmp_ctx);
1690 ret = ctdb_dump_memory(ctdb, dump);
1692 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1693 talloc_free(tmp_ctx);
1697 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* reply to the original sender on its private srvid */
1699 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1701 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1702 talloc_free(tmp_ctx);
1706 talloc_free(tmp_ctx);
1710 handler for reload_nodes
/*
 * SRVID handler: re-read the nodes file into the recovery daemon's
 * ctdb context on request.
 */
1712 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1715 struct ctdb_recoverd *rec = talloc_get_type(
1716 private_data, struct ctdb_recoverd);
1718 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1720 ctdb_load_nodes_file(rec->ctdb);
/*
 * SRVID handler: append a node (pnn carried as a uint32_t payload) to
 * rec->force_rebalance_nodes so the next takeover run force-rebalances
 * IPs to it.  Ignored unless we are the recovery master.  Duplicates
 * may be appended; per the comment below they are harmless.
 * NOTE(review): returns and some declarations are missing from this
 * extract.
 */
1724 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1727 struct ctdb_recoverd *rec = talloc_get_type(
1728 private_data, struct ctdb_recoverd);
1729 struct ctdb_context *ctdb = rec->ctdb;
/* only the recmaster performs takeover runs */
1734 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1738 if (data.dsize != sizeof(uint32_t)) {
1739 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1743 pnn = *(uint32_t *)&data.dptr[0];
1745 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1747 /* Copy any existing list of nodes. There's probably some
1748 * sort of realloc variant that will do this but we need to
1749 * make sure that freeing the old array also cancels the timer
1750 * event for the timeout... not sure if realloc will do that.
1752 len = (rec->force_rebalance_nodes != NULL) ?
1753 talloc_array_length(rec->force_rebalance_nodes) :
1756 /* This allows duplicates to be added but they don't cause
1757 * harm. A call to add a duplicate PNN arguably means that
1758 * the timeout should be reset, so this is the simplest
/* allocate new array one slot longer, copy, then swap */
1761 t = talloc_zero_array(rec, uint32_t, len+1);
1762 CTDB_NO_MEMORY_VOID(ctdb, t);
1764 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1768 talloc_free(rec->force_rebalance_nodes);
1770 rec->force_rebalance_nodes = t;
1775 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1777 struct ctdb_op_state *op_state)
1779 struct ctdb_disable_message *r;
1784 /* Validate input data */
1785 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1786 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1787 "expecting %lu\n", (long unsigned)data.dsize,
1788 (long unsigned)sizeof(struct ctdb_srvid_message)));
1791 if (data.dptr == NULL) {
1792 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1796 r = (struct ctdb_disable_message *)data.dptr;
1797 timeout = r->timeout;
1799 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1804 /* Returning our PNN tells the caller that we succeeded */
1805 ret = ctdb_get_pnn(ctdb);
1807 result.dsize = sizeof(int32_t);
1808 result.dptr = (uint8_t *)&ret;
1809 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
/*
 * SRVID handler: disable takeover runs for the timeout carried in the
 * message and reply to the sender (via srvid_disable_and_reply).
 */
1812 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1815 struct ctdb_recoverd *rec = talloc_get_type(
1816 private_data, struct ctdb_recoverd);
1818 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1821 /* Backward compatibility for this SRVID */
/*
 * Legacy SRVID handler (see "Backward compatibility" note above in the
 * file): payload is a bare uint32_t timeout rather than a
 * ctdb_disable_message, and no reply is sent.
 */
1822 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1825 struct ctdb_recoverd *rec = talloc_get_type(
1826 private_data, struct ctdb_recoverd);
1829 if (data.dsize != sizeof(uint32_t)) {
1830 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1831 "expecting %lu\n", (long unsigned)data.dsize,
1832 (long unsigned)sizeof(uint32_t)));
1835 if (data.dptr == NULL) {
1836 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1840 timeout = *((uint32_t *)data.dptr);
1842 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
/*
 * SRVID handler: disable recoveries for the timeout carried in the
 * message and reply to the sender (via srvid_disable_and_reply).
 */
1845 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1848 struct ctdb_recoverd *rec = talloc_get_type(
1849 private_data, struct ctdb_recoverd);
1851 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1855 handler for ip reallocate, just add it to the list of requests and
1856 handle this later in the monitor_cluster loop so we do not recurse
1857 with other requests to takeover_run()
/*
 * SRVID handler: queue an IP-reallocation request (see the comment
 * block above this function in the file - requests are processed later
 * in the monitor loop to avoid recursing into takeover_run()).
 */
1859 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1862 struct ctdb_srvid_message *request;
1863 struct ctdb_recoverd *rec = talloc_get_type(
1864 private_data, struct ctdb_recoverd);
1866 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1867 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1871 request = (struct ctdb_srvid_message *)data.dptr;
/* append to the pending list; replies go out after the takeover run */
1873 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1876 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1877 struct ctdb_recoverd *rec)
1881 struct srvid_requests *current;
1883 /* Only process requests that are currently pending. More
1884 * might come in while the takeover run is in progress and
1885 * they will need to be processed later since they might
1886 * be in response flag changes.
1888 current = rec->reallocate_requests;
1889 rec->reallocate_requests = NULL;
1891 if (do_takeover_run(rec, rec->nodemap)) {
1892 ret = ctdb_get_pnn(ctdb);
1897 result.dsize = sizeof(int32_t);
1898 result.dptr = (uint8_t *)&ret;
1900 srvid_requests_reply(ctdb, ¤t, result);
1904 * handler for assigning banning credits
/*
 * SRVID handler: assign banning credits to the pnn carried as a
 * uint32_t payload - the culprit count is set to rec->nodemap->num,
 * which is enough to trigger a ban.  Ignored unless we are recmaster.
 */
1906 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1908 struct ctdb_recoverd *rec = talloc_get_type(
1909 private_data, struct ctdb_recoverd);
1912 /* Ignore if we are not recmaster */
1913 if (rec->ctdb->pnn != rec->recmaster) {
1917 if (data.dsize != sizeof(uint32_t)) {
1918 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1923 ban_pnn = *(uint32_t *)data.dptr;
1925 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1929 handler for recovery master elections
/*
 * SRVID handler for election ballots.  Refreshes the election timeout,
 * then either contests the election (schedules a delayed re-broadcast
 * of our own ballot if ours wins) or concedes: cancels any pending
 * broadcast, releases the recovery lock if held, and records the
 * sender as the new recmaster both locally and on the local daemon.
 * NOTE(review): several lines (braces, returns, the conditional before
 * the 500ms/election_timeout ternary) are missing from this extract.
 */
1931 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1933 struct ctdb_recoverd *rec = talloc_get_type(
1934 private_data, struct ctdb_recoverd);
1935 struct ctdb_context *ctdb = rec->ctdb;
1937 struct election_message *em = (struct election_message *)data.dptr;
1939 /* Ignore election packets from ourself */
1940 if (ctdb->pnn == em->pnn) {
1944 /* we got an election packet - update the timeout for the election */
1945 talloc_free(rec->election_timeout);
1946 rec->election_timeout = tevent_add_timer(
1949 timeval_current_ofs(0, 500000) :
1950 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1951 ctdb_election_timeout, rec);
1953 /* someone called an election. check their election data
1954 and if we disagree and we would rather be the elected node,
1955 send a new election message to all other nodes
1957 if (ctdb_election_win(rec, em)) {
1958 if (!rec->send_election_te) {
/* delay our counter-broadcast slightly to batch responses */
1959 rec->send_election_te = tevent_add_timer(
1961 timeval_current_ofs(0, 500000),
1962 election_send_request, rec);
/* concede: stop contesting */
1968 TALLOC_FREE(rec->send_election_te);
1970 /* Release the recovery lock file */
1971 if (ctdb_recovery_have_lock(rec)) {
1972 ctdb_recovery_unlock(rec);
1975 /* ok, let that guy become recmaster then */
1976 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1977 CTDB_CURRENT_NODE, em->pnn);
1979 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1982 rec->recmaster = em->pnn;
1989 force the start of the election process
/*
 * Start a recmaster election: put the cluster into recovery mode to
 * stop internode traffic, arm the election timeout, broadcast our
 * ballot and then block collecting responses (ctdb_wait_election).
 * NOTE(review): error-return lines after the DEBUGs are missing from
 * this extract.
 */
1991 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1992 struct ctdb_node_map_old *nodemap)
1995 struct ctdb_context *ctdb = rec->ctdb;
1997 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1999 /* set all nodes to recovery mode to stop all internode traffic */
2000 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2002 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (re)arm the election timeout before broadcasting */
2006 talloc_free(rec->election_timeout);
2007 rec->election_timeout = tevent_add_timer(
2010 timeval_current_ofs(0, 500000) :
2011 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2012 ctdb_election_timeout, rec);
2014 ret = send_election_request(rec, pnn);
2016 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2020 /* wait for a few seconds to collect all responses */
2021 ctdb_wait_election(rec);
2027 handler for when a node changes its flags
/*
 * SRVID handler for node flag-change notifications
 * (ctdb_node_flag_change payload).  Fetches the current nodemap,
 * locates the node by pnn, logs the change and records the new flags
 * in the local copy.  NOTE(review): returns and some lines are missing
 * from this extract.
 */
2029 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2031 struct ctdb_recoverd *rec = talloc_get_type(
2032 private_data, struct ctdb_recoverd);
2033 struct ctdb_context *ctdb = rec->ctdb;
2035 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2036 struct ctdb_node_map_old *nodemap=NULL;
2037 TALLOC_CTX *tmp_ctx;
2040 if (data.dsize != sizeof(*c)) {
2041 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2045 tmp_ctx = talloc_new(ctdb);
2046 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2048 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2050 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2051 talloc_free(tmp_ctx);
/* linear search for the node this change refers to */
2056 for (i=0;i<nodemap->num;i++) {
2057 if (nodemap->nodes[i].pnn == c->pnn) break;
2060 if (i == nodemap->num) {
2061 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2062 talloc_free(tmp_ctx);
2066 if (c->old_flags != c->new_flags) {
2067 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2070 nodemap->nodes[i].flags = c->new_flags;
2072 talloc_free(tmp_ctx);
2076 handler for when we need to push out flag changes ot all other nodes
/*
 * SRVID handler: push a node's flags out to all connected nodes.
 * Reads the authoritative nodemap from the recmaster, validates that
 * the changed pnn exists in it, then broadcasts a MODIFY_FLAGS control
 * to every connected node.  NOTE(review): returns and some argument
 * lines of the async control call are missing from this extract.
 */
2078 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2081 struct ctdb_recoverd *rec = talloc_get_type(
2082 private_data, struct ctdb_recoverd);
2083 struct ctdb_context *ctdb = rec->ctdb;
2085 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2086 struct ctdb_node_map_old *nodemap=NULL;
2087 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2090 /* read the node flags from the recmaster */
2091 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2094 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn))
2095 talloc_free(tmp_ctx);
/* bounds-check before any nodemap->nodes[c->pnn] style access */
2098 if (c->pnn >= nodemap->num) {
2099 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2100 talloc_free(tmp_ctx);
2104 /* send the flags update to all connected nodes */
2105 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2107 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2108 nodes, 0, CONTROL_TIMEOUT(),
2112 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2114 talloc_free(tmp_ctx);
2118 talloc_free(tmp_ctx);
/* Shared state for the async getrecmode fan-out in verify_recmode().
 * NOTE(review): the `count` member and closing brace are not visible
 * in this extract. */
2122 struct verify_recmode_normal_data {
/* aggregated result; downgraded from MONITOR_OK by the callback */
2124 enum monitor_result status;
/*
 * Async callback for one node's getrecmode reply.  Marks the shared
 * status MONITOR_FAILED on control failure, or
 * MONITOR_RECOVERY_NEEDED if the node reports it is in recovery mode.
 * NOTE(review): the count decrement and returns are missing from this
 * extract.
 */
2127 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2129 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2132 /* one more node has responded with recmode data*/
2135 /* if we failed to get the recmode, then return an error and let
2136 the main loop try again.
2138 if (state->state != CTDB_CONTROL_DONE) {
/* only downgrade OK - don't mask an earlier, stronger verdict */
2139 if (rmdata->status == MONITOR_OK) {
2140 rmdata->status = MONITOR_FAILED;
2145 /* if we got a response, then the recmode will be stored in the
2148 if (state->status != CTDB_RECOVERY_NORMAL) {
2149 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2150 rmdata->status = MONITOR_RECOVERY_NEEDED;
2157 /* verify that all nodes are in normal recovery mode */
/*
 * Fan out an async getrecmode control to every active node and pump
 * the event loop until all replies are in.  Returns the aggregated
 * monitor_result (OK, FAILED, or RECOVERY_NEEDED via the callback).
 * NOTE(review): loop braces, the count initialisation/increment and
 * the final return are missing from this extract.
 */
2158 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2160 struct verify_recmode_normal_data *rmdata;
2161 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2162 struct ctdb_client_control_state *state;
2163 enum monitor_result status;
2166 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2167 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2169 rmdata->status = MONITOR_OK;
2171 /* loop over all active nodes and send an async getrecmode call to
2173 for (j=0; j<nodemap->num; j++) {
2174 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2177 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2179 nodemap->nodes[j].pnn);
2180 if (state == NULL) {
2181 /* we failed to send the control, treat this as
2182 an error and try again next iteration
2184 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2185 talloc_free(mem_ctx);
2186 return MONITOR_FAILED;
2189 /* set up the callback functions */
2190 state->async.fn = verify_recmode_normal_callback;
2191 state->async.private_data = rmdata;
2193 /* one more control to wait for to complete */
2198 /* now wait for up to the maximum number of seconds allowed
2199 or until all nodes we expect a response from has replied
2201 while (rmdata->count > 0) {
2202 tevent_loop_once(ctdb->ev);
/* copy out before freeing the context rmdata lives on */
2205 status = rmdata->status;
2206 talloc_free(mem_ctx);
/* Shared state for the async getrecmaster fan-out in
 * verify_recmaster().  NOTE(review): `count`, `pnn` and the closing
 * brace are not visible in this extract. */
2211 struct verify_recmaster_data {
2212 struct ctdb_recoverd *rec;
/* aggregated result; downgraded from MONITOR_OK by the callback */
2215 enum monitor_result status;
/*
 * Async callback for one node's getrecmaster reply.  Marks the shared
 * status MONITOR_FAILED on control failure; if the node disagrees
 * about who the recmaster is, records that node as a culprit and
 * requests a new election (MONITOR_ELECTION_NEEDED).
 * NOTE(review): the count decrement and returns are missing from this
 * extract.
 */
2218 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2220 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2223 /* one more node has responded with recmaster data*/
2226 /* if we failed to get the recmaster, then return an error and let
2227 the main loop try again.
2229 if (state->state != CTDB_CONTROL_DONE) {
2230 if (rmdata->status == MONITOR_OK) {
2231 rmdata->status = MONITOR_FAILED;
2236 /* if we got a response, then the recmaster will be stored in the
2239 if (state->status != rmdata->pnn) {
2240 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2241 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2242 rmdata->status = MONITOR_ELECTION_NEEDED;
2249 /* verify that all nodes agree that we are the recmaster */
/*
 * Verify that all other active nodes agree we (pnn) are the recmaster:
 * fan out async getrecmaster controls (skipping ourself and inactive
 * nodes) and pump the event loop until all replies are in.  Returns
 * the aggregated monitor_result.  NOTE(review): loop braces, count
 * bookkeeping and the final return are missing from this extract.
 */
2250 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2252 struct ctdb_context *ctdb = rec->ctdb;
2253 struct verify_recmaster_data *rmdata;
2254 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2255 struct ctdb_client_control_state *state;
2256 enum monitor_result status;
2259 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2260 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2264 rmdata->status = MONITOR_OK;
2266 /* loop over all active nodes and send an async getrecmaster call to
2268 for (j=0; j<nodemap->num; j++) {
/* no need to ask the recmaster (ourself) */
2269 if (nodemap->nodes[j].pnn == rec->recmaster) {
2272 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2275 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2277 nodemap->nodes[j].pnn);
2278 if (state == NULL) {
2279 /* we failed to send the control, treat this as
2280 an error and try again next iteration
2282 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2283 talloc_free(mem_ctx);
2284 return MONITOR_FAILED;
2287 /* set up the callback functions */
2288 state->async.fn = verify_recmaster_callback;
2289 state->async.private_data = rmdata;
2291 /* one more control to wait for to complete */
2296 /* now wait for up to the maximum number of seconds allowed
2297 or until all nodes we expect a response from has replied
2299 while (rmdata->count > 0) {
2300 tevent_loop_once(ctdb->ev);
/* copy out before freeing the context rmdata lives on */
2303 status = rmdata->status;
2304 talloc_free(mem_ctx);
/*
 * Compare the local node's current interface list against the copy
 * cached on rec->ifaces.  Treated as "changed" when: fetching fails
 * (conservative default), this is the first fetch, the interface count
 * differs, or any slot's name or link state differs.  On every call
 * the cached copy is replaced with the fresh list (talloc_steal).
 * NOTE(review): the `ret` flag assignments, some braces and the final
 * return are missing from this extract.
 */
2308 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2309 struct ctdb_recoverd *rec)
2311 struct ctdb_iface_list_old *ifaces = NULL;
2312 TALLOC_CTX *mem_ctx;
2315 mem_ctx = talloc_new(NULL);
2317 /* Read the interfaces from the local node */
2318 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2319 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2320 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2321 /* We could return an error. However, this will be
2322 * rare so we'll decide that the interfaces have
2323 * actually changed, just in case.
2325 talloc_free(mem_ctx);
2330 /* We haven't been here before so things have changed */
2331 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2333 } else if (rec->ifaces->num != ifaces->num) {
2334 /* Number of interfaces has changed */
2335 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2336 rec->ifaces->num, ifaces->num));
2339 /* See if interface names or link states have changed */
2341 for (i = 0; i < rec->ifaces->num; i++) {
2342 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2343 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2345 ("Interface in slot %d changed: %s => %s\n",
2346 i, iface->name, ifaces->ifaces[i].name));
2350 if (iface->link_state != ifaces->ifaces[i].link_state) {
2352 ("Interface %s changed state: %d => %d\n",
2353 iface->name, iface->link_state,
2354 ifaces->ifaces[i].link_state));
/* always refresh the cached copy, changed or not */
2361 talloc_free(rec->ifaces);
2362 rec->ifaces = talloc_steal(rec, ifaces);
2364 talloc_free(mem_ctx);
2368 /* Check that the local allocation of public IP addresses is correct
2369 * and do some house-keeping */
/*
 * Periodic housekeeping for this node's public IP allocation.
 * Non-recmaster nodes drop queued reallocate requests and forced
 * rebalance lists.  Otherwise: skip if failover is disabled; detect
 * interface changes; flag unhosted IPs this healthy node could serve;
 * and (when do_checkpublicip is set) compare CTDB's view of assigned
 * IPs against what is actually on the interfaces.  If anything is
 * inconsistent, message the recmaster to trigger a takeover run.
 * NOTE(review): braces, returns, the `rd` initialisation and `else`
 * lines are missing from this extract.
 */
2370 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2371 struct ctdb_recoverd *rec,
2373 struct ctdb_node_map_old *nodemap)
2375 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2377 bool need_takeover_run = false;
2378 struct ctdb_public_ip_list_old *ips = NULL;
2380 /* If we are not the recmaster then do some housekeeping */
2381 if (rec->recmaster != pnn) {
2382 /* Ignore any IP reallocate requests - only recmaster
2385 TALLOC_FREE(rec->reallocate_requests);
2386 /* Clear any nodes that should be force rebalanced in
2387 * the next takeover run. If the recovery master role
2388 * has moved then we don't want to process these some
2389 * time in the future.
2391 TALLOC_FREE(rec->force_rebalance_nodes);
2394 /* Return early if disabled... */
2395 if (ctdb_config.failover_disabled ||
2396 ctdb_op_is_disabled(rec->takeover_run)) {
2400 if (interfaces_have_changed(ctdb, rec)) {
2401 need_takeover_run = true;
2404 /* If there are unhosted IPs but this node can host them then
2405 * trigger an IP reallocation */
2407 /* Read *available* IPs from local node */
2408 ret = ctdb_ctrl_get_public_ips_flags(
2409 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2410 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2412 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2413 talloc_free(mem_ctx);
/* pnn == -1 means the IP is currently unhosted; flags == 0 means
 * this node is healthy enough to take it */
2417 for (j=0; j<ips->num; j++) {
2418 if (ips->ips[j].pnn == -1 &&
2419 nodemap->nodes[pnn].flags == 0) {
2420 DEBUG(DEBUG_WARNING,
2421 ("Unassigned IP %s can be served by this node\n",
2422 ctdb_addr_to_str(&ips->ips[j].addr)));
2423 need_takeover_run = true;
2429 if (!ctdb->do_checkpublicip) {
2433 /* Validate the IP addresses that this node has on network
2434 * interfaces. If there is an inconsistency between reality
2435 * and the state expected by CTDB then try to fix it by
2436 * triggering an IP reallocation or releasing extraneous IP
2439 /* Read *known* IPs from local node */
2440 ret = ctdb_ctrl_get_public_ips_flags(
2441 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2443 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2444 talloc_free(mem_ctx);
2448 for (j=0; j<ips->num; j++) {
2449 if (ips->ips[j].pnn == pnn) {
/* CTDB thinks we host it - verify it is really configured */
2450 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2452 ("Assigned IP %s not on an interface\n",
2453 ctdb_addr_to_str(&ips->ips[j].addr)));
2454 need_takeover_run = true;
/* CTDB thinks we do NOT host it - verify it is absent */
2457 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2459 ("IP %s incorrectly on an interface\n",
2460 ctdb_addr_to_str(&ips->ips[j].addr)));
2461 need_takeover_run = true;
2467 if (need_takeover_run) {
2468 struct ctdb_srvid_message rd;
2471 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2476 data.dptr = (uint8_t *)&rd;
2477 data.dsize = sizeof(rd);
2479 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2482 ("Failed to send takeover run request\n"));
2485 talloc_free(mem_ctx);
/*
 * Async callback for GET_NODEMAP: stash node_pnn's nodemap into the
 * remote_nodemaps array (passed as callback_data), taking talloc
 * ownership of the reply buffer.  Out-of-range pnns are rejected to
 * avoid indexing past the array.
 */
2490 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2492 struct ctdb_node_map_old **remote_nodemaps = callback_data;
2494 if (node_pnn >= ctdb->num_nodes) {
2495 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2499 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Pull the nodemap from every active node in parallel; replies are
 * collected into remote_nodemaps[] by async_getnodemap_callback.
 * NOTE(review): the success return and some argument lines of the
 * async control call are missing from this extract.
 */
2503 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2504 struct ctdb_node_map_old *nodemap,
2505 struct ctdb_node_map_old **remote_nodemaps)
2509 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2510 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2512 CONTROL_TIMEOUT(), false, tdb_null,
2513 async_getnodemap_callback,
2515 remote_nodemaps) != 0) {
2516 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/*
 * Sanity-check the current recmaster and force a new election when it
 * is invalid: recmaster still unknown (startup), recmaster lacks
 * CTDB_CAP_RECMASTER while we have it, recmaster pnn is out of range
 * (deleted node), recmaster is disconnected/deleted, or the recmaster
 * itself reports it is inactive.  Presumably returns false whenever an
 * election was forced or the recmaster could not be queried, so the
 * caller restarts its loop - TODO confirm; the return statements are
 * missing from this extract.
 */
2524 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2525 TALLOC_CTX *mem_ctx)
2527 struct ctdb_context *ctdb = rec->ctdb;
2528 uint32_t pnn = ctdb_get_pnn(ctdb);
2529 struct ctdb_node_map_old *nodemap = rec->nodemap;
2530 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2533 /* When recovery daemon is started, recmaster is set to
2534 * "unknown" so it knows to start an election.
2536 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2538 ("Initial recovery master set - forcing election\n"));
2539 force_election(rec, pnn, nodemap);
2544 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2545 * but we have, then force an election and try to become the new
2548 if (!ctdb_node_has_capabilities(rec->caps,
2550 CTDB_CAP_RECMASTER) &&
2551 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2552 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2554 (" Current recmaster node %u does not have CAP_RECMASTER,"
2555 " but we (node %u) have - force an election\n",
2556 rec->recmaster, pnn));
2557 force_election(rec, pnn, nodemap);
2561 /* Verify that the master node has not been deleted. This
2562 * should not happen because a node should always be shutdown
2563 * before being deleted, causing a new master to be elected
2564 * before now. However, if something strange has happened
2565 * then checking here will ensure we don't index beyond the
2566 * end of the nodemap array. */
2567 if (rec->recmaster >= nodemap->num) {
2569 ("Recmaster node %u has been deleted. Force election\n",
2571 force_election(rec, pnn, nodemap);
2575 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2576 if (nodemap->nodes[rec->recmaster].flags &
2577 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2579 ("Recmaster node %u is disconnected/deleted. Force election\n",
2581 force_election(rec, pnn, nodemap);
2585 /* get nodemap from the recovery master to check if it is inactive */
2586 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2587 mem_ctx, &recmaster_nodemap);
2591 " Unable to get nodemap from recovery master %u\n",
2593 /* No election, just error */
/* an inactive recmaster that we still consider active: take over,
 * but first adopt its own view of its flags (see comment below) */
2598 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2599 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2601 ("Recmaster node %u is inactive. Force election\n",
2604 * update our nodemap to carry the recmaster's notion of
2605 * its own flags, so that we don't keep freezing the
2606 * inactive recmaster node...
2608 nodemap->nodes[rec->recmaster].flags =
2609 recmaster_nodemap->nodes[rec->recmaster].flags;
2610 force_election(rec, pnn, nodemap);
2617 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2618 TALLOC_CTX *mem_ctx)
2621 struct ctdb_node_map_old *nodemap=NULL;
2622 struct ctdb_node_map_old **remote_nodemaps=NULL;
2623 struct ctdb_vnn_map *vnnmap=NULL;
2624 struct ctdb_vnn_map *remote_vnnmap=NULL;
2625 uint32_t num_lmasters;
2626 int32_t debug_level;
2631 /* verify that the main daemon is still running */
2632 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2633 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2637 /* ping the local daemon to tell it we are alive */
2638 ctdb_ctrl_recd_ping(ctdb);
2640 if (rec->election_timeout) {
2641 /* an election is in progress */
2645 /* read the debug level from the parent and update locally */
2646 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2648 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2651 debuglevel_set(debug_level);
2653 /* get relevant tunables */
2654 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2656 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2661 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2662 CTDB_CURRENT_NODE, &ctdb->runstate);
2664 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2668 pnn = ctdb_get_pnn(ctdb);
2671 TALLOC_FREE(rec->nodemap);
2672 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2674 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2677 nodemap = rec->nodemap;
2679 /* remember our own node flags */
2680 rec->node_flags = nodemap->nodes[pnn].flags;
2682 ban_misbehaving_nodes(rec, &self_ban);
2684 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2688 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2689 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2691 D_ERR("Failed to read recmode from local node\n");
2695 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2696 also frozen and that the recmode is set to active.
2698 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2699 /* If this node has become inactive then we want to
2700 * reduce the chances of it taking over the recovery
2701 * master role when it becomes active again. This
2702 * helps to stabilise the recovery master role so that
2703 * it stays on the most stable node.
2705 rec->priority_time = timeval_current();
2707 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2708 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2710 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2712 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2717 if (! rec->frozen_on_inactive) {
2718 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2722 (__location__ " Failed to freeze node "
2723 "in STOPPED or BANNED state\n"));
2727 rec->frozen_on_inactive = true;
2730 /* If this node is stopped or banned then it is not the recovery
2731 * master, so don't do anything. This prevents stopped or banned
2732 * node from starting election and sending unnecessary controls.
2737 rec->frozen_on_inactive = false;
2739 /* Retrieve capabilities from all connected nodes */
2740 ret = update_capabilities(rec, nodemap);
2742 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2746 if (! validate_recovery_master(rec, mem_ctx)) {
2750 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2751 /* Check if an IP takeover run is needed and trigger one if
2753 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2756 /* if we are not the recmaster then we do not need to check
2757 if recovery is needed
2759 if (pnn != rec->recmaster) {
2764 /* ensure our local copies of flags are right */
2765 ret = update_local_flags(rec, nodemap);
2767 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2771 if (ctdb->num_nodes != nodemap->num) {
2772 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2773 ctdb_load_nodes_file(ctdb);
2777 /* verify that all active nodes agree that we are the recmaster */
2778 switch (verify_recmaster(rec, nodemap, pnn)) {
2779 case MONITOR_RECOVERY_NEEDED:
2780 /* can not happen */
2782 case MONITOR_ELECTION_NEEDED:
2783 force_election(rec, pnn, nodemap);
2787 case MONITOR_FAILED:
2792 /* get the vnnmap */
2793 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2795 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2799 if (rec->need_recovery) {
2800 /* a previous recovery didn't finish */
2801 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2805 /* verify that all active nodes are in normal mode
2806 and not in recovery mode
2808 switch (verify_recmode(ctdb, nodemap)) {
2809 case MONITOR_RECOVERY_NEEDED:
2810 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2812 case MONITOR_FAILED:
2814 case MONITOR_ELECTION_NEEDED:
2815 /* can not happen */
2821 if (ctdb->recovery_lock != NULL) {
2822 /* We must already hold the recovery lock */
2823 if (!ctdb_recovery_have_lock(rec)) {
2824 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2825 ctdb_set_culprit(rec, ctdb->pnn);
2826 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2832 /* If recoveries are disabled then there is no use doing any
2833 * nodemap or flags checks. Recoveries might be disabled due
2834 * to "reloadnodes", so doing these checks might cause an
2835 * unnecessary recovery. */
2836 if (ctdb_op_is_disabled(rec->recovery)) {
2837 goto takeover_run_checks;
2840 /* get the nodemap for all active remote nodes
2842 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2843 if (remote_nodemaps == NULL) {
2844 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2847 for(i=0; i<nodemap->num; i++) {
2848 remote_nodemaps[i] = NULL;
2850 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2851 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2855 /* verify that all other nodes have the same nodemap as we have
2857 for (j=0; j<nodemap->num; j++) {
2858 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2862 if (remote_nodemaps[j] == NULL) {
2863 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2864 ctdb_set_culprit(rec, j);
2869 /* if the nodes disagree on how many nodes there are
2870 then this is a good reason to try recovery
2872 if (remote_nodemaps[j]->num != nodemap->num) {
2873 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2874 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2875 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2876 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2880 /* if the nodes disagree on which nodes exist and are
2881 active, then that is also a good reason to do recovery
2883 for (i=0;i<nodemap->num;i++) {
2884 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2885 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2886 nodemap->nodes[j].pnn, i,
2887 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2888 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2889 do_recovery(rec, mem_ctx, pnn, nodemap,
2897 * Update node flags obtained from each active node. This ensure we have
2898 * up-to-date information for all the nodes.
2900 for (j=0; j<nodemap->num; j++) {
2901 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2904 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2907 for (j=0; j<nodemap->num; j++) {
2908 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2912 /* verify the flags are consistent
2914 for (i=0; i<nodemap->num; i++) {
2915 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2919 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2920 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2921 nodemap->nodes[j].pnn,
2922 nodemap->nodes[i].pnn,
2923 remote_nodemaps[j]->nodes[i].flags,
2924 nodemap->nodes[i].flags));
2926 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2927 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2928 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2929 do_recovery(rec, mem_ctx, pnn, nodemap,
2933 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2934 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2935 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2936 do_recovery(rec, mem_ctx, pnn, nodemap,
2945 /* count how many active nodes there are */
2947 for (i=0; i<nodemap->num; i++) {
2948 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2949 if (ctdb_node_has_capabilities(rec->caps,
2950 ctdb->nodes[i]->pnn,
2951 CTDB_CAP_LMASTER)) {
2958 /* There must be the same number of lmasters in the vnn map as
2959 * there are active nodes with the lmaster capability... or
2962 if (vnnmap->size != num_lmasters) {
2963 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2964 vnnmap->size, num_lmasters));
2965 ctdb_set_culprit(rec, ctdb->pnn);
2966 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2970 /* verify that all active nodes in the nodemap also exist in
2973 for (j=0; j<nodemap->num; j++) {
2974 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2977 if (nodemap->nodes[j].pnn == pnn) {
2981 for (i=0; i<vnnmap->size; i++) {
2982 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2986 if (i == vnnmap->size) {
2987 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2988 nodemap->nodes[j].pnn));
2989 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2990 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2996 /* verify that all other nodes have the same vnnmap
2997 and are from the same generation
2999 for (j=0; j<nodemap->num; j++) {
3000 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3003 if (nodemap->nodes[j].pnn == pnn) {
3007 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3008 mem_ctx, &remote_vnnmap);
3010 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3011 nodemap->nodes[j].pnn));
3015 /* verify the vnnmap generation is the same */
3016 if (vnnmap->generation != remote_vnnmap->generation) {
3017 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3018 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3019 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3020 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3024 /* verify the vnnmap size is the same */
3025 if (vnnmap->size != remote_vnnmap->size) {
3026 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3027 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3028 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3029 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3033 /* verify the vnnmap is the same */
3034 for (i=0;i<vnnmap->size;i++) {
3035 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3036 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3037 nodemap->nodes[j].pnn));
3038 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3039 do_recovery(rec, mem_ctx, pnn, nodemap,
3046 /* FIXME: Add remote public IP checking to ensure that nodes
3047 * have the IP addresses that are allocated to them. */
3049 takeover_run_checks:
3051 /* If there are IP takeover runs requested or the previous one
3052 * failed then perform one and notify the waiters */
3053 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3054 (rec->reallocate_requests || rec->need_takeover_run)) {
3055 process_ipreallocate_requests(ctdb, rec);
/* SIGTERM handler for the recovery daemon.
 *
 * Releases the cluster-wide recovery lock (if this node holds it) so
 * that another node can acquire it, then the daemon exits ("Received
 * SIGTERM, exiting").  NOTE(review): the actual exit call falls on
 * lines not visible in this extract.
 */
3059 static void recd_sig_term_handler(struct tevent_context *ev,
3060 struct tevent_signal *se, int signum,
3061 int count, void *dont_care,
3064 struct ctdb_recoverd *rec = talloc_get_type_abort(
3065 private_data, struct ctdb_recoverd);
3067 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
/* Give up the recovery lock before dying so it is not left dangling. */
3068 ctdb_recovery_unlock(rec);
3074 the main monitoring loop: set up recovery-daemon state, register all
  SRVID message handlers, then repeatedly run main_loop(), rate-limited
  by the RecoverInterval tunable.  This function does not return.
3076 static void monitor_cluster(struct ctdb_context *ctdb)
3078 struct tevent_signal *se;
3079 struct ctdb_recoverd *rec;
3081 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
/* Per-daemon recovery state hangs off the ctdb context; allocation
 * failure here is fatal. */
3083 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3084 CTDB_NO_MEMORY_FATAL(ctdb, rec);
/* No recovery master is known yet and no recovery lock is held. */
3087 rec->recmaster = CTDB_UNKNOWN_PNN;
3088 rec->recovery_lock_handle = NULL;
/* Operation trackers that let takeover runs and recoveries be
 * temporarily disabled via the SRVID handlers registered below. */
3090 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3091 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3093 rec->recovery = ctdb_op_init(rec, "recoveries");
3094 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3096 rec->priority_time = timeval_current();
3097 rec->frozen_on_inactive = false;
/* Ensure the recovery lock is released if we are terminated. */
3099 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3100 recd_sig_term_handler, rec);
3102 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3106 /* register a message port for sending memory dumps */
3107 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3109 /* when a node is assigned banning credits */
3110 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3111 banning_handler, rec);
3113 /* register a message port for recovery elections */
3114 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3116 /* when nodes are disabled/enabled */
3117 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3119 /* when we are asked to push out a flag change */
3120 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3122 /* register a message port for vacuum fetch */
3123 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3125 /* register a message port for reloadnodes */
3126 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3128 /* register a message port for performing a takeover run */
3129 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3131 /* register a message port for disabling the ip check for a short while */
3132 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3134 /* register a message port for forcing a rebalance of a node next
3136 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3138 /* Register a message port for disabling takeover runs */
3139 ctdb_client_set_message_handler(ctdb,
3140 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3141 disable_takeover_runs_handler, rec);
3143 /* Register a message port for disabling recoveries */
3144 ctdb_client_set_message_handler(ctdb,
3145 CTDB_SRVID_DISABLE_RECOVERIES,
3146 disable_recoveries_handler, rec);
3148 /* register a message port for detaching database */
3149 ctdb_client_set_message_handler(ctdb,
3150 CTDB_SRVID_DETACH_DATABASE,
3151 detach_database_handler, rec);
/* Monitoring loop body: one throw-away talloc context per pass, so
 * anything main_loop() allocates under it is freed each iteration. */
3154 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3155 struct timeval start;
3159 DEBUG(DEBUG_CRIT,(__location__
3160 " Failed to create temp context\n"));
3164 start = timeval_current();
3165 main_loop(ctdb, rec, mem_ctx);
3166 talloc_free(mem_ctx);
3168 /* rate-limit: run at most one monitoring pass per
 * RecoverInterval seconds; sleep off any remainder */
3169 elapsed = timeval_elapsed(&start);
3170 if (elapsed < ctdb->tunable.recover_interval) {
3171 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3178 event handler for when the main ctdbd dies: the recovery daemon holds
  the read end of a pipe to its parent; this fires when that fd becomes
  readable (EOF on parent exit), and the recovery daemon exits too.
3180 static void ctdb_recoverd_parent(struct tevent_context *ev,
3181 struct tevent_fd *fde,
3182 uint16_t flags, void *private_data)
3184 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3189 called regularly to verify that the recovery daemon is still running;
  if it has died, schedule an immediate restart, then re-arm this check
  to fire again in 30 seconds.
3191 static void ctdb_check_recd(struct tevent_context *ev,
3192 struct tevent_timer *te,
3193 struct timeval yt, void *p)
3195 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* Signal 0 probes process existence without delivering a signal. */
3197 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3198 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
/* Restart from the event loop (zero timeout) rather than inline. */
3200 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3201 ctdb_restart_recd, ctdb);
/* Re-arm the watchdog; allocated on recd_ctx so stopping the
 * recovery daemon also cancels this timer. */
3206 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3207 timeval_current_ofs(30, 0),
3208 ctdb_check_recd, ctdb);
/* SIGCHLD handler inside the recovery daemon: reap exited children
 * with non-blocking waitpid() so they do not linger as zombies.
 * NOTE(review): the enclosing reap loop is only partially visible in
 * this extract. */
3211 static void recd_sig_child_handler(struct tevent_context *ev,
3212 struct tevent_signal *se, int signum,
3213 int count, void *dont_care,
3216 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3221 pid = waitpid(-1, &status, WNOHANG);
/* ECHILD simply means there is nothing left to reap — not an error. */
3223 if (errno != ECHILD) {
3224 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3229 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3235 startup the recovery daemon as a child of the main ctdb daemon.
  Parent: records the child pid, arms the 30-second liveness watchdog
  (ctdb_check_recd) and returns.  Child: re-initialises logging,
  switches into client mode, watches the parent-death pipe, and runs
  monitor_cluster() forever.  Returns 0 on success in the parent;
  error paths are on lines not visible in this extract.
3237 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3240 struct tevent_signal *se;
3241 struct tevent_fd *fde;
/* Pipe used by the child to detect parent death: the child holds the
 * read end, which becomes readable (EOF) when the parent exits. */
3244 if (pipe(fd) != 0) {
3248 ctdb->recoverd_pid = ctdb_fork(ctdb);
3249 if (ctdb->recoverd_pid == -1) {
/* ---- parent process ---- */
3253 if (ctdb->recoverd_pid != 0) {
/* Fresh context for the watchdog timer; freeing it (see
 * ctdb_stop_recoverd) cancels any pending check. */
3254 talloc_free(ctdb->recd_ctx);
3255 ctdb->recd_ctx = talloc_new(ctdb);
3256 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3259 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3260 timeval_current_ofs(30, 0),
3261 ctdb_check_recd, ctdb);
/* ---- child process: becomes the recovery daemon ---- */
/* Reseed the PRNG so the child does not share the parent's stream. */
3267 srandom(getpid() ^ time(NULL));
3269 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
/* Set a recognisable process name for ps/top. */
3274 prctl_set_comment("ctdb_recoverd");
3275 if (switch_from_server_to_client(ctdb) != 0) {
3276 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3280 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* Exit when the parent-death pipe becomes readable. */
3282 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3283 ctdb_recoverd_parent, &fd[0]);
3284 tevent_fd_set_auto_close(fde);
3286 /* set up a handler to pick up sigchld */
3287 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3288 recd_sig_child_handler, ctdb);
3290 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* Never returns in normal operation. */
3294 monitor_cluster(ctdb);
3296 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3301 shutdown the recovery daemon: send it SIGTERM (which triggers
  recd_sig_term_handler in the child) and tear down the parent-side
  watchdog state.  A no-op if no recovery daemon was ever started.
3303 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
/* recoverd_pid == 0 means no recovery daemon to stop. */
3305 if (ctdb->recoverd_pid == 0) {
3309 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3310 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
/* Freeing recd_ctx cancels the ctdb_check_recd watchdog timer. */
3312 TALLOC_FREE(ctdb->recd_ctx);
3313 TALLOC_FREE(ctdb->recd_ping_count);
/* Timer callback scheduled by ctdb_check_recd() when the recovery
 * daemon is found dead: stop any remaining daemon state, then fork a
 * fresh recovery daemon. */
3316 static void ctdb_restart_recd(struct tevent_context *ev,
3317 struct tevent_timer *te,
3318 struct timeval t, void *private_data)
3320 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3322 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3323 ctdb_stop_recoverd(ctdb);
3324 ctdb_start_recoverd(ctdb);