   Copyright (C) Ronnie Sahlberg 2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "ctdb_cluster_mutex.h"
/* List of SRVID requests that need to be processed */
struct srvid_list {
	struct srvid_list *next, *prev;	/* DLIST linkage */
	struct ctdb_srvid_message *request;
};

/* Head of a queue of deferred SRVID requests */
struct srvid_requests {
	struct srvid_list *requests;
};
57 static void srvid_request_reply(struct ctdb_context *ctdb,
58 struct ctdb_srvid_message *request,
61 /* Someone that sent srvid==0 does not want a reply */
62 if (request->srvid == 0) {
67 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
69 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
70 (unsigned)request->pnn,
71 (unsigned long long)request->srvid));
73 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
74 (unsigned)request->pnn,
75 (unsigned long long)request->srvid));
81 static void srvid_requests_reply(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
87 if (*requests == NULL) {
91 for (r = (*requests)->requests; r != NULL; r = r->next) {
92 srvid_request_reply(ctdb, r->request, result);
95 /* Free the list structure... */
96 TALLOC_FREE(*requests);
99 static void srvid_request_add(struct ctdb_context *ctdb,
100 struct srvid_requests **requests,
101 struct ctdb_srvid_message *request)
103 struct srvid_list *t;
107 if (*requests == NULL) {
108 *requests = talloc_zero(ctdb, struct srvid_requests);
109 if (*requests == NULL) {
114 t = talloc_zero(*requests, struct srvid_list);
116 /* If *requests was just allocated above then free it */
117 if ((*requests)->requests == NULL) {
118 TALLOC_FREE(*requests);
123 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
124 DLIST_ADD((*requests)->requests, t);
129 /* Failed to add the request to the list. Send a fail. */
130 DEBUG(DEBUG_ERR, (__location__
131 " Out of memory, failed to queue SRVID request\n"));
133 result.dsize = sizeof(ret);
134 result.dptr = (uint8_t *)&ret;
135 srvid_request_reply(ctdb, request, result);
/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
	struct tevent_timer *timer;	/* non-NULL while the op is disabled */
	bool in_progress;		/* an instance of the op is running */
	const char *name;		/* human-readable name for log messages */
};
146 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
148 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
151 state->in_progress = false;
158 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
160 return state->timer != NULL;
163 static bool ctdb_op_begin(struct ctdb_op_state *state)
165 if (ctdb_op_is_disabled(state)) {
167 ("Unable to begin - %s are disabled\n", state->name));
171 state->in_progress = true;
175 static bool ctdb_op_end(struct ctdb_op_state *state)
177 return state->in_progress = false;
180 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
182 return state->in_progress;
185 static void ctdb_op_enable(struct ctdb_op_state *state)
187 TALLOC_FREE(state->timer);
190 static void ctdb_op_timeout_handler(struct tevent_context *ev,
191 struct tevent_timer *te,
192 struct timeval yt, void *p)
194 struct ctdb_op_state *state =
195 talloc_get_type(p, struct ctdb_op_state);
197 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
198 ctdb_op_enable(state);
201 static int ctdb_op_disable(struct ctdb_op_state *state,
202 struct tevent_context *ev,
206 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
207 ctdb_op_enable(state);
211 if (state->in_progress) {
213 ("Unable to disable %s - in progress\n", state->name));
217 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
218 state->name, timeout));
220 /* Clear any old timers */
221 talloc_free(state->timer);
223 /* Arrange for the timeout to occur */
224 state->timer = tevent_add_timer(ev, state,
225 timeval_current_ofs(timeout, 0),
226 ctdb_op_timeout_handler, state);
227 if (state->timer == NULL) {
228 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
/* Per-node banning-credit accounting used to decide when to ban a
 * misbehaving node. */
struct ctdb_banning_state {
	uint32_t count;				/* accumulated banning credits */
	struct timeval last_reported_time;	/* last time credits were added */
};
/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t recmaster;		/* current recovery master pnn */
	uint32_t last_culprit_node;
	struct ctdb_node_map_old *nodemap;
	struct timeval priority_time;	/* when this daemon started - election tiebreaker */
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;		/* this node's flags from the nodemap */
	struct tevent_timer *send_election_te;
	struct tevent_timer *election_timeout;	/* non-NULL while an election is running */
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;
	struct ctdb_op_state *recovery;
	struct ctdb_iface_list_old *ifaces;
	uint32_t *force_rebalance_nodes;
	struct ctdb_node_capabilities *caps;
	bool frozen_on_inactive;
	struct ctdb_cluster_mutex_handle *recovery_lock_handle;
};
/* Timeouts used for controls sent by the recovery daemon; both expand
 * an implicit local `ctdb` variable at the point of use. */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te, struct timeval t,
			      void *private_data);
272 ban a node for a period of time
274 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
277 struct ctdb_context *ctdb = rec->ctdb;
278 struct ctdb_ban_state bantime;
280 if (!ctdb_validate_pnn(ctdb, pnn)) {
281 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
285 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
288 bantime.time = ban_time;
290 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
292 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
298 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
302 remember the trouble maker
304 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
306 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
307 struct ctdb_banning_state *ban_state;
309 if (culprit > ctdb->num_nodes) {
310 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
314 /* If we are banned or stopped, do not set other nodes as culprits */
315 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
316 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
320 if (ctdb->nodes[culprit]->ban_state == NULL) {
321 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
322 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
326 ban_state = ctdb->nodes[culprit]->ban_state;
327 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
328 /* this was the first time in a long while this node
329 misbehaved so we will forgive any old transgressions.
331 ban_state->count = 0;
334 ban_state->count += count;
335 ban_state->last_reported_time = timeval_current();
336 rec->last_culprit_node = culprit;
/*
  remember the trouble maker: convenience wrapper adding one credit
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
348 Retrieve capabilities from all connected nodes
350 static int update_capabilities(struct ctdb_recoverd *rec,
351 struct ctdb_node_map_old *nodemap)
355 struct ctdb_node_capabilities *caps;
356 struct ctdb_context *ctdb = rec->ctdb;
358 tmp_ctx = talloc_new(rec);
359 CTDB_NO_MEMORY(ctdb, tmp_ctx);
361 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
362 CONTROL_TIMEOUT(), nodemap);
366 (__location__ " Failed to get node capabilities\n"));
367 talloc_free(tmp_ctx);
371 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
375 " Capabilities don't include current node.\n"));
376 talloc_free(tmp_ctx);
379 ctdb->capabilities = *capp;
381 TALLOC_FREE(rec->caps);
382 rec->caps = talloc_steal(rec, caps);
384 talloc_free(tmp_ctx);
389 change recovery mode on all nodes
391 static int set_recovery_mode(struct ctdb_context *ctdb,
392 struct ctdb_recoverd *rec,
393 struct ctdb_node_map_old *nodemap,
400 tmp_ctx = talloc_new(ctdb);
401 CTDB_NO_MEMORY(ctdb, tmp_ctx);
403 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
405 data.dsize = sizeof(uint32_t);
406 data.dptr = (unsigned char *)&rec_mode;
408 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
414 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
415 talloc_free(tmp_ctx);
419 talloc_free(tmp_ctx);
424 ensure all other nodes have attached to any databases that we have
426 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
427 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
430 struct ctdb_dbid_map_old *remote_dbmap;
432 /* verify that all other nodes have all our databases */
433 for (j=0; j<nodemap->num; j++) {
434 /* we don't need to ourself ourselves */
435 if (nodemap->nodes[j].pnn == pnn) {
438 /* don't check nodes that are unavailable */
439 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
443 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
444 mem_ctx, &remote_dbmap);
446 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
450 /* step through all local databases */
451 for (db=0; db<dbmap->num;db++) {
455 for (i=0;i<remote_dbmap->num;i++) {
456 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
460 /* the remote node already have this database */
461 if (i!=remote_dbmap->num) {
464 /* ok so we need to create this database */
465 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
466 dbmap->dbs[db].db_id, mem_ctx,
469 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
472 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
473 nodemap->nodes[j].pnn,
475 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
477 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
488 ensure we are attached to any databases that anyone else is attached to
490 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
491 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
494 struct ctdb_dbid_map_old *remote_dbmap;
496 /* verify that we have all database any other node has */
497 for (j=0; j<nodemap->num; j++) {
498 /* we don't need to ourself ourselves */
499 if (nodemap->nodes[j].pnn == pnn) {
502 /* don't check nodes that are unavailable */
503 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
507 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
508 mem_ctx, &remote_dbmap);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
514 /* step through all databases on the remote node */
515 for (db=0; db<remote_dbmap->num;db++) {
518 for (i=0;i<(*dbmap)->num;i++) {
519 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
523 /* we already have this db locally */
524 if (i!=(*dbmap)->num) {
527 /* ok so we need to create this database and
530 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
531 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
533 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
534 nodemap->nodes[j].pnn));
537 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
538 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
540 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
543 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
545 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
555 update flags on all active nodes
557 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
561 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
563 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	talloc_free(state);
}
580 * Process one elements of the vacuum fetch list:
581 * Migrate it over to us with the special flag
582 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
584 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
586 struct ctdb_rec_data_old *r)
588 struct ctdb_client_call_state *state;
590 struct ctdb_ltdb_header *hdr;
591 struct ctdb_call call;
594 call.call_id = CTDB_NULL_FUNC;
595 call.flags = CTDB_IMMEDIATE_MIGRATION;
596 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
598 call.key.dptr = &r->data[0];
599 call.key.dsize = r->keylen;
601 /* ensure we don't block this daemon - just skip a record if we can't get
603 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
607 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
608 if (data.dptr == NULL) {
609 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
613 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
615 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
619 hdr = (struct ctdb_ltdb_header *)data.dptr;
620 if (hdr->dmaster == pnn) {
621 /* its already local */
623 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
629 state = ctdb_call_send(ctdb_db, &call);
630 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
632 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
635 state->async.fn = vacuum_fetch_callback;
636 state->async.private_data = NULL;
643 handler for vacuum fetch
645 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
648 struct ctdb_recoverd *rec = talloc_get_type(
649 private_data, struct ctdb_recoverd);
650 struct ctdb_context *ctdb = rec->ctdb;
651 struct ctdb_marshall_buffer *recs;
653 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
655 struct ctdb_dbid_map_old *dbmap=NULL;
656 bool persistent = false;
657 struct ctdb_db_context *ctdb_db;
658 struct ctdb_rec_data_old *r;
660 recs = (struct ctdb_marshall_buffer *)data.dptr;
662 if (recs->count == 0) {
666 /* work out if the database is persistent */
667 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
669 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
673 for (i=0;i<dbmap->num;i++) {
674 if (dbmap->dbs[i].db_id == recs->db_id) {
675 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
679 if (i == dbmap->num) {
680 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
684 /* find the name of this database */
685 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
686 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
691 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
692 if (ctdb_db == NULL) {
693 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
697 r = (struct ctdb_rec_data_old *)&recs->data[0];
698 while (recs->count) {
701 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
706 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
711 talloc_free(tmp_ctx);
716 * handler for database detach
718 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
721 struct ctdb_recoverd *rec = talloc_get_type(
722 private_data, struct ctdb_recoverd);
723 struct ctdb_context *ctdb = rec->ctdb;
725 struct ctdb_db_context *ctdb_db;
727 if (data.dsize != sizeof(db_id)) {
730 db_id = *(uint32_t *)data.dptr;
732 ctdb_db = find_ctdb_db(ctdb, db_id);
733 if (ctdb_db == NULL) {
734 /* database is not attached */
738 DLIST_REMOVE(ctdb->db_list, ctdb_db);
740 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
742 talloc_free(ctdb_db);
/*
  called when ctdb_wait_timeout should finish
 */
static void ctdb_wait_handler(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval yt, void *p)
{
	uint32_t *timed_out = (uint32_t *)p;
	(*timed_out) = 1;
}
757 wait for a given number of seconds
759 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
761 uint32_t timed_out = 0;
762 time_t usecs = (secs - (time_t)secs) * 1000000;
763 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
764 ctdb_wait_handler, &timed_out);
766 tevent_loop_once(ctdb->ev);
771 called when an election times out (ends)
773 static void ctdb_election_timeout(struct tevent_context *ev,
774 struct tevent_timer *te,
775 struct timeval t, void *p)
777 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
778 rec->election_timeout = NULL;
781 DEBUG(DEBUG_WARNING,("Election period ended\n"));
786 wait for an election to finish. It finished election_timeout seconds after
787 the last election packet is received
789 static void ctdb_wait_election(struct ctdb_recoverd *rec)
791 struct ctdb_context *ctdb = rec->ctdb;
792 while (rec->election_timeout) {
793 tevent_loop_once(ctdb->ev);
798 Update our local flags from all remote connected nodes.
799 This is only run when we are or we belive we are the recovery master
801 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
804 struct ctdb_context *ctdb = rec->ctdb;
805 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
807 /* get the nodemap for all active remote nodes and verify
808 they are the same as for this node
810 for (j=0; j<nodemap->num; j++) {
811 struct ctdb_node_map_old *remote_nodemap=NULL;
814 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
817 if (nodemap->nodes[j].pnn == ctdb->pnn) {
821 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
822 mem_ctx, &remote_nodemap);
824 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
825 nodemap->nodes[j].pnn));
826 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
827 talloc_free(mem_ctx);
830 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
831 /* We should tell our daemon about this so it
832 updates its flags or else we will log the same
833 message again in the next iteration of recovery.
834 Since we are the recovery master we can just as
835 well update the flags on all nodes.
837 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
839 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
843 /* Update our local copy of the flags in the recovery
846 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
847 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
848 nodemap->nodes[j].flags));
849 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
851 talloc_free(remote_nodemap);
853 talloc_free(mem_ctx);
858 /* Create a new random generation id.
859 The generation id can not be the INVALID_GENERATION id
861 static uint32_t new_generation(void)
866 generation = random();
868 if (generation != INVALID_GENERATION) {
876 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
878 return (rec->recovery_lock_handle != NULL);
881 struct hold_reclock_state {
887 static void take_reclock_handler(char status,
891 struct hold_reclock_state *s =
892 (struct hold_reclock_state *) private_data;
896 s->latency = latency;
901 ("Unable to take recovery lock - contention\n"));
905 DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
909 s->locked = (status == '0') ;
912 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec);
914 static void lost_reclock_handler(void *private_data)
916 struct ctdb_recoverd *rec = talloc_get_type_abort(
917 private_data, struct ctdb_recoverd);
920 ("Recovery lock helper terminated unexpectedly - "
921 "trying to retake recovery lock\n"));
922 TALLOC_FREE(rec->recovery_lock_handle);
923 if (! ctdb_recovery_lock(rec)) {
924 DEBUG(DEBUG_ERR, ("Failed to take recovery lock\n"));
928 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
930 struct ctdb_context *ctdb = rec->ctdb;
931 struct ctdb_cluster_mutex_handle *h;
932 struct hold_reclock_state s = {
938 h = ctdb_cluster_mutex(rec, ctdb, ctdb->recovery_lock, 0,
939 take_reclock_handler, &s,
940 lost_reclock_handler, rec);
946 tevent_loop_once(ctdb->ev);
954 rec->recovery_lock_handle = h;
955 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(),
961 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
963 if (rec->recovery_lock_handle != NULL) {
964 DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
965 TALLOC_FREE(rec->recovery_lock_handle);
969 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
971 struct ctdb_context *ctdb = rec->ctdb;
973 struct ctdb_banning_state *ban_state;
976 for (i=0; i<ctdb->num_nodes; i++) {
977 if (ctdb->nodes[i]->ban_state == NULL) {
980 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
981 if (ban_state->count < 2*ctdb->num_nodes) {
985 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
986 ctdb->nodes[i]->pnn, ban_state->count,
987 ctdb->tunable.recovery_ban_period));
988 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
989 ban_state->count = 0;
991 /* Banning ourself? */
992 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
998 struct helper_state {
1005 static void helper_handler(struct tevent_context *ev,
1006 struct tevent_fd *fde,
1007 uint16_t flags, void *private_data)
1009 struct helper_state *state = talloc_get_type_abort(
1010 private_data, struct helper_state);
1013 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1014 if (ret != sizeof(state->result)) {
1015 state->result = EPIPE;
1021 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1022 const char *prog, const char *arg, const char *type)
1024 struct helper_state *state;
1025 struct tevent_fd *fde;
1029 state = talloc_zero(mem_ctx, struct helper_state);
1030 if (state == NULL) {
1031 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1037 ret = pipe(state->fd);
1040 ("Failed to create pipe for %s helper\n", type));
1044 set_close_on_exec(state->fd[0]);
1047 args = talloc_array(state, const char *, nargs);
1049 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1053 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1054 if (args[0] == NULL) {
1055 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1058 args[1] = rec->ctdb->daemon.name;
1062 if (args[2] == NULL) {
1066 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1067 if (state->pid == -1) {
1069 ("Failed to create child for %s helper\n", type));
1073 close(state->fd[1]);
1076 state->done = false;
1078 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1079 TEVENT_FD_READ, helper_handler, state);
1083 tevent_fd_set_auto_close(fde);
1085 while (!state->done) {
1086 tevent_loop_once(rec->ctdb->ev);
1089 close(state->fd[0]);
1092 if (state->result != 0) {
1096 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1101 if (state->fd[0] != -1) {
1102 close(state->fd[0]);
1104 if (state->fd[1] != -1) {
1105 close(state->fd[1]);
1107 if (state->pid != -1) {
1108 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1116 static bool do_takeover_run(struct ctdb_recoverd *rec,
1117 struct ctdb_node_map_old *nodemap)
1119 uint32_t *nodes = NULL;
1120 struct ctdb_disable_message dtr;
1123 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1127 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1129 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1130 DEBUG(DEBUG_ERR, (__location__
1131 " takeover run already in progress \n"));
1136 if (!ctdb_op_begin(rec->takeover_run)) {
1141 /* Disable IP checks (takeover runs, really) on other nodes
1142 * while doing this takeover run. This will stop those other
1143 * nodes from triggering takeover runs when think they should
1144 * be hosting an IP but it isn't yet on an interface. Don't
1145 * wait for replies since a failure here might cause some
1146 * noise in the logs but will not actually cause a problem.
1149 dtr.srvid = 0; /* No reply */
1152 data.dptr = (uint8_t*)&dtr;
1153 data.dsize = sizeof(dtr);
1155 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1157 /* Disable for 60 seconds. This can be a tunable later if
1161 for (i = 0; i < talloc_array_length(nodes); i++) {
1162 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1163 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1165 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1169 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1170 rec->force_rebalance_nodes);
1172 /* Reenable takeover runs and IP checks on other nodes */
1174 for (i = 0; i < talloc_array_length(nodes); i++) {
1175 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1176 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1178 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1183 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1189 /* Takeover run was successful so clear force rebalance targets */
1190 if (rebalance_nodes == rec->force_rebalance_nodes) {
1191 TALLOC_FREE(rec->force_rebalance_nodes);
1193 DEBUG(DEBUG_WARNING,
1194 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1197 rec->need_takeover_run = !ok;
1199 ctdb_op_end(rec->takeover_run);
1201 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1205 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1207 static char prog[PATH_MAX+1] = "";
1210 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1211 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1212 "ctdb_recovery_helper")) {
1213 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1216 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1218 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1222 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1224 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1228 we are the recmaster, and recovery is needed - start a recovery run
1230 static int do_recovery(struct ctdb_recoverd *rec,
1231 TALLOC_CTX *mem_ctx, uint32_t pnn,
1232 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1234 struct ctdb_context *ctdb = rec->ctdb;
1236 struct ctdb_dbid_map_old *dbmap;
1239 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1241 /* Check if the current node is still the recmaster. It's possible that
1242 * re-election has changed the recmaster.
1244 if (pnn != rec->recmaster) {
1246 ("Recovery master changed to %u, aborting recovery\n",
1251 /* if recovery fails, force it again */
1252 rec->need_recovery = true;
1254 if (!ctdb_op_begin(rec->recovery)) {
1258 if (rec->election_timeout) {
1259 /* an election is in progress */
1260 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1264 ban_misbehaving_nodes(rec, &self_ban);
1266 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1270 if (ctdb->recovery_lock != NULL) {
1271 if (ctdb_recovery_have_lock(rec)) {
1272 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1274 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1275 ctdb->recovery_lock));
1276 if (!ctdb_recovery_lock(rec)) {
1277 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1278 /* If ctdb is trying first recovery, it's
1279 * possible that current node does not know
1280 * yet who the recmaster is.
1282 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1283 " - retrying recovery\n"));
1287 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1288 "and ban ourself for %u seconds\n",
1289 ctdb->tunable.recovery_ban_period));
1290 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1294 ("Recovery lock taken successfully by recovery daemon\n"));
1298 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1300 /* get a list of all databases */
1301 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1303 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1307 /* we do the db creation before we set the recovery mode, so the freeze happens
1308 on all databases we will be dealing with. */
1310 /* verify that we have all the databases any other node has */
1311 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1313 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1317 /* verify that all other nodes have all our databases */
1318 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1320 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1323 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1326 /* Retrieve capabilities from all connected nodes */
1327 ret = update_capabilities(rec, nodemap);
1329 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1334 update all nodes to have the same flags that we have
1336 for (i=0;i<nodemap->num;i++) {
1337 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1341 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1343 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1344 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1346 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1352 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1354 ret = db_recovery_parallel(rec, mem_ctx);
1359 do_takeover_run(rec, nodemap);
1361 /* send a message to all clients telling them that the cluster
1362 has been reconfigured */
1363 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1364 CTDB_SRVID_RECONFIGURE, tdb_null);
1366 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1370 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1372 rec->need_recovery = false;
1373 ctdb_op_end(rec->recovery);
1375 /* we managed to complete a full recovery, make sure to forgive
1376 any past sins by the nodes that could now participate in the
1379 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1380 for (i=0;i<nodemap->num;i++) {
1381 struct ctdb_banning_state *ban_state;
1383 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1387 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1388 if (ban_state == NULL) {
1392 ban_state->count = 0;
1395 /* We just finished a recovery successfully.
1396 We now wait for rerecovery_timeout before we allow
1397 another recovery to take place.
1399 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1400 ctdb_op_disable(rec->recovery, ctdb->ev,
1401 ctdb->tunable.rerecovery_timeout);
1405 ctdb_op_end(rec->recovery);
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;
	struct timeval priority_time;
	uint32_t pnn;
	uint32_t node_flags;
};
1422 form this nodes election data
/* Fill in *em with this node's election credentials: our pnn, the time this
 * recovery daemon started, our node flags and the number of connected nodes
 * we can see.  A node lacking CTDB_CAP_RECMASTER deliberately degrades its
 * own data (zero connections, current time) so it cannot win.
 * NOTE(review): several lines (braces, error-path returns) are elided in
 * this excerpt. */
1424 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1427 struct ctdb_node_map_old *nodemap;
1428 struct ctdb_context *ctdb = rec->ctdb;
1432 em->pnn = rec->ctdb->pnn;
1433 em->priority_time = rec->priority_time;
1435 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1437 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
/* cache our own flags so ctdb_election_win() can consult them */
1441 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1442 em->node_flags = rec->node_flags;
1444 for (i=0;i<nodemap->num;i++) {
1445 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1446 em->num_connected++;
1450 /* we shouldn't try to win this election if we can't be a recmaster */
1451 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1452 em->num_connected = 0;
1453 em->priority_time = timeval_current();
1456 talloc_free(nodemap);
1460 see if the given election data wins
/* Decide whether our election data beats the incoming data in *em.
 * Order of checks: our own capability/banned/stopped state first (we lose),
 * then the peer's banned/stopped state (we win), then priority_time
 * (longest-running recoverd), finally pnn as the tiebreak.
 * NOTE(review): the return statements inside the guards are elided in this
 * excerpt. */
1462 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1464 struct election_message myem;
1467 ctdb_election_data(rec, &myem);
1469 /* we cant win if we don't have the recmaster capability */
1470 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1474 /* we cant win if we are banned */
1475 if (rec->node_flags & NODE_FLAGS_BANNED) {
1479 /* we cant win if we are stopped */
1480 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1484 /* we will automatically win if the other node is banned */
1485 if (em->node_flags & NODE_FLAGS_BANNED) {
1489 /* we will automatically win if the other node is stopped */
1490 if (em->node_flags & NODE_FLAGS_STOPPED) {
1494 /* then the longest running node */
1496 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tiebreak: compare pnns */
1500 cmp = (int)myem.pnn - (int)em->pnn;
1507 send out an election request
/* Broadcast our election credentials to all nodes on CTDB_SRVID_ELECTION.
 * We optimistically set ourselves as recmaster on the local node first;
 * if a better candidate responds, election_handler() will concede later.
 * Returns the result of ctdb_client_send_message().
 * NOTE(review): the error-return after the setrecmaster failure DEBUG is
 * elided in this excerpt. */
1509 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1512 TDB_DATA election_data;
1513 struct election_message emsg;
1515 struct ctdb_context *ctdb = rec->ctdb;
1517 srvid = CTDB_SRVID_ELECTION;
1519 ctdb_election_data(rec, &emsg);
/* wrap the election_message struct in a TDB_DATA blob for the message */
1521 election_data.dsize = sizeof(struct election_message);
1522 election_data.dptr = (unsigned char *)&emsg;
1525 /* first we assume we will win the election and set
1526 recoverymaster to be ourself on the current node
1528 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1529 CTDB_CURRENT_NODE, pnn);
1531 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1534 rec->recmaster = pnn;
1536 /* send an election message to all active nodes */
1537 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1538 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1542 we think we are winning the election - send a broadcast election request
/* tevent timer callback: we believe we are winning the election, so
 * (re)broadcast our election request to the whole cluster, then clear the
 * one-shot timer handle so election_handler() may schedule another. */
1544 static void election_send_request(struct tevent_context *ev,
1545 struct tevent_timer *te,
1546 struct timeval t, void *p)
1548 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1551 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1553 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1556 TALLOC_FREE(rec->send_election_te);
1560 handler for memory dumps
/* SRVID message handler: produce a talloc memory-usage dump of this
 * recovery daemon and send it back to the requester identified by the
 * ctdb_srvid_message (pnn + srvid) carried in the payload.
 * All temporaries hang off tmp_ctx, which is freed on every exit path
 * visible here.
 * NOTE(review): the return statements following the talloc_free(tmp_ctx)
 * calls in the error paths are elided in this excerpt. */
1562 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1564 struct ctdb_recoverd *rec = talloc_get_type(
1565 private_data, struct ctdb_recoverd);
1566 struct ctdb_context *ctdb = rec->ctdb;
1567 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1570 struct ctdb_srvid_message *rd;
/* the payload must be exactly a return-address structure */
1572 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1573 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1574 talloc_free(tmp_ctx);
1577 rd = (struct ctdb_srvid_message *)data.dptr;
1579 dump = talloc_zero(tmp_ctx, TDB_DATA);
1581 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1582 talloc_free(tmp_ctx);
1585 ret = ctdb_dump_memory(ctdb, dump);
1587 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1588 talloc_free(tmp_ctx);
1592 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* reply to the original requester's pnn/srvid with the dump blob */
1594 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1596 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1597 talloc_free(tmp_ctx);
1601 talloc_free(tmp_ctx);
1605 handler for reload_nodes
/* SRVID message handler: re-read the nodes file into this recovery
 * daemon's ctdb context.  The payload is unused. */
1607 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1610 struct ctdb_recoverd *rec = talloc_get_type(
1611 private_data, struct ctdb_recoverd);
1613 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1615 ctdb_load_nodes_file(rec->ctdb);
/* SRVID message handler: record a node PNN (uint32_t payload) in
 * rec->force_rebalance_nodes so the next takeover run force-rebalances IPs
 * towards it.  Only acted on when we are the recmaster.  The list is
 * rebuilt (old array copied into a fresh talloc array) rather than
 * realloc'ed — see the comment below about the timeout event tied to the
 * old array's lifetime.
 * NOTE(review): early returns and the closing braces of the guards are
 * elided in this excerpt. */
1619 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1622 struct ctdb_recoverd *rec = talloc_get_type(
1623 private_data, struct ctdb_recoverd);
1624 struct ctdb_context *ctdb = rec->ctdb;
/* only the recovery master performs rebalancing */
1629 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1633 if (data.dsize != sizeof(uint32_t)) {
1634 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1638 pnn = *(uint32_t *)&data.dptr[0];
1640 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1642 /* Copy any existing list of nodes. There's probably some
1643 * sort of realloc variant that will do this but we need to
1644 * make sure that freeing the old array also cancels the timer
1645 * event for the timeout... not sure if realloc will do that.
1647 len = (rec->force_rebalance_nodes != NULL) ?
1648 talloc_array_length(rec->force_rebalance_nodes) :
1651 /* This allows duplicates to be added but they don't cause
1652 * harm. A call to add a duplicate PNN arguably means that
1653 * the timeout should be reset, so this is the simplest
1656 t = talloc_zero_array(rec, uint32_t, len+1);
1657 CTDB_NO_MEMORY_VOID(ctdb, t);
1659 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
/* free the old array (cancelling any timer parented to it) before
 * installing the new one */
1663 talloc_free(rec->force_rebalance_nodes);
1665 rec->force_rebalance_nodes = t;
1670 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1672 struct ctdb_op_state *op_state)
1674 struct ctdb_disable_message *r;
1679 /* Validate input data */
1680 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1681 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1682 "expecting %lu\n", (long unsigned)data.dsize,
1683 (long unsigned)sizeof(struct ctdb_srvid_message)));
1686 if (data.dptr == NULL) {
1687 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1691 r = (struct ctdb_disable_message *)data.dptr;
1692 timeout = r->timeout;
1694 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1699 /* Returning our PNN tells the caller that we succeeded */
1700 ret = ctdb_get_pnn(ctdb);
1702 result.dsize = sizeof(int32_t);
1703 result.dptr = (uint8_t *)&ret;
1704 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
/* SRVID message handler: disable takeover runs for the requested timeout
 * and reply via the common srvid_disable_and_reply() helper. */
1707 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1710 struct ctdb_recoverd *rec = talloc_get_type(
1711 private_data, struct ctdb_recoverd);
1713 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1716 /* Backward compatibility for this SRVID */
/* Legacy variant of disable_takeover_runs_handler: the payload is a bare
 * uint32_t timeout rather than a ctdb_disable_message, and no reply is
 * sent.
 * NOTE(review): the early returns inside the validation guards are elided
 * in this excerpt. */
1717 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1720 struct ctdb_recoverd *rec = talloc_get_type(
1721 private_data, struct ctdb_recoverd);
1724 if (data.dsize != sizeof(uint32_t)) {
1725 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1726 "expecting %lu\n", (long unsigned)data.dsize,
1727 (long unsigned)sizeof(uint32_t)));
1730 if (data.dptr == NULL) {
1731 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1735 timeout = *((uint32_t *)data.dptr);
1737 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
/* SRVID message handler: disable recoveries for the requested timeout and
 * reply via the common srvid_disable_and_reply() helper. */
1740 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1743 struct ctdb_recoverd *rec = talloc_get_type(
1744 private_data, struct ctdb_recoverd);
1746 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1750 handler for ip reallocate, just add it to the list of requests and
1751 handle this later in the monitor_cluster loop so we do not recurse
1752 with other requests to takeover_run()
/* SRVID message handler for CTDB_SRVID_TAKEOVER_RUN: queue the request on
 * rec->reallocate_requests; the monitor_cluster loop later processes the
 * queue via process_ipreallocate_requests(), avoiding re-entrancy with
 * other callers of do_takeover_run().
 * NOTE(review): the early return after the size-check DEBUG is elided in
 * this excerpt. */
1754 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1757 struct ctdb_srvid_message *request;
1758 struct ctdb_recoverd *rec = talloc_get_type(
1759 private_data, struct ctdb_recoverd);
1761 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1762 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1766 request = (struct ctdb_srvid_message *)data.dptr;
1768 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1771 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1772 struct ctdb_recoverd *rec)
1776 struct srvid_requests *current;
1778 /* Only process requests that are currently pending. More
1779 * might come in while the takeover run is in progress and
1780 * they will need to be processed later since they might
1781 * be in response flag changes.
1783 current = rec->reallocate_requests;
1784 rec->reallocate_requests = NULL;
1786 if (do_takeover_run(rec, rec->nodemap)) {
1787 ret = ctdb_get_pnn(ctdb);
1792 result.dsize = sizeof(int32_t);
1793 result.dptr = (uint8_t *)&ret;
1795 srvid_requests_reply(ctdb, ¤t, result);
1799 * handler for assigning banning credits
/* SRVID message handler: assign maximum banning credits to the node whose
 * PNN is carried in the uint32_t payload.  Ignored unless we are the
 * recovery master.
 * NOTE(review): early returns inside the guards are elided in this
 * excerpt. */
1801 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1803 struct ctdb_recoverd *rec = talloc_get_type(
1804 private_data, struct ctdb_recoverd);
1807 /* Ignore if we are not recmaster */
1808 if (rec->ctdb->pnn != rec->recmaster) {
1812 if (data.dsize != sizeof(uint32_t)) {
1813 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1818 ban_pnn = *(uint32_t *)data.dptr;
/* set the culprit count to nodemap->num, i.e. enough to trigger a ban */
1820 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1824 handler for recovery master elections
/* SRVID message handler for CTDB_SRVID_ELECTION: compare the sender's
 * election data against our own.  If we would win, schedule a (one-shot)
 * counter-broadcast; otherwise concede — cancel any pending broadcast,
 * release the recovery lock if we hold it, and record the sender as the
 * new recmaster locally.
 * NOTE(review): several lines (size validation, returns, the first timer
 * argument, closing braces) are elided in this excerpt. */
1826 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1828 struct ctdb_recoverd *rec = talloc_get_type(
1829 private_data, struct ctdb_recoverd);
1830 struct ctdb_context *ctdb = rec->ctdb;
1832 struct election_message *em = (struct election_message *)data.dptr;
1834 /* Ignore election packets from ourself */
1835 if (ctdb->pnn == em->pnn) {
1839 /* we got an election packet - update the timeout for the election */
1840 talloc_free(rec->election_timeout);
1841 rec->election_timeout = tevent_add_timer(
/* short 0.5s timeout is presumably the fast-start path; the tunable
 * election_timeout is used otherwise — TODO confirm the elided condition */
1844 timeval_current_ofs(0, 500000) :
1845 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1846 ctdb_election_timeout, rec);
1848 /* someone called an election. check their election data
1849 and if we disagree and we would rather be the elected node,
1850 send a new election message to all other nodes
1852 if (ctdb_election_win(rec, em)) {
1853 if (!rec->send_election_te) {
1854 rec->send_election_te = tevent_add_timer(
1856 timeval_current_ofs(0, 500000),
1857 election_send_request, rec);
/* we lose the election: stop campaigning */
1863 TALLOC_FREE(rec->send_election_te);
1865 /* Release the recovery lock file */
1866 if (ctdb_recovery_have_lock(rec)) {
1867 ctdb_recovery_unlock(rec);
1870 /* ok, let that guy become recmaster then */
1871 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1872 CTDB_CURRENT_NODE, em->pnn);
1874 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1877 rec->recmaster = em->pnn;
1884 force the start of the election process
/* Start a recmaster election: put the whole cluster into active recovery
 * mode (stopping internode traffic), arm the election timeout, broadcast
 * our election request, and block in ctdb_wait_election() while responses
 * are collected.
 * NOTE(review): error-path returns and the first tevent_add_timer
 * arguments are elided in this excerpt. */
1886 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1887 struct ctdb_node_map_old *nodemap)
1890 struct ctdb_context *ctdb = rec->ctdb;
1892 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1894 /* set all nodes to recovery mode to stop all internode traffic */
1895 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1897 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (re)arm the election timeout before campaigning */
1901 talloc_free(rec->election_timeout);
1902 rec->election_timeout = tevent_add_timer(
1905 timeval_current_ofs(0, 500000) :
1906 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1907 ctdb_election_timeout, rec);
1909 ret = send_election_request(rec, pnn);
1911 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1915 /* wait for a few seconds to collect all responses */
1916 ctdb_wait_election(rec);
1922 handler for when a node changes its flags
/* SRVID message handler: a node changed its flags.  Fetch the local
 * nodemap, locate the node, log the change and update our cached copy of
 * that node's flags.
 * NOTE(review): early returns inside the guards are elided in this
 * excerpt. */
1924 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1926 struct ctdb_recoverd *rec = talloc_get_type(
1927 private_data, struct ctdb_recoverd);
1928 struct ctdb_context *ctdb = rec->ctdb;
1930 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1931 struct ctdb_node_map_old *nodemap=NULL;
1932 TALLOC_CTX *tmp_ctx;
1935 if (data.dsize != sizeof(*c)) {
1936 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1940 tmp_ctx = talloc_new(ctdb);
1941 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1943 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1945 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1946 talloc_free(tmp_ctx);
/* find the changed node's slot in the nodemap */
1951 for (i=0;i<nodemap->num;i++) {
1952 if (nodemap->nodes[i].pnn == c->pnn) break;
1955 if (i == nodemap->num) {
1956 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1957 talloc_free(tmp_ctx);
1961 if (c->old_flags != c->new_flags) {
1962 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1965 nodemap->nodes[i].flags = c->new_flags;
1967 talloc_free(tmp_ctx);
1971 handler for when we need to push out flag changes ot all other nodes
/* SRVID message handler: push a node's flag change out to all connected
 * nodes.  Reads the authoritative nodemap from the recmaster, validates
 * that it contains the node in question, then issues an async
 * CTDB_CONTROL_MODIFY_FLAGS to every connected node.
 * NOTE(review): early returns inside the guards are elided in this
 * excerpt. */
1973 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
1976 struct ctdb_recoverd *rec = talloc_get_type(
1977 private_data, struct ctdb_recoverd);
1978 struct ctdb_context *ctdb = rec->ctdb;
1980 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1981 struct ctdb_node_map_old *nodemap=NULL;
1982 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1985 /* read the node flags from the recmaster */
1986 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
1989 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
1990 talloc_free(tmp_ctx);
1993 if (c->pnn >= nodemap->num) {
1994 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
1995 talloc_free(tmp_ctx);
1999 /* send the flags update to all connected nodes */
2000 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2002 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2003 nodes, 0, CONTROL_TIMEOUT(),
2007 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2009 talloc_free(tmp_ctx);
2013 talloc_free(tmp_ctx);
/* Shared state for the async recmode verification fan-out in
 * verify_recmode(); status is downgraded by the per-node callback.
 * NOTE(review): the count member decremented as replies arrive (original
 * line 2018) is not visible in this excerpt. */
2017 struct verify_recmode_normal_data {
2019 enum monitor_result status;
/* Async control callback, one invocation per node polled by
 * verify_recmode().  Marks the shared status MONITOR_FAILED if the control
 * itself failed, or MONITOR_RECOVERY_NEEDED if the node reported a recmode
 * other than CTDB_RECOVERY_NORMAL.
 * NOTE(review): the rmdata->count decrement and the early returns are
 * elided in this excerpt. */
2022 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2024 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2027 /* one more node has responded with recmode data*/
2030 /* if we failed to get the recmode, then return an error and let
2031 the main loop try again.
2033 if (state->state != CTDB_CONTROL_DONE) {
2034 if (rmdata->status == MONITOR_OK) {
2035 rmdata->status = MONITOR_FAILED;
2040 /* if we got a response, then the recmode will be stored in the
2043 if (state->status != CTDB_RECOVERY_NORMAL) {
2044 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2045 rmdata->status = MONITOR_RECOVERY_NEEDED;
2052 /* verify that all nodes are in normal recovery mode */
/* Fan out an async getrecmode to every active node, pump the event loop
 * until all replies (or failures) have been accounted for, and return the
 * aggregated monitor_result.  Returns MONITOR_FAILED if any control could
 * not even be sent.
 * NOTE(review): the continue in the inactive-node guard and the count
 * bookkeeping lines are elided in this excerpt. */
2053 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2055 struct verify_recmode_normal_data *rmdata;
2056 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2057 struct ctdb_client_control_state *state;
2058 enum monitor_result status;
2061 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2062 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2064 rmdata->status = MONITOR_OK;
2066 /* loop over all active nodes and send an async getrecmode call to
2068 for (j=0; j<nodemap->num; j++) {
2069 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2072 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2074 nodemap->nodes[j].pnn);
2075 if (state == NULL) {
2076 /* we failed to send the control, treat this as
2077 an error and try again next iteration
2079 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2080 talloc_free(mem_ctx);
2081 return MONITOR_FAILED;
2084 /* set up the callback functions */
2085 state->async.fn = verify_recmode_normal_callback;
2086 state->async.private_data = rmdata;
2088 /* one more control to wait for to complete */
2093 /* now wait for up to the maximum number of seconds allowed
2094 or until all nodes we expect a response from has replied
2096 while (rmdata->count > 0) {
2097 tevent_loop_once(ctdb->ev);
/* capture status before freeing mem_ctx, which owns rmdata */
2100 status = rmdata->status;
2101 talloc_free(mem_ctx);
/* Shared state for the async recmaster verification fan-out in
 * verify_recmaster(); pnn is the recmaster every node is expected to
 * report.
 * NOTE(review): the count and pnn members (original lines 2108-2109) are
 * not visible in this excerpt. */
2106 struct verify_recmaster_data {
2107 struct ctdb_recoverd *rec;
2110 enum monitor_result status;
/* Async control callback, one invocation per node polled by
 * verify_recmaster().  Marks the shared status MONITOR_FAILED if the
 * control failed, or MONITOR_ELECTION_NEEDED (and blames the node as
 * culprit) if it reported a different recmaster than expected.
 * NOTE(review): the rmdata->count decrement and early returns are elided
 * in this excerpt. */
2113 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2115 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2118 /* one more node has responded with recmaster data*/
2121 /* if we failed to get the recmaster, then return an error and let
2122 the main loop try again.
2124 if (state->state != CTDB_CONTROL_DONE) {
2125 if (rmdata->status == MONITOR_OK) {
2126 rmdata->status = MONITOR_FAILED;
2131 /* if we got a response, then the recmaster will be stored in the
2134 if (state->status != rmdata->pnn) {
2135 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2136 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2137 rmdata->status = MONITOR_ELECTION_NEEDED;
2144 /* verify that all nodes agree that we are the recmaster */
/* Fan out an async getrecmaster to every active node except the recmaster
 * itself, pump the event loop until all replies have been accounted for,
 * and return the aggregated monitor_result (MONITOR_ELECTION_NEEDED when
 * any node disagrees).
 * NOTE(review): the continue statements in the skip-guards and the count
 * bookkeeping lines are elided in this excerpt. */
2145 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2147 struct ctdb_context *ctdb = rec->ctdb;
2148 struct verify_recmaster_data *rmdata;
2149 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2150 struct ctdb_client_control_state *state;
2151 enum monitor_result status;
2154 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2155 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2159 rmdata->status = MONITOR_OK;
2161 /* loop over all active nodes and send an async getrecmaster call to
2163 for (j=0; j<nodemap->num; j++) {
/* no need to ask the recmaster about itself */
2164 if (nodemap->nodes[j].pnn == rec->recmaster) {
2167 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2170 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2172 nodemap->nodes[j].pnn);
2173 if (state == NULL) {
2174 /* we failed to send the control, treat this as
2175 an error and try again next iteration
2177 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2178 talloc_free(mem_ctx);
2179 return MONITOR_FAILED;
2182 /* set up the callback functions */
2183 state->async.fn = verify_recmaster_callback;
2184 state->async.private_data = rmdata;
2186 /* one more control to wait for to complete */
2191 /* now wait for up to the maximum number of seconds allowed
2192 or until all nodes we expect a response from has replied
2194 while (rmdata->count > 0) {
2195 tevent_loop_once(ctdb->ev);
/* capture status before freeing mem_ctx, which owns rmdata */
2198 status = rmdata->status;
2199 talloc_free(mem_ctx);
/* Compare the current local interface list against the copy cached in
 * rec->ifaces.  Returns true on first call (no cache yet), when the
 * interface count differs, when an interface name in a slot differs, or
 * when a link state differs.  On failure to read interfaces it errs on the
 * side of "changed".  The fresh list replaces the cached one before
 * returning.
 * NOTE(review): the ret-assignment and loop-exit lines, plus several
 * closing braces, are elided in this excerpt. */
2203 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2204 struct ctdb_recoverd *rec)
2206 struct ctdb_iface_list_old *ifaces = NULL;
2207 TALLOC_CTX *mem_ctx;
2210 mem_ctx = talloc_new(NULL);
2212 /* Read the interfaces from the local node */
2213 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2214 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2215 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2216 /* We could return an error. However, this will be
2217 * rare so we'll decide that the interfaces have
2218 * actually changed, just in case.
2220 talloc_free(mem_ctx);
2225 /* We haven't been here before so things have changed */
2226 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2228 } else if (rec->ifaces->num != ifaces->num) {
2229 /* Number of interfaces has changed */
2230 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2231 rec->ifaces->num, ifaces->num));
2234 /* See if interface names or link states have changed */
2236 for (i = 0; i < rec->ifaces->num; i++) {
2237 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2238 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2240 ("Interface in slot %d changed: %s => %s\n",
2241 i, iface->name, ifaces->ifaces[i].name));
2245 if (iface->link_state != ifaces->ifaces[i].link_state) {
2247 ("Interface %s changed state: %d => %d\n",
2248 iface->name, iface->link_state,
2249 ifaces->ifaces[i].link_state));
/* cache the fresh list for the next comparison */
2256 talloc_free(rec->ifaces);
2257 rec->ifaces = talloc_steal(rec, ifaces);
2259 talloc_free(mem_ctx);
2263 /* Check that the local allocation of public IP addresses is correct
2264 * and do some house-keeping */
/* Steps visible here: (1) if not recmaster, drop queued reallocate
 * requests and forced-rebalance list; (2) return early if IP failover is
 * disabled; (3) flag a takeover run if interfaces changed, if an unhosted
 * IP could be served locally, or (when do_checkpublicip is set) if the
 * kernel's view of held IPs disagrees with CTDB's; (4) if flagged, message
 * the recmaster on CTDB_SRVID_TAKEOVER_RUN.
 * NOTE(review): the return value and several early-return/brace lines are
 * elided in this excerpt. */
2265 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2266 struct ctdb_recoverd *rec,
2268 struct ctdb_node_map_old *nodemap)
2270 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2272 bool need_takeover_run = false;
2273 struct ctdb_public_ip_list_old *ips = NULL;
2275 /* If we are not the recmaster then do some housekeeping */
2276 if (rec->recmaster != pnn) {
2277 /* Ignore any IP reallocate requests - only recmaster
2280 TALLOC_FREE(rec->reallocate_requests);
2281 /* Clear any nodes that should be force rebalanced in
2282 * the next takeover run. If the recovery master role
2283 * has moved then we don't want to process these some
2284 * time in the future.
2286 TALLOC_FREE(rec->force_rebalance_nodes);
2289 /* Return early if disabled... */
2290 if (ctdb->tunable.disable_ip_failover != 0 ||
2291 ctdb_op_is_disabled(rec->takeover_run)) {
2295 if (interfaces_have_changed(ctdb, rec)) {
2296 need_takeover_run = true;
2299 /* If there are unhosted IPs but this node can host them then
2300 * trigger an IP reallocation */
2302 /* Read *available* IPs from local node */
2303 ret = ctdb_ctrl_get_public_ips_flags(
2304 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2305 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2307 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2308 talloc_free(mem_ctx);
2312 for (j=0; j<ips->num; j++) {
/* pnn == -1 means the IP is unassigned; flags == 0 means this
 * node is fully healthy and could host it */
2313 if (ips->ips[j].pnn == -1 &&
2314 nodemap->nodes[pnn].flags == 0) {
2315 DEBUG(DEBUG_WARNING,
2316 ("Unassigned IP %s can be served by this node\n",
2317 ctdb_addr_to_str(&ips->ips[j].addr)));
2318 need_takeover_run = true;
2324 if (!ctdb->do_checkpublicip) {
2328 /* Validate the IP addresses that this node has on network
2329 * interfaces. If there is an inconsistency between reality
2330 * and the state expected by CTDB then try to fix it by
2331 * triggering an IP reallocation or releasing extraneous IP
2334 /* Read *known* IPs from local node */
2335 ret = ctdb_ctrl_get_public_ips_flags(
2336 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2338 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2339 talloc_free(mem_ctx);
2343 for (j=0; j<ips->num; j++) {
2344 if (ips->ips[j].pnn == pnn) {
/* CTDB thinks we hold this IP: verify it is really configured */
2345 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2347 ("Assigned IP %s not on an interface\n",
2348 ctdb_addr_to_str(&ips->ips[j].addr)));
2349 need_takeover_run = true;
/* CTDB thinks another node holds it: verify we do not */
2352 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2354 ("IP %s incorrectly on an interface\n",
2355 ctdb_addr_to_str(&ips->ips[j].addr)));
2356 need_takeover_run = true;
2362 if (need_takeover_run) {
2363 struct ctdb_srvid_message rd;
2366 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2371 data.dptr = (uint8_t *)&rd;
2372 data.dsize = sizeof(rd);
2374 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2377 ("Failed to send takeover run request\n"));
2380 talloc_free(mem_ctx);
/* Async control callback for CTDB_CONTROL_GET_NODEMAP: stash the replying
 * node's nodemap (ownership stolen onto the result array) indexed by its
 * pnn, after bounds-checking the pnn against ctdb->num_nodes. */
2385 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2387 struct ctdb_node_map_old **remote_nodemaps = callback_data;
2389 if (node_pnn >= ctdb->num_nodes) {
2390 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2394 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Collect the nodemap from every active node via an async broadcast of
 * CTDB_CONTROL_GET_NODEMAP; results land in remote_nodemaps[] (indexed by
 * pnn) through async_getnodemap_callback().
 * NOTE(review): the success return is elided in this excerpt; on failure
 * the error path logs and presumably returns non-zero — confirm against
 * the full source. */
2398 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2399 struct ctdb_node_map_old *nodemap,
2400 struct ctdb_node_map_old **remote_nodemaps)
2404 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2405 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2407 CONTROL_TIMEOUT(), false, tdb_null,
2408 async_getnodemap_callback,
2410 remote_nodemaps) != 0) {
2411 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Sanity-check the current recmaster and force a new election when it is
 * invalid: unknown (startup), lacking CTDB_CAP_RECMASTER while we have it,
 * deleted from the nodemap, disconnected, or inactive by its own account.
 * NOTE(review): the boolean returns after each force_election() call and
 * some guard braces are elided in this excerpt; presumably true means
 * "recmaster is valid, continue" — confirm against the full source. */
2419 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2420 TALLOC_CTX *mem_ctx)
2422 struct ctdb_context *ctdb = rec->ctdb;
2423 uint32_t pnn = ctdb_get_pnn(ctdb);
2424 struct ctdb_node_map_old *nodemap = rec->nodemap;
2425 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2428 /* When recovery daemon is started, recmaster is set to
2429 * "unknown" so it knows to start an election.
2431 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2433 ("Initial recovery master set - forcing election\n"));
2434 force_election(rec, pnn, nodemap);
2439 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2440 * but we have, then force an election and try to become the new
2443 if (!ctdb_node_has_capabilities(rec->caps,
2445 CTDB_CAP_RECMASTER) &&
2446 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2447 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2449 (" Current recmaster node %u does not have CAP_RECMASTER,"
2450 " but we (node %u) have - force an election\n",
2451 rec->recmaster, pnn));
2452 force_election(rec, pnn, nodemap);
2456 /* Verify that the master node has not been deleted. This
2457 * should not happen because a node should always be shutdown
2458 * before being deleted, causing a new master to be elected
2459 * before now. However, if something strange has happened
2460 * then checking here will ensure we don't index beyond the
2461 * end of the nodemap array. */
2462 if (rec->recmaster >= nodemap->num) {
2464 ("Recmaster node %u has been deleted. Force election\n",
2466 force_election(rec, pnn, nodemap);
2470 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2471 if (nodemap->nodes[rec->recmaster].flags &
2472 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2474 ("Recmaster node %u is disconnected/deleted. Force election\n",
2476 force_election(rec, pnn, nodemap);
2480 /* get nodemap from the recovery master to check if it is inactive */
2481 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2482 mem_ctx, &recmaster_nodemap);
2486 " Unable to get nodemap from recovery master %u\n",
2488 /* No election, just error */
/* recmaster considers itself inactive while we (an active node) still
 * follow it: force an election */
2493 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2494 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2496 ("Recmaster node %u is inactive. Force election\n",
2499 * update our nodemap to carry the recmaster's notion of
2500 * its own flags, so that we don't keep freezing the
2501 * inactive recmaster node...
2503 nodemap->nodes[rec->recmaster].flags =
2504 recmaster_nodemap->nodes[rec->recmaster].flags;
2505 force_election(rec, pnn, nodemap);
2512 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2513 TALLOC_CTX *mem_ctx)
2516 struct ctdb_node_map_old *nodemap=NULL;
2517 struct ctdb_node_map_old **remote_nodemaps=NULL;
2518 struct ctdb_vnn_map *vnnmap=NULL;
2519 struct ctdb_vnn_map *remote_vnnmap=NULL;
2520 uint32_t num_lmasters;
2521 int32_t debug_level;
2526 /* verify that the main daemon is still running */
2527 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2528 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2532 /* ping the local daemon to tell it we are alive */
2533 ctdb_ctrl_recd_ping(ctdb);
2535 if (rec->election_timeout) {
2536 /* an election is in progress */
2540 /* read the debug level from the parent and update locally */
2541 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2543 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2546 DEBUGLEVEL = debug_level;
2548 /* get relevant tunables */
2549 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2551 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2556 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2557 CTDB_CURRENT_NODE, &ctdb->runstate);
2559 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2563 pnn = ctdb_get_pnn(ctdb);
2566 TALLOC_FREE(rec->nodemap);
2567 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2569 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2572 nodemap = rec->nodemap;
2574 /* remember our own node flags */
2575 rec->node_flags = nodemap->nodes[pnn].flags;
2577 ban_misbehaving_nodes(rec, &self_ban);
2579 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2583 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2584 also frozen and that the recmode is set to active.
2586 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2587 /* If this node has become inactive then we want to
2588 * reduce the chances of it taking over the recovery
2589 * master role when it becomes active again. This
2590 * helps to stabilise the recovery master role so that
2591 * it stays on the most stable node.
2593 rec->priority_time = timeval_current();
2595 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2597 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2599 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2600 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2602 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2604 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2609 if (! rec->frozen_on_inactive) {
2610 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2614 (__location__ " Failed to freeze node "
2615 "in STOPPED or BANNED state\n"));
2619 rec->frozen_on_inactive = true;
2622 /* If this node is stopped or banned then it is not the recovery
2623 * master, so don't do anything. This prevents stopped or banned
2624 * node from starting election and sending unnecessary controls.
2629 rec->frozen_on_inactive = false;
2631 /* Retrieve capabilities from all connected nodes */
2632 ret = update_capabilities(rec, nodemap);
2634 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2638 if (! validate_recovery_master(rec, mem_ctx)) {
2642 /* Check if an IP takeover run is needed and trigger one if
2644 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2646 /* if we are not the recmaster then we do not need to check
2647 if recovery is needed
2649 if (pnn != rec->recmaster) {
2654 /* ensure our local copies of flags are right */
2655 ret = update_local_flags(rec, nodemap);
2657 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2661 if (ctdb->num_nodes != nodemap->num) {
2662 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2663 ctdb_load_nodes_file(ctdb);
2667 /* verify that all active nodes agree that we are the recmaster */
2668 switch (verify_recmaster(rec, nodemap, pnn)) {
2669 case MONITOR_RECOVERY_NEEDED:
2670 /* can not happen */
2672 case MONITOR_ELECTION_NEEDED:
2673 force_election(rec, pnn, nodemap);
2677 case MONITOR_FAILED:
2682 /* get the vnnmap */
2683 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2685 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2689 if (rec->need_recovery) {
2690 /* a previous recovery didn't finish */
2691 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2695 /* verify that all active nodes are in normal mode
2696 and not in recovery mode
2698 switch (verify_recmode(ctdb, nodemap)) {
2699 case MONITOR_RECOVERY_NEEDED:
2700 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2702 case MONITOR_FAILED:
2704 case MONITOR_ELECTION_NEEDED:
2705 /* can not happen */
2711 if (ctdb->recovery_lock != NULL) {
2712 /* We must already hold the recovery lock */
2713 if (!ctdb_recovery_have_lock(rec)) {
2714 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2715 ctdb_set_culprit(rec, ctdb->pnn);
2716 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2722 /* If recoveries are disabled then there is no use doing any
2723 * nodemap or flags checks. Recoveries might be disabled due
2724 * to "reloadnodes", so doing these checks might cause an
2725 * unnecessary recovery. */
2726 if (ctdb_op_is_disabled(rec->recovery)) {
2727 goto takeover_run_checks;
2730 /* get the nodemap for all active remote nodes
2732 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2733 if (remote_nodemaps == NULL) {
2734 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2737 for(i=0; i<nodemap->num; i++) {
2738 remote_nodemaps[i] = NULL;
2740 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2741 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2745 /* verify that all other nodes have the same nodemap as we have
2747 for (j=0; j<nodemap->num; j++) {
2748 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2752 if (remote_nodemaps[j] == NULL) {
2753 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2754 ctdb_set_culprit(rec, j);
2759 /* if the nodes disagree on how many nodes there are
2760 then this is a good reason to try recovery
2762 if (remote_nodemaps[j]->num != nodemap->num) {
2763 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2764 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2765 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2766 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2770 /* if the nodes disagree on which nodes exist and are
2771 active, then that is also a good reason to do recovery
2773 for (i=0;i<nodemap->num;i++) {
2774 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2775 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2776 nodemap->nodes[j].pnn, i,
2777 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2778 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2779 do_recovery(rec, mem_ctx, pnn, nodemap,
2787 * Update node flags obtained from each active node. This ensure we have
2788 * up-to-date information for all the nodes.
2790 for (j=0; j<nodemap->num; j++) {
2791 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2794 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2797 for (j=0; j<nodemap->num; j++) {
2798 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2802 /* verify the flags are consistent
2804 for (i=0; i<nodemap->num; i++) {
2805 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2809 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2810 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2811 nodemap->nodes[j].pnn,
2812 nodemap->nodes[i].pnn,
2813 remote_nodemaps[j]->nodes[i].flags,
2814 nodemap->nodes[i].flags));
2816 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2817 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2818 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2819 do_recovery(rec, mem_ctx, pnn, nodemap,
2823 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2824 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2825 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2826 do_recovery(rec, mem_ctx, pnn, nodemap,
2835 /* count how many active nodes there are */
2837 for (i=0; i<nodemap->num; i++) {
2838 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2839 if (ctdb_node_has_capabilities(rec->caps,
2840 ctdb->nodes[i]->pnn,
2841 CTDB_CAP_LMASTER)) {
2848 /* There must be the same number of lmasters in the vnn map as
2849 * there are active nodes with the lmaster capability... or
2852 if (vnnmap->size != num_lmasters) {
2853 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2854 vnnmap->size, num_lmasters));
2855 ctdb_set_culprit(rec, ctdb->pnn);
2856 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2860 /* verify that all active nodes in the nodemap also exist in
2863 for (j=0; j<nodemap->num; j++) {
2864 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2867 if (nodemap->nodes[j].pnn == pnn) {
2871 for (i=0; i<vnnmap->size; i++) {
2872 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2876 if (i == vnnmap->size) {
2877 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2878 nodemap->nodes[j].pnn));
2879 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2880 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2886 /* verify that all other nodes have the same vnnmap
2887 and are from the same generation
2889 for (j=0; j<nodemap->num; j++) {
2890 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2893 if (nodemap->nodes[j].pnn == pnn) {
2897 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2898 mem_ctx, &remote_vnnmap);
2900 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2901 nodemap->nodes[j].pnn));
2905 /* verify the vnnmap generation is the same */
2906 if (vnnmap->generation != remote_vnnmap->generation) {
2907 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2908 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2909 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2910 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2914 /* verify the vnnmap size is the same */
2915 if (vnnmap->size != remote_vnnmap->size) {
2916 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2917 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2918 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2919 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2923 /* verify the vnnmap is the same */
2924 for (i=0;i<vnnmap->size;i++) {
2925 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2926 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2927 nodemap->nodes[j].pnn));
2928 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2929 do_recovery(rec, mem_ctx, pnn, nodemap,
2936 /* FIXME: Add remote public IP checking to ensure that nodes
2937 * have the IP addresses that are allocated to them. */
2939 takeover_run_checks:
2941 /* If there are IP takeover runs requested or the previous one
2942 * failed then perform one and notify the waiters */
2943 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2944 (rec->reallocate_requests || rec->need_takeover_run)) {
2945 process_ipreallocate_requests(ctdb, rec);
/*
 * SIGTERM handler for the recovery daemon.
 *
 * Recovers the ctdb_recoverd state from private_data, logs the
 * shutdown, and releases the cluster recovery lock so another node
 * can acquire it. (The subsequent process-exit call is on a line
 * elided from this listing — presumably exit/_exit; TODO confirm.)
 */
2949 static void recd_sig_term_handler(struct tevent_context *ev,
2950 struct tevent_signal *se, int signum,
2951 int count, void *dont_care,
2954 struct ctdb_recoverd *rec = talloc_get_type_abort(
2955 private_data, struct ctdb_recoverd);
2957 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
/* Drop the recovery lock before dying so it is not left held */
2958 ctdb_recovery_unlock(rec);
2964 the main monitoring loop
/*
 * Set up the recovery daemon's long-lived state and event handlers,
 * then loop forever: each pass allocates a fresh talloc context,
 * runs main_loop(), frees the context, and sleeps out the remainder
 * of the recover_interval tunable. Never returns under normal
 * operation.
 */
2966 static void monitor_cluster(struct ctdb_context *ctdb)
2968 struct tevent_signal *se;
2969 struct ctdb_recoverd *rec;
2971 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
/* Per-daemon recovery state; zeroed, parented on ctdb */
2973 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2974 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2977 rec->recmaster = CTDB_UNKNOWN_PNN;
2978 rec->recovery_lock_handle = NULL;
/* Named operation trackers used by the disable/enable SRVID handlers */
2980 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
2981 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
2983 rec->recovery = ctdb_op_init(rec, "recoveries");
2984 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
2986 rec->priority_time = timeval_current();
2987 rec->frozen_on_inactive = false;
/* Graceful shutdown: releases the recovery lock (see handler above) */
2989 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
2990 recd_sig_term_handler, rec);
2992 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
2996 /* register a message port for sending memory dumps */
2997 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2999 /* when a node is assigned banning credits */
3000 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3001 banning_handler, rec);
3003 /* register a message port for recovery elections */
3004 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3006 /* when nodes are disabled/enabled */
3007 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3009 /* when we are asked to push out a flag change */
3010 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3012 /* register a message port for vacuum fetch */
3013 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3015 /* register a message port for reloadnodes */
3016 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3018 /* register a message port for performing a takeover run */
3019 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3021 /* register a message port for disabling the ip check for a short while */
3022 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3024 /* register a message port for forcing a rebalance of a node next
3026 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3028 /* Register a message port for disabling takeover runs */
3029 ctdb_client_set_message_handler(ctdb,
3030 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3031 disable_takeover_runs_handler, rec);
3033 /* Register a message port for disabling recoveries */
3034 ctdb_client_set_message_handler(ctdb,
3035 CTDB_SRVID_DISABLE_RECOVERIES,
3036 disable_recoveries_handler, rec);
3038 /* register a message port for detaching database */
3039 ctdb_client_set_message_handler(ctdb,
3040 CTDB_SRVID_DETACH_DATABASE,
3041 detach_database_handler, rec);
/* Main loop body: one short-lived talloc context per iteration so
 * everything main_loop() allocates is reclaimed each pass */
3044 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3045 struct timeval start;
3049 DEBUG(DEBUG_CRIT,(__location__
3050 " Failed to create temp context\n"));
3054 start = timeval_current();
3055 main_loop(ctdb, rec, mem_ctx);
3056 talloc_free(mem_ctx);
3058 /* we only check for recovery once every second */
3059 elapsed = timeval_elapsed(&start);
/* Throttle: sleep out the rest of recover_interval if the pass was fast */
3060 if (elapsed < ctdb->tunable.recover_interval) {
3061 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3068 event handler for when the main ctdbd dies
/*
 * tevent fd handler on the pipe from the parent ctdbd: the fd only
 * becomes readable/closed when the parent goes away, so log and
 * terminate. (The exit call itself is on a line elided from this
 * listing — TODO confirm.)
 */
3070 static void ctdb_recoverd_parent(struct tevent_context *ev,
3071 struct tevent_fd *fde,
3072 uint16_t flags, void *private_data)
3074 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3079 called regularly to verify that the recovery daemon is still running
/*
 * Periodic (30s) liveness check run in the main ctdbd: probe the
 * recoverd child with signal 0; if it is gone, schedule an immediate
 * ctdb_restart_recd(), then re-arm this timer.
 */
3081 static void ctdb_check_recd(struct tevent_context *ev,
3082 struct tevent_timer *te,
3083 struct timeval yt, void *p)
3085 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* Signal 0 = existence probe only; non-zero means the child is gone */
3087 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3088 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
/* timeval_zero(): restart as soon as the event loop runs */
3090 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3091 ctdb_restart_recd, ctdb);
/* Re-arm ourselves; parented on recd_ctx so stopping recoverd cancels it */
3096 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3097 timeval_current_ofs(30, 0),
3098 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap exited children with
 * non-blocking waitpid(). ECHILD (no children left) is expected and
 * not logged as an error; other waitpid failures are. The loop
 * structure around waitpid is elided from this listing.
 */
3101 static void recd_sig_child_handler(struct tevent_context *ev,
3102 struct tevent_signal *se, int signum,
3103 int count, void *dont_care,
3106 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
/* WNOHANG: never block inside a signal-driven event handler */
3111 pid = waitpid(-1, &status, WNOHANG);
3113 if (errno != ECHILD) {
3114 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3119 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3125 startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon.
 *
 * Parent path: record the child pid, rebuild recd_ctx and arm the
 * 30-second ctdb_check_recd liveness timer, then return.
 * Child path: re-seed random, re-init logging as "ctdb-recoverd",
 * switch from server to client mode, watch the pipe to the parent
 * (ctdb_recoverd_parent fires if the parent dies), install the
 * SIGCHLD reaper, and enter monitor_cluster() — which should never
 * return. Return values/error paths are on lines elided here.
 */
3127 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3130 struct tevent_signal *se;
3131 struct tevent_fd *fde;
/* Pipe lets the child detect parent death via fd readability */
3134 if (pipe(fd) != 0) {
3138 ctdb->recoverd_pid = ctdb_fork(ctdb);
3139 if (ctdb->recoverd_pid == -1) {
/* Parent: non-zero pid. Fresh talloc context scopes the monitor timer */
3143 if (ctdb->recoverd_pid != 0) {
3144 talloc_free(ctdb->recd_ctx);
3145 ctdb->recd_ctx = talloc_new(ctdb);
3146 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3149 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3150 timeval_current_ofs(30, 0),
3151 ctdb_check_recd, ctdb);
/* Child: give it its own random sequence, distinct from the parent's */
3157 srandom(getpid() ^ time(NULL));
3159 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
/* NOTE(review): "ctdb_recovered" looks like a typo for
 * "ctdb_recoverd" (the name used everywhere else) — verify before
 * changing, as it alters the visible process comment. */
3164 prctl_set_comment("ctdb_recovered");
3165 if (switch_from_server_to_client(ctdb) != 0) {
3166 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3170 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* Read end becomes readable (EOF) when the parent exits */
3172 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3173 ctdb_recoverd_parent, &fd[0]);
3174 tevent_fd_set_auto_close(fde);
3176 /* set up a handler to pick up sigchld */
3177 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3178 recd_sig_child_handler, ctdb);
3180 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3184 monitor_cluster(ctdb);
/* monitor_cluster() loops forever; reaching here is abnormal */
3186 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3191 shutdown the recovery daemon
/*
 * Stop the recovery daemon child: no-op if none was started
 * (recoverd_pid == 0), otherwise send SIGTERM (handled by
 * recd_sig_term_handler, which releases the recovery lock) and tear
 * down the monitor timer context and ping counter.
 */
3193 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3195 if (ctdb->recoverd_pid == 0) {
3199 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3200 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
/* Freeing recd_ctx cancels the pending ctdb_check_recd timer */
3202 TALLOC_FREE(ctdb->recd_ctx);
3203 TALLOC_FREE(ctdb->recd_ping_count);
/*
 * Timer callback (scheduled by ctdb_check_recd when the recoverd
 * child has died): stop any remaining recoverd state, then fork a
 * fresh recovery daemon. Closing brace lies past the end of this
 * listing.
 */
3206 static void ctdb_restart_recd(struct tevent_context *ev,
3207 struct tevent_timer *te,
3208 struct timeval t, void *private_data)
3210 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3212 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
/* Stop first so stale pid/timers are cleared before the new fork */
3213 ctdb_stop_recoverd(ctdb);
3214 ctdb_start_recoverd(ctdb);