4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
49 /* List of SRVID requests that need to be processed */
/* Doubly-linked list node (used with DLIST_ADD below); owns its request */
51 struct srvid_list *next, *prev;
52 struct ctdb_srvid_message *request;
/* Container holding the head of a srvid_list queue */
55 struct srvid_requests {
56 struct srvid_list *requests;
/*
 * Send a reply for an SRVID request back to the pnn/srvid recorded in
 * the request.  A request with srvid == 0 means no reply is wanted.
 */
59 static void srvid_request_reply(struct ctdb_context *ctdb,
60 struct ctdb_srvid_message *request,
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request->srvid == 0) {
69 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
71 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
75 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request->pnn,
77 (unsigned long long)request->srvid));
/*
 * Reply to every queued SRVID request with the given result, then free
 * the whole queue (freeing the container frees the list entries too,
 * since they are talloc children of it).
 */
83 static void srvid_requests_reply(struct ctdb_context *ctdb,
84 struct srvid_requests **requests,
89 if (*requests == NULL) {
93 for (r = (*requests)->requests; r != NULL; r = r->next) {
94 srvid_request_reply(ctdb, r->request, result);
97 /* Free the list structure... */
98 TALLOC_FREE(*requests);
/*
 * Queue an SRVID request for a deferred reply.  The queue container is
 * allocated on first use.  On allocation failure the fail path at the
 * bottom sends an immediate error reply instead of queueing.
 */
101 static void srvid_request_add(struct ctdb_context *ctdb,
102 struct srvid_requests **requests,
103 struct ctdb_srvid_message *request)
105 struct srvid_list *t;
109 if (*requests == NULL) {
110 *requests = talloc_zero(ctdb, struct srvid_requests);
111 if (*requests == NULL) {
116 t = talloc_zero(*requests, struct srvid_list);
118 /* If *requests was just allocated above then free it */
119 if ((*requests)->requests == NULL) {
120 TALLOC_FREE(*requests);
/* Take ownership so the request lives as long as the queue entry */
125 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
126 DLIST_ADD((*requests)->requests, t);
131 /* Failed to add the request to the list. Send a fail. */
132 DEBUG(DEBUG_ERR, (__location__
133 " Out of memory, failed to queue SRVID request\n"));
135 result.dsize = sizeof(ret);
136 result.dptr = (uint8_t *)&ret;
137 srvid_request_reply(ctdb, request, result);
140 /* An abstraction to allow an operation (takeover runs, recoveries,
141 * ...) to be disabled for a given timeout */
142 struct ctdb_op_state {
/* Non-NULL while disabled; timer expiry re-enables the operation */
143 struct tevent_timer *timer;
/* Allocate and initialise an operation-state tracker named "name" */
148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
150 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
153 state->in_progress = false;
/* The operation is disabled exactly while the re-enable timer is pending */
160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
162 return state->timer != NULL;
/* Mark the operation as started; refuse (log + fail) if disabled */
165 static bool ctdb_op_begin(struct ctdb_op_state *state)
167 if (ctdb_op_is_disabled(state)) {
169 ("Unable to begin - %s are disabled\n", state->name));
173 state->in_progress = true;
/* Clear the in-progress flag.  NOTE(review): the assignment is the
 * return expression, so this always returns false - confirm callers
 * don't depend on the return value. */
177 static bool ctdb_op_end(struct ctdb_op_state *state)
179 return state->in_progress = false;
/* Query whether the operation is currently running */
182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
184 return state->in_progress;
/* Re-enable the operation by cancelling the disable timer */
187 static void ctdb_op_enable(struct ctdb_op_state *state)
189 TALLOC_FREE(state->timer);
/* tevent timer callback: the disable period expired, re-enable */
192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
193 struct tevent_timer *te,
194 struct timeval yt, void *p)
196 struct ctdb_op_state *state =
197 talloc_get_type(p, struct ctdb_op_state);
199 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
200 ctdb_op_enable(state);
/*
 * Disable an operation for "timeout" seconds.  NOTE(review): the
 * leading branch logs "Reenabling" and calls ctdb_op_enable() -
 * presumably the elided condition is timeout == 0 meaning re-enable;
 * confirm against upstream.  Fails if the operation is in progress.
 */
203 static int ctdb_op_disable(struct ctdb_op_state *state,
204 struct tevent_context *ev,
208 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
209 ctdb_op_enable(state);
213 if (state->in_progress) {
215 ("Unable to disable %s - in progress\n", state->name));
219 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
220 state->name, timeout));
222 /* Clear any old timers */
223 talloc_free(state->timer);
225 /* Arrange for the timeout to occur */
226 state->timer = tevent_add_timer(ev, state,
227 timeval_current_ofs(timeout, 0),
228 ctdb_op_timeout_handler, state);
229 if (state->timer == NULL) {
230 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
/* Per-node misbehaviour accounting; credits are added by
 * ctdb_set_culprit_count() and checked by ban_misbehaving_nodes() */
237 struct ctdb_banning_state {
239 struct timeval last_reported_time;
/* Opaque here; defined further down */
242 struct ctdb_recovery_lock_handle;
245 private state of recovery daemon
247 struct ctdb_recoverd {
248 struct ctdb_context *ctdb;
250 uint32_t last_culprit_node;
251 struct ctdb_node_map_old *nodemap;
/* Used as the election tiebreaker (see ctdb_election_data()) */
252 struct timeval priority_time;
253 bool need_takeover_run;
256 struct tevent_timer *send_election_te;
/* Non-NULL while an election is in progress */
257 struct tevent_timer *election_timeout;
258 struct srvid_requests *reallocate_requests;
/* Disable/in-progress state for takeover runs and recoveries */
259 struct ctdb_op_state *takeover_run;
260 struct ctdb_op_state *recovery;
261 struct ctdb_iface_list_old *ifaces;
/* Talloc array of PNNs to rebalance on the next takeover run */
262 uint32_t *force_rebalance_nodes;
263 struct ctdb_node_capabilities *caps;
264 bool frozen_on_inactive;
/* Non-NULL while the recovery lock is held or being taken */
265 struct ctdb_recovery_lock_handle *recovery_lock_handle;
/* Timeouts for client controls / monitoring, taken from the tunables */
268 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
269 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration; defined later in the file */
271 static void ctdb_restart_recd(struct tevent_context *ev,
272 struct tevent_timer *te, struct timeval t,
/* Ban a node for ban_time seconds via the SET_BAN control */
276 ban a node for a period of time
278 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
281 struct ctdb_context *ctdb = rec->ctdb;
282 struct ctdb_ban_state bantime;
284 if (!ctdb_validate_pnn(ctdb, pnn)) {
285 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
289 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
292 bantime.time = ban_time;
294 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
296 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Outcome of one monitoring pass of the cluster */
302 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/*
 * Record "count" banning credits against a culprit node.  Old credits
 * are forgiven if the node behaved for recovery_grace_period seconds.
 * NOTE(review): the bounds check uses '>' so culprit == num_nodes
 * passes and nodes[culprit] below would index one past the array -
 * confirm whether num_nodes here is an inclusive bound upstream.
 */
306 remember the trouble maker
308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
310 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
311 struct ctdb_banning_state *ban_state;
313 if (culprit > ctdb->num_nodes) {
314 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
318 /* If we are banned or stopped, do not set other nodes as culprits */
319 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
320 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
/* Lazily allocate the per-node banning state on first use */
324 if (ctdb->nodes[culprit]->ban_state == NULL) {
325 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
326 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
330 ban_state = ctdb->nodes[culprit]->ban_state;
331 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
332 /* this was the first time in a long while this node
333 misbehaved so we will forgive any old transgressions.
335 ban_state->count = 0;
338 ban_state->count += count;
339 ban_state->last_reported_time = timeval_current();
340 rec->last_culprit_node = culprit;
/* Convenience wrapper: add a single banning credit to the culprit */
344 remember the trouble maker
346 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
348 ctdb_set_culprit_count(rec, culprit, 1);
/*
 * Retrieve capabilities from all connected nodes, cache them in
 * rec->caps and refresh this node's own ctdb->capabilities entry.
 */
352 Retrieve capabilities from all connected nodes
354 static int update_capabilities(struct ctdb_recoverd *rec,
355 struct ctdb_node_map_old *nodemap)
359 struct ctdb_node_capabilities *caps;
360 struct ctdb_context *ctdb = rec->ctdb;
362 tmp_ctx = talloc_new(rec);
363 CTDB_NO_MEMORY(ctdb, tmp_ctx);
365 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
366 CONTROL_TIMEOUT(), nodemap);
370 (__location__ " Failed to get node capabilities\n"));
371 talloc_free(tmp_ctx);
/* Look up our own entry so ctdb->capabilities stays current */
375 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
379 " Capabilities don't include current node.\n"));
380 talloc_free(tmp_ctx);
383 ctdb->capabilities = *capp;
/* Replace the cached capabilities, taking ownership from tmp_ctx */
385 TALLOC_FREE(rec->caps);
386 rec->caps = talloc_steal(rec, caps);
388 talloc_free(tmp_ctx);
/*
 * Broadcast a SET_RECMODE control to all active nodes to switch the
 * cluster recovery mode.
 */
393 change recovery mode on all nodes
395 static int set_recovery_mode(struct ctdb_context *ctdb,
396 struct ctdb_recoverd *rec,
397 struct ctdb_node_map_old *nodemap,
404 tmp_ctx = talloc_new(ctdb);
405 CTDB_NO_MEMORY(ctdb, tmp_ctx);
407 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
409 data.dsize = sizeof(uint32_t);
410 data.dptr = (unsigned char *)&rec_mode;
412 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
418 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
419 talloc_free(tmp_ctx);
423 talloc_free(tmp_ctx);
/*
 * Push a flags change for node "pnn" to all connected nodes via a
 * MODIFY_FLAGS control.  Fails if pnn is not in the cached nodemap.
 */
428 * Update flags on all connected nodes
430 static int update_flags_on_all_nodes(struct ctdb_recoverd *rec,
434 struct ctdb_context *ctdb = rec->ctdb;
435 struct timeval timeout = CONTROL_TIMEOUT();
437 struct ctdb_node_map_old *nodemap=NULL;
438 struct ctdb_node_flag_change c;
439 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
444 nodemap = rec->nodemap;
/* Locate pnn's slot in the nodemap */
446 for (i = 0; i < nodemap->num; i++) {
447 if (pnn == nodemap->nodes[i].pnn) {
451 if (i >= nodemap->num) {
452 DBG_ERR("Nodemap does not contain node %d\n", pnn);
453 talloc_free(tmp_ctx);
458 c.old_flags = nodemap->nodes[i].flags;
461 data.dsize = sizeof(c);
462 data.dptr = (unsigned char *)&c;
464 /* send the flags update to all connected nodes */
465 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
467 ret = ctdb_client_async_control(ctdb,
468 CTDB_CONTROL_MODIFY_FLAGS,
478 DBG_ERR("Unable to update flags on remote nodes\n");
479 talloc_free(tmp_ctx);
483 talloc_free(tmp_ctx);
/* tevent timer callback: flags the wait in ctdb_wait_timeout() as done */
488 called when ctdb_wait_timeout should finish
490 static void ctdb_wait_handler(struct tevent_context *ev,
491 struct tevent_timer *te,
492 struct timeval yt, void *p)
494 uint32_t *timed_out = (uint32_t *)p;
/*
 * Block for (fractional) "secs" seconds while still servicing tevent
 * events; the fractional part is converted to microseconds.
 */
499 wait for a given number of seconds
501 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
503 uint32_t timed_out = 0;
504 time_t usecs = (secs - (time_t)secs) * 1000000;
505 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
506 ctdb_wait_handler, &timed_out);
508 tevent_loop_once(ctdb->ev);
/* tevent timer callback: the election window closed; clearing the
 * timer pointer lets ctdb_wait_election() return */
513 called when an election times out (ends)
515 static void ctdb_election_timeout(struct tevent_context *ev,
516 struct tevent_timer *te,
517 struct timeval t, void *p)
519 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
520 rec->election_timeout = NULL;
523 D_WARNING("Election period ended, master=%u\n", rec->recmaster);
/* Pump the event loop until the election timeout fires */
528 wait for an election to finish. It finished election_timeout seconds after
529 the last election packet is received
531 static void ctdb_wait_election(struct ctdb_recoverd *rec)
533 struct ctdb_context *ctdb = rec->ctdb;
534 while (rec->election_timeout) {
535 tevent_loop_once(ctdb->ev);
/*
 * Reconcile node flags: compare the local nodemap against the flags
 * reported by each connected remote node; where they differ, push an
 * update to all nodes and adopt the remote value locally.
 */
540 * Update local flags from all remote connected nodes and push out
541 * flags changes to all nodes. This is only run by the recovery
544 static int update_flags(struct ctdb_recoverd *rec,
545 struct ctdb_node_map_old *nodemap,
546 struct ctdb_node_map_old **remote_nodemaps)
549 struct ctdb_context *ctdb = rec->ctdb;
550 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
552 /* Check flags from remote nodes */
553 for (j=0; j<nodemap->num; j++) {
554 struct ctdb_node_map_old *remote_nodemap=NULL;
555 uint32_t local_flags = nodemap->nodes[j].flags;
556 uint32_t remote_flags;
/* Skip disconnected nodes and ourselves */
559 if (local_flags & NODE_FLAGS_DISCONNECTED) {
562 if (nodemap->nodes[j].pnn == ctdb->pnn) {
566 remote_nodemap = remote_nodemaps[j];
567 remote_flags = remote_nodemap->nodes[j].flags;
569 if (local_flags != remote_flags) {
570 ret = update_flags_on_all_nodes(rec,
571 nodemap->nodes[j].pnn,
575 "Unable to update flags on remote nodes\n");
576 talloc_free(mem_ctx);
581 * Update the local copy of the flags in the
584 D_NOTICE("Remote node %u had flags 0x%x, "
585 "local had 0x%x - updating local\n",
586 nodemap->nodes[j].pnn,
589 nodemap->nodes[j].flags = remote_flags;
592 talloc_free(mem_ctx);
597 /* Create a new random generation id.
598 The generation id can not be the INVALID_GENERATION id
600 static uint32_t new_generation(void)
/* Loop (elided) keeps drawing until the value is valid */
605 generation = random();
607 if (generation != INVALID_GENERATION) {
/* We hold (or are taking) the recovery lock iff a handle exists */
615 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
617 return (rec->recovery_lock_handle != NULL);
/* State for an in-flight or held recovery lock: the underlying cluster
 * mutex handle plus a back-pointer to the recovery daemon state */
620 struct ctdb_recovery_lock_handle {
624 struct ctdb_cluster_mutex_handle *h;
625 struct ctdb_recoverd *rec;
/*
 * Completion callback from the cluster mutex helper.  The status byte
 * '0' means the lock was taken; other codes map to contention /
 * timeout / unknown error (exact codes elided here - the branches
 * below log each case).  On an unknown error this node bans itself.
 */
628 static void take_reclock_handler(char status,
632 struct ctdb_recovery_lock_handle *s =
633 (struct ctdb_recovery_lock_handle *) private_data;
635 s->locked = (status == '0') ;
638 * If unsuccessful then ensure the process has exited and that
639 * the file descriptor event handler has been cancelled
647 s->latency = latency;
651 D_ERR("Unable to take recovery lock - contention\n");
655 D_ERR("Unable to take recovery lock - timeout\n");
659 D_ERR("Unable to take recover lock - unknown error\n");
662 struct ctdb_recoverd *rec = s->rec;
663 struct ctdb_context *ctdb = rec->ctdb;
664 uint32_t pnn = ctdb_get_pnn(ctdb);
666 D_ERR("Banning this node\n");
669 ctdb->tunable.recovery_ban_period);
/* Forward declaration; used by lost_reclock_handler() below */
676 static void force_election(struct ctdb_recoverd *rec,
678 struct ctdb_node_map_old *nodemap);
/* The mutex helper died, so the recovery lock is gone: drop our
 * handle and trigger a fresh election */
680 static void lost_reclock_handler(void *private_data)
682 struct ctdb_recoverd *rec = talloc_get_type_abort(
683 private_data, struct ctdb_recoverd);
685 D_ERR("Recovery lock helper terminated, triggering an election\n");
686 TALLOC_FREE(rec->recovery_lock_handle);
688 force_election(rec, ctdb_get_pnn(rec->ctdb), rec->nodemap);
/*
 * Synchronously attempt to take the recovery lock: start the cluster
 * mutex helper, then pump the event loop until take_reclock_handler()
 * signals completion.  On success the held latency is reported to the
 * daemon; on failure the handle is freed.
 */
691 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
693 struct ctdb_context *ctdb = rec->ctdb;
694 struct ctdb_cluster_mutex_handle *h;
695 struct ctdb_recovery_lock_handle *s;
697 s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
699 DBG_ERR("Memory allocation error\n");
705 h = ctdb_cluster_mutex(s,
709 take_reclock_handler,
711 lost_reclock_handler,
718 rec->recovery_lock_handle = s;
/* Wait here until the helper reports success or failure */
722 tevent_loop_once(ctdb->ev);
726 TALLOC_FREE(rec->recovery_lock_handle);
730 ctdb_ctrl_report_recd_lock_latency(ctdb,
/* Release the recovery lock; also handles cancelling a lock attempt
 * that is still in flight */
737 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
739 if (rec->recovery_lock_handle == NULL) {
743 if (! rec->recovery_lock_handle->done) {
745 * Taking of recovery lock still in progress. Free
746 * the cluster mutex handle to release it but leave
747 * the recovery lock handle in place to allow taking
748 * of the lock to fail.
750 D_NOTICE("Cancelling recovery lock\n");
751 TALLOC_FREE(rec->recovery_lock_handle->h);
752 rec->recovery_lock_handle->done = true;
753 rec->recovery_lock_handle->locked = false;
757 D_NOTICE("Releasing recovery lock\n");
758 TALLOC_FREE(rec->recovery_lock_handle);
/*
 * Ban any node whose banning credits reached 2*num_nodes, resetting
 * its count afterwards.  *self_ban is set (in elided code) when the
 * banned node is this one, so the caller can abort recovery.
 */
761 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
763 struct ctdb_context *ctdb = rec->ctdb;
765 struct ctdb_banning_state *ban_state;
768 for (i=0; i<ctdb->num_nodes; i++) {
769 if (ctdb->nodes[i]->ban_state == NULL) {
772 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* Below the credit threshold: leave the node alone */
773 if (ban_state->count < 2*ctdb->num_nodes) {
777 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
778 ctdb->nodes[i]->pnn, ban_state->count,
779 ctdb->tunable.recovery_ban_period));
780 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
781 ban_state->count = 0;
783 /* Banning ourself? */
784 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/* State for a running external helper: result pipe, child pid,
 * completion flag and exit result (fields elided here) */
790 struct helper_state {
/* fd event callback: read the helper's integer result from the pipe;
 * a short read is reported as EPIPE */
797 static void helper_handler(struct tevent_context *ev,
798 struct tevent_fd *fde,
799 uint16_t flags, void *private_data)
801 struct helper_state *state = talloc_get_type_abort(
802 private_data, struct helper_state);
805 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
806 if (ret != sizeof(state->result)) {
807 state->result = EPIPE;
/*
 * Run an external helper program ("takeover"/"recovery") and wait for
 * its integer result, delivered over a pipe.  The run is aborted (and
 * the child killed) if this node loses the recmaster role while the
 * helper is running.  The cleanup path closes both pipe ends and
 * kills any still-running child.
 */
813 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
814 const char *prog, const char *arg, const char *type)
816 struct helper_state *state;
817 struct tevent_fd *fde;
/* Remember who was recmaster at the start so we can detect a change */
820 uint32_t recmaster = rec->recmaster;
822 state = talloc_zero(mem_ctx, struct helper_state);
824 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
830 ret = pipe(state->fd);
833 ("Failed to create pipe for %s helper\n", type));
837 set_close_on_exec(state->fd[0]);
/* argv: write-end fd, daemon socket name, optional extra arg */
840 args = talloc_array(state, const char *, nargs);
842 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
846 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
847 if (args[0] == NULL) {
848 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
851 args[1] = rec->ctdb->daemon.name;
855 if (args[2] == NULL) {
859 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
860 if (state->pid == -1) {
862 ("Failed to create child for %s helper\n", type));
871 fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
872 TEVENT_FD_READ, helper_handler, state);
876 tevent_fd_set_auto_close(fde);
878 while (!state->done) {
879 tevent_loop_once(rec->ctdb->ev);
881 /* If recmaster changes, we have lost election */
882 if (recmaster != rec->recmaster) {
883 D_ERR("Recmaster changed to %u, aborting %s\n",
884 rec->recmaster, type);
893 if (state->result != 0) {
897 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/* Cleanup: close pipe ends and reap/kill the child if needed */
902 if (state->fd[0] != -1) {
905 if (state->fd[1] != -1) {
908 if (state->pid != -1) {
909 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/*
 * Run the takeover helper, passing any force-rebalance PNNs as a
 * comma-separated argument.  Honours the failover_disabled config by
 * exporting CTDB_DISABLE_IP_FAILOVER to the helper's environment.
 */
916 static int ctdb_takeover(struct ctdb_recoverd *rec,
917 uint32_t *force_rebalance_nodes)
919 static char prog[PATH_MAX+1] = "";
924 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
925 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
926 "ctdb_takeover_helper")) {
927 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
/* Build "pnn,pnn,..." from the rebalance list */
931 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
932 uint32_t pnn = force_rebalance_nodes[i];
934 arg = talloc_asprintf(rec, "%u", pnn);
936 arg = talloc_asprintf_append(arg, ",%u", pnn);
939 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
944 if (ctdb_config.failover_disabled) {
945 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
947 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
952 return helper_run(rec, rec, prog, arg, "takeover");
/*
 * Perform a takeover (IP failover) run: temporarily disable takeover
 * runs on all other connected nodes, run the takeover helper, then
 * re-enable.  On success the force-rebalance target list is cleared
 * (unless it changed underneath us).  rec->need_takeover_run is set
 * when the run failed so it will be retried.
 */
955 static bool do_takeover_run(struct ctdb_recoverd *rec,
956 struct ctdb_node_map_old *nodemap)
958 uint32_t *nodes = NULL;
959 struct ctdb_disable_message dtr;
962 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
966 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
968 if (ctdb_op_is_in_progress(rec->takeover_run)) {
969 DEBUG(DEBUG_ERR, (__location__
970 " takeover run already in progress \n"));
975 if (!ctdb_op_begin(rec->takeover_run)) {
980 /* Disable IP checks (takeover runs, really) on other nodes
981 * while doing this takeover run. This will stop those other
982 * nodes from triggering takeover runs when think they should
983 * be hosting an IP but it isn't yet on an interface. Don't
984 * wait for replies since a failure here might cause some
985 * noise in the logs but will not actually cause a problem.
988 dtr.srvid = 0; /* No reply */
991 data.dptr = (uint8_t*)&dtr;
992 data.dsize = sizeof(dtr);
994 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
996 /* Disable for 60 seconds. This can be a tunable later if
1000 for (i = 0; i < talloc_array_length(nodes); i++) {
1001 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1002 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1004 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1008 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1010 /* Reenable takeover runs and IP checks on other nodes */
1012 for (i = 0; i < talloc_array_length(nodes); i++) {
1013 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1014 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1016 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1021 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1027 /* Takeover run was successful so clear force rebalance targets */
1028 if (rebalance_nodes == rec->force_rebalance_nodes) {
1029 TALLOC_FREE(rec->force_rebalance_nodes);
1031 DEBUG(DEBUG_WARNING,
1032 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1035 rec->need_takeover_run = !ok;
1037 ctdb_op_end(rec->takeover_run);
1039 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
/*
 * Run the parallel database-recovery helper with a freshly generated
 * generation id as its argument.
 */
1043 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1045 static char prog[PATH_MAX+1] = "";
1048 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1049 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1050 "ctdb_recovery_helper")) {
1051 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1054 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1056 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* Helper needs the state directory exported in its environment */
1060 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1062 return helper_run(rec, mem_ctx, prog, arg, "recovery");
/*
 * Main recovery sequence, run only by the recovery master:
 *  - verify we are still recmaster (elections may have intervened)
 *  - ban misbehaving nodes; abort if we banned ourselves
 *  - take the recovery lock if one is configured
 *  - refresh capabilities and synchronise node flags cluster-wide
 *  - run the parallel database recovery helper, then a takeover run
 *  - broadcast RECONFIGURE, forgive all ban credits, and suppress
 *    further recoveries for rerecovery_timeout seconds
 */
1066 we are the recmaster, and recovery is needed - start a recovery run
1068 static int do_recovery(struct ctdb_recoverd *rec,
1069 TALLOC_CTX *mem_ctx, uint32_t pnn,
1070 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1072 struct ctdb_context *ctdb = rec->ctdb;
1077 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1079 /* Check if the current node is still the recmaster. It's possible that
1080 * re-election has changed the recmaster.
1082 if (pnn != rec->recmaster) {
1084 ("Recovery master changed to %u, aborting recovery\n",
1089 /* if recovery fails, force it again */
1090 rec->need_recovery = true;
1092 if (!ctdb_op_begin(rec->recovery)) {
1096 if (rec->election_timeout) {
1097 /* an election is in progress */
1098 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1102 ban_misbehaving_nodes(rec, &self_ban);
1104 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1108 if (ctdb->recovery_lock != NULL) {
1109 if (ctdb_recovery_have_lock(rec)) {
1110 D_NOTICE("Already holding recovery lock\n");
1114 D_NOTICE("Attempting to take recovery lock (%s)\n",
1115 ctdb->recovery_lock);
1117 ok = ctdb_recovery_lock(rec);
1119 D_ERR("Unable to take recovery lock\n");
/* Lock failure handling: a changed recmaster or first recovery
 * is benign (retry); otherwise ban ourselves */
1121 if (pnn != rec->recmaster) {
1122 D_NOTICE("Recovery master changed to %u,"
1123 " aborting recovery\n",
1125 rec->need_recovery = false;
1129 if (ctdb->runstate ==
1130 CTDB_RUNSTATE_FIRST_RECOVERY) {
1132 * First recovery? Perhaps
1133 * current node does not yet
1134 * know who the recmaster is.
1136 D_ERR("Retrying recovery\n");
1140 D_ERR("Abort recovery, "
1141 "ban this node for %u seconds\n",
1142 ctdb->tunable.recovery_ban_period);
1145 ctdb->tunable.recovery_ban_period);
1148 D_NOTICE("Recovery lock taken successfully\n");
1152 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1154 /* Retrieve capabilities from all connected nodes */
1155 ret = update_capabilities(rec, nodemap);
1157 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1162 update all nodes to have the same flags that we have
1164 for (i=0;i<nodemap->num;i++) {
1165 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1169 ret = update_flags_on_all_nodes(rec,
1170 nodemap->nodes[i].pnn,
1171 nodemap->nodes[i].flags);
1173 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1174 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1176 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1182 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1184 ret = db_recovery_parallel(rec, mem_ctx);
1189 do_takeover_run(rec, nodemap);
1191 /* send a message to all clients telling them that the cluster
1192 has been reconfigured */
1193 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1194 CTDB_SRVID_RECONFIGURE, tdb_null);
1196 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1200 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1202 rec->need_recovery = false;
1203 ctdb_op_end(rec->recovery);
1205 /* we managed to complete a full recovery, make sure to forgive
1206 any past sins by the nodes that could now participate in the
1209 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1210 for (i=0;i<nodemap->num;i++) {
1211 struct ctdb_banning_state *ban_state;
1213 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1217 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1218 if (ban_state == NULL) {
1222 ban_state->count = 0;
1225 /* We just finished a recovery successfully.
1226 We now wait for rerecovery_timeout before we allow
1227 another recovery to take place.
1229 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1230 ctdb_op_disable(rec->recovery, ctdb->ev,
1231 ctdb->tunable.rerecovery_timeout);
/* Failure path (label elided): end the op and report failure */
1235 ctdb_op_end(rec->recovery);
/* Election ballot exchanged between nodes; compared field-by-field
 * in ctdb_election_win() */
1241 elections are won by first checking the number of connected nodes, then
1242 the priority time, then the pnn
1244 struct election_message {
1245 uint32_t num_connected;
1246 struct timeval priority_time;
1248 uint32_t node_flags;
/*
 * Fill in this node's election ballot: pnn, priority time, flags and
 * connected-node count.  A node without the RECMASTER capability
 * deliberately cripples its ballot so it cannot win.
 */
1252 form this nodes election data
1254 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1258 struct ctdb_node_map_old *nodemap;
1259 struct ctdb_context *ctdb = rec->ctdb;
1263 em->pnn = rec->ctdb->pnn;
1264 em->priority_time = rec->priority_time;
1266 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1268 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1272 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1273 em->node_flags = rec->node_flags;
1275 for (i=0;i<nodemap->num;i++) {
1276 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1277 em->num_connected++;
1281 /* we shouldnt try to win this election if we cant be a recmaster */
1282 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1283 em->num_connected = 0;
1284 em->priority_time = timeval_current();
1287 talloc_free(nodemap);
/*
 * Decide whether WE beat the remote ballot "em": capability and
 * banned/stopped checks first, then priority time, then pnn as the
 * final tiebreaker.
 */
1291 see if the given election data wins
1293 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1295 struct election_message myem;
1298 ctdb_election_data(rec, &myem);
1300 /* we cant win if we don't have the recmaster capability */
1301 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1305 /* we cant win if we are banned */
1306 if (rec->node_flags & NODE_FLAGS_BANNED) {
1310 /* we cant win if we are stopped */
1311 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1315 /* we will automatically win if the other node is banned */
1316 if (em->node_flags & NODE_FLAGS_BANNED) {
1320 /* we will automatically win if the other node is stopped */
1321 if (em->node_flags & NODE_FLAGS_STOPPED) {
1325 /* then the longest running node */
1327 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* Final tiebreaker: lower pnn wins */
1331 cmp = (int)myem.pnn - (int)em->pnn;
/*
 * Broadcast our election ballot to all nodes.  We optimistically set
 * ourselves as recmaster on the local node first; losing the election
 * later will overwrite this.
 */
1338 send out an election request
1340 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1343 TDB_DATA election_data;
1344 struct election_message emsg;
1346 struct ctdb_context *ctdb = rec->ctdb;
1348 srvid = CTDB_SRVID_ELECTION;
1350 ctdb_election_data(rec, &emsg);
1352 election_data.dsize = sizeof(struct election_message);
1353 election_data.dptr = (unsigned char *)&emsg;
1356 /* first we assume we will win the election and set
1357 recoverymaster to be ourself on the current node
1359 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1360 CTDB_CURRENT_NODE, pnn);
1362 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1365 rec->recmaster = pnn;
1367 /* send an election message to all active nodes */
1368 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1369 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
/* tevent timer callback: (re)broadcast our election request and clear
 * the pending-send timer */
1373 we think we are winning the election - send a broadcast election request
1375 static void election_send_request(struct tevent_context *ev,
1376 struct tevent_timer *te,
1377 struct timeval t, void *p)
1379 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1382 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1384 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1387 TALLOC_FREE(rec->send_election_te);
/*
 * SRVID message handler: produce a talloc memory dump of the recovery
 * master and send it back to the requesting pnn/srvid.
 */
1391 handler for memory dumps
1393 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1395 struct ctdb_recoverd *rec = talloc_get_type(
1396 private_data, struct ctdb_recoverd);
1397 struct ctdb_context *ctdb = rec->ctdb;
1398 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1401 struct ctdb_srvid_message *rd;
/* The payload must be exactly a return-address message */
1403 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1404 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1405 talloc_free(tmp_ctx);
1408 rd = (struct ctdb_srvid_message *)data.dptr;
1410 dump = talloc_zero(tmp_ctx, TDB_DATA);
1412 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1413 talloc_free(tmp_ctx);
1416 ret = ctdb_dump_memory(ctdb, dump);
1418 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1419 talloc_free(tmp_ctx);
1423 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1425 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1427 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1428 talloc_free(tmp_ctx);
1432 talloc_free(tmp_ctx);
/* SRVID message handler: re-read the nodes file on request */
1436 handler for reload_nodes
1438 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1441 struct ctdb_recoverd *rec = talloc_get_type(
1442 private_data, struct ctdb_recoverd);
1444 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1446 ctdb_load_nodes_file(rec->ctdb);
/*
 * SRVID message handler: add a PNN to the force-rebalance target list
 * for the next takeover run.  Only acted upon by the recovery master.
 * The list is rebuilt (rather than realloc'd) so any timer attached to
 * the old array is cancelled with it.
 */
1450 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1453 struct ctdb_recoverd *rec = talloc_get_type(
1454 private_data, struct ctdb_recoverd);
1455 struct ctdb_context *ctdb = rec->ctdb;
/* Ignore unless we are the recovery master */
1460 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1464 if (data.dsize != sizeof(uint32_t)) {
1465 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1469 pnn = *(uint32_t *)&data.dptr[0];
1471 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1473 /* Copy any existing list of nodes. There's probably some
1474 * sort of realloc variant that will do this but we need to
1475 * make sure that freeing the old array also cancels the timer
1476 * event for the timeout... not sure if realloc will do that.
1478 len = (rec->force_rebalance_nodes != NULL) ?
1479 talloc_array_length(rec->force_rebalance_nodes) :
1482 /* This allows duplicates to be added but they don't cause
1483 * harm. A call to add a duplicate PNN arguably means that
1484 * the timeout should be reset, so this is the simplest
1487 t = talloc_zero_array(rec, uint32_t, len+1);
1488 CTDB_NO_MEMORY_VOID(ctdb, t);
1490 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1494 talloc_free(rec->force_rebalance_nodes);
1496 rec->force_rebalance_nodes = t;
/*
 * Common handler for "disable operation" SRVID messages: validate the
 * ctdb_disable_message payload, disable the given operation for the
 * requested timeout, and reply with our PNN on success.
 * NOTE(review): the size-mismatch log prints
 * sizeof(struct ctdb_srvid_message) although the check compares
 * against sizeof(struct ctdb_disable_message) - the logged "expected"
 * size may be wrong.
 */
1501 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1503 struct ctdb_op_state *op_state)
1505 struct ctdb_disable_message *r;
1510 /* Validate input data */
1511 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1512 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1513 "expecting %lu\n", (long unsigned)data.dsize,
1514 (long unsigned)sizeof(struct ctdb_srvid_message)));
1517 if (data.dptr == NULL) {
1518 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1522 r = (struct ctdb_disable_message *)data.dptr;
1523 timeout = r->timeout;
1525 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1530 /* Returning our PNN tells the caller that we succeeded */
1531 ret = ctdb_get_pnn(ctdb);
1533 result.dsize = sizeof(int32_t);
1534 result.dptr = (uint8_t *)&ret;
1535 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
/* SRVID handler: disable IP takeover runs for the requested timeout,
 * replying via the shared srvid_disable_and_reply() helper. */
1538 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1541 struct ctdb_recoverd *rec = talloc_get_type(
1542 private_data, struct ctdb_recoverd);
1544 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
/* Legacy SRVID handler: older clients send a bare uint32_t timeout
 * instead of a ctdb_disable_message, and expect no reply — so this
 * calls ctdb_op_disable() directly rather than going through
 * srvid_disable_and_reply(). */
1547 /* Backward compatibility for this SRVID */
1548 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1551 struct ctdb_recoverd *rec = talloc_get_type(
1552 private_data, struct ctdb_recoverd);
/* Payload must be exactly one uint32_t (the timeout in seconds —
 * presumably; units not visible here, TODO confirm). */
1555 if (data.dsize != sizeof(uint32_t)) {
1556 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1557 "expecting %lu\n", (long unsigned)data.dsize,
1558 (long unsigned)sizeof(uint32_t)));
1561 if (data.dptr == NULL) {
1562 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1566 timeout = *((uint32_t *)data.dptr);
1568 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
/* SRVID handler: disable database recoveries for the requested timeout,
 * replying via the shared srvid_disable_and_reply() helper. */
1571 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1574 struct ctdb_recoverd *rec = talloc_get_type(
1575 private_data, struct ctdb_recoverd);
1577 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
/* SRVID handler: queue an IP-reallocation request.  The request is only
 * recorded here (srvid_request_add); the actual takeover run happens
 * later from the monitoring loop, avoiding re-entrant takeover_run()
 * calls. */
1581 handler for ip reallocate, just add it to the list of requests and
1582 handle this later in the monitor_cluster loop so we do not recurse
1583 with other requests to takeover_run()
1585 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1588 struct ctdb_srvid_message *request;
1589 struct ctdb_recoverd *rec = talloc_get_type(
1590 private_data, struct ctdb_recoverd);
/* The payload is the sender's reply address (pnn + srvid). */
1592 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1593 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1597 request = (struct ctdb_srvid_message *)data.dptr;
1599 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
/* Perform one takeover run on behalf of all currently queued
 * reallocation requests, then reply to every waiter with our PNN on
 * success (the failure value is set on a line elided from this
 * listing).  Requests arriving during the run are left queued for the
 * next pass. */
1602 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1603 struct ctdb_recoverd *rec)
1607 struct srvid_requests *current;
1609 /* Only process requests that are currently pending. More
1610 * might come in while the takeover run is in progress and
1611 * they will need to be processed later since they might
1612 * be in response flag changes.
/* Detach the pending list so late arrivals queue separately. */
1614 current = rec->reallocate_requests;
1615 rec->reallocate_requests = NULL;
1617 if (do_takeover_run(rec, rec->nodemap)) {
1618 ret = ctdb_get_pnn(ctdb);
/* Reply payload: an int32_t — our PNN signals success to waiters. */
1623 result.dsize = sizeof(int32_t);
1624 result.dptr = (uint8_t *)&ret;
1626 srvid_requests_reply(ctdb, &current, result);
/* SRVID handler: assign banning credits to a node.  Only the recovery
 * master acts on this; it bumps the culprit count for the given PNN by
 * the number of nodes in the cluster. */
1630 * handler for assigning banning credits
1632 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1634 struct ctdb_recoverd *rec = talloc_get_type(
1635 private_data, struct ctdb_recoverd);
1638 /* Ignore if we are not recmaster */
1639 if (rec->ctdb->pnn != rec->recmaster) {
/* Payload is a single uint32_t: the PNN to penalise. */
1643 if (data.dsize != sizeof(uint32_t)) {
1644 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1649 ban_pnn = *(uint32_t *)data.dptr;
1651 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
/* SRVID handler: an election packet arrived from another node.
 * Resets the election timeout, then either contests the election (if
 * ctdb_election_win() says our credentials beat the sender's) or
 * concedes: drop our pending election send, release the recovery lock,
 * tell the local daemon the sender is recmaster, and record it in
 * rec->recmaster.  (Listing elides some lines, e.g. the conditional
 * selecting between the two timeval_current_ofs() expressions.) */
1655 handler for recovery master elections
1657 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1659 struct ctdb_recoverd *rec = talloc_get_type(
1660 private_data, struct ctdb_recoverd);
1661 struct ctdb_context *ctdb = rec->ctdb;
1663 struct election_message *em = (struct election_message *)data.dptr;
1665 /* Ignore election packets from ourself */
1666 if (ctdb->pnn == em->pnn) {
1670 /* we got an election packet - update the timeout for the election */
1671 talloc_free(rec->election_timeout);
1672 rec->election_timeout = tevent_add_timer(
1675 timeval_current_ofs(0, 500000) :
1676 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1677 ctdb_election_timeout, rec);
1679 /* someone called an election. check their election data
1680 and if we disagree and we would rather be the elected node,
1681 send a new election message to all other nodes
/* We win: schedule our own election request (once — the pending-timer
 * check avoids stacking duplicates). */
1683 if (ctdb_election_win(rec, em)) {
1684 if (!rec->send_election_te) {
1685 rec->send_election_te = tevent_add_timer(
1687 timeval_current_ofs(0, 500000),
1688 election_send_request, rec);
/* We lose: stop contesting and accept the sender as recmaster. */
1694 TALLOC_FREE(rec->send_election_te);
1696 /* Release the recovery lock file */
1697 if (ctdb_recovery_have_lock(rec)) {
1698 ctdb_recovery_unlock(rec);
1701 /* ok, let that guy become recmaster then */
1702 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1703 CTDB_CURRENT_NODE, em->pnn);
1705 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1708 rec->recmaster = em->pnn;
/* Start a recovery-master election: put the cluster into recovery mode
 * (stopping internode traffic), arm the election timeout, broadcast our
 * election request, then block until the election window closes
 * (ctdb_wait_election). */
1715 force the start of the election process
1717 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1718 struct ctdb_node_map_old *nodemap)
1721 struct ctdb_context *ctdb = rec->ctdb;
1723 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1725 /* set all nodes to recovery mode to stop all internode traffic */
1726 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1728 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* Re-arm the election timeout (condition choosing between the two
 * intervals is on a line elided from this listing). */
1732 talloc_free(rec->election_timeout);
1733 rec->election_timeout = tevent_add_timer(
1736 timeval_current_ofs(0, 500000) :
1737 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1738 ctdb_election_timeout, rec);
1740 ret = send_election_request(rec, pnn);
1742 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1746 /* wait for a few seconds to collect all responses */
1747 ctdb_wait_election(rec);
/* SRVID handler: a node's flags changed.  Validates the
 * ctdb_node_flag_change payload, fetches the local nodemap, locates the
 * affected node, logs the transition, and records the new flags in the
 * fetched map.  (The log message typo "non-existant" is runtime text
 * and is preserved as-is.) */
1753 handler for when a node changes its flags
1755 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1757 struct ctdb_recoverd *rec = talloc_get_type(
1758 private_data, struct ctdb_recoverd);
1759 struct ctdb_context *ctdb = rec->ctdb;
1761 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1762 struct ctdb_node_map_old *nodemap=NULL;
1763 TALLOC_CTX *tmp_ctx;
1766 if (data.dsize != sizeof(*c)) {
1767 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1771 tmp_ctx = talloc_new(ctdb);
1772 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1774 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1776 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1777 talloc_free(tmp_ctx);
/* Find the slot for the node whose flags changed. */
1782 for (i=0;i<nodemap->num;i++) {
1783 if (nodemap->nodes[i].pnn == c->pnn) break;
1786 if (i == nodemap->num) {
1787 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1788 talloc_free(tmp_ctx);
1792 if (c->old_flags != c->new_flags) {
1793 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1796 nodemap->nodes[i].flags = c->new_flags;
1798 talloc_free(tmp_ctx);
/* SRVID handler: propagate a node's flags to the whole cluster.
 * Fetches the authoritative nodemap from the recmaster, bounds-checks
 * the target PNN, then broadcasts CTDB_CONTROL_MODIFY_FLAGS to all
 * connected nodes. */
1802 handler for when we need to push out flag changes to all other nodes
1804 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
1807 struct ctdb_recoverd *rec = talloc_get_type(
1808 private_data, struct ctdb_recoverd);
1809 struct ctdb_context *ctdb = rec->ctdb;
1811 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1812 struct ctdb_node_map_old *nodemap=NULL;
1813 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1816 /* read the node flags from the recmaster */
1817 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
1820 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
1821 talloc_free(tmp_ctx);
/* NOTE(review): c->pnn is used as an index bound check here — this
 * presumably assumes PNNs are dense array indices in the recmaster's
 * map; confirm against the full source. */
1824 if (c->pnn >= nodemap->num) {
1825 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
1826 talloc_free(tmp_ctx);
1830 /* send the flags update to all connected nodes */
1831 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1833 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
1834 nodes, 0, CONTROL_TIMEOUT(),
1838 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
1840 talloc_free(tmp_ctx);
1844 talloc_free(tmp_ctx);
/* Shared state for the async recmode poll in verify_recmode(); the
 * callback aggregates per-node results into 'status'.  (A 'count' field
 * used by the wait loop is on lines elided from this listing.) */
1848 struct verify_recmode_normal_data {
1850 enum monitor_result status;
/* Async completion callback for one node's GET_RECMODE control.
 * Escalates the aggregate status: control failure -> MONITOR_FAILED
 * (unless already worse), node not in NORMAL mode ->
 * MONITOR_RECOVERY_NEEDED. */
1853 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1855 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
1858 /* one more node has responded with recmode data*/
1861 /* if we failed to get the recmode, then return an error and let
1862 the main loop try again.
1864 if (state->state != CTDB_CONTROL_DONE) {
1865 if (rmdata->status == MONITOR_OK) {
1866 rmdata->status = MONITOR_FAILED;
1871 /* if we got a response, then the recmode will be stored in the
1874 if (state->status != CTDB_RECOVERY_NORMAL) {
1875 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
1876 rmdata->status = MONITOR_RECOVERY_NEEDED;
/* Poll every active node's recovery mode asynchronously and block until
 * all replies arrive.  Returns MONITOR_OK when all are NORMAL,
 * MONITOR_RECOVERY_NEEDED / MONITOR_FAILED otherwise (aggregated by
 * verify_recmode_normal_callback). */
1883 /* verify that all nodes are in normal recovery mode */
1884 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
1886 struct verify_recmode_normal_data *rmdata;
1887 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1888 struct ctdb_client_control_state *state;
1889 enum monitor_result status;
1892 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1893 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1895 rmdata->status = MONITOR_OK;
1897 /* loop over all active nodes and send an async getrecmode call to
/* Inactive (banned/stopped/disconnected) nodes are skipped. */
1899 for (j=0; j<nodemap->num; j++) {
1900 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1903 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1905 nodemap->nodes[j].pnn);
1906 if (state == NULL) {
1907 /* we failed to send the control, treat this as
1908 an error and try again next iteration
1910 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1911 talloc_free(mem_ctx);
1912 return MONITOR_FAILED;
1915 /* set up the callback functions */
1916 state->async.fn = verify_recmode_normal_callback;
1917 state->async.private_data = rmdata;
1919 /* one more control to wait for to complete */
1924 /* now wait for up to the maximum number of seconds allowed
1925 or until all nodes we expect a response from has replied
/* Pump the event loop until every outstanding control completes;
 * rmdata->count is decremented by the callback (on elided lines). */
1927 while (rmdata->count > 0) {
1928 tevent_loop_once(ctdb->ev);
/* Capture status before freeing mem_ctx, which owns rmdata. */
1931 status = rmdata->status;
1932 talloc_free(mem_ctx);
/* Shared state for the async recmaster poll in verify_recmaster();
 * holds the recoverd context (for culprit tracking) and the aggregated
 * result.  ('count' and 'pnn' fields are on lines elided from this
 * listing.) */
1937 struct verify_recmaster_data {
1938 struct ctdb_recoverd *rec;
1941 enum monitor_result status;
/* Async completion callback for one node's GET_RECMASTER control.
 * Control failure -> MONITOR_FAILED (unless already worse); a node that
 * disagrees about who the recmaster is gets a culprit mark and forces
 * MONITOR_ELECTION_NEEDED. */
1944 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1946 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
1949 /* one more node has responded with recmaster data*/
1952 /* if we failed to get the recmaster, then return an error and let
1953 the main loop try again.
1955 if (state->state != CTDB_CONTROL_DONE) {
1956 if (rmdata->status == MONITOR_OK) {
1957 rmdata->status = MONITOR_FAILED;
1962 /* if we got a response, then the recmaster will be stored in the
/* state->status carries the remote node's view of the recmaster PNN. */
1965 if ((uint32_t)state->status != rmdata->pnn) {
1966 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
1967 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
1968 rmdata->status = MONITOR_ELECTION_NEEDED;
/* Ask every other active node who it thinks the recmaster is and block
 * until all replies arrive.  Returns MONITOR_OK when everyone agrees
 * with us, MONITOR_ELECTION_NEEDED / MONITOR_FAILED otherwise
 * (aggregated by verify_recmaster_callback).  Mirrors verify_recmode()
 * in structure. */
1975 /* verify that all nodes agree that we are the recmaster */
1976 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
1978 struct ctdb_context *ctdb = rec->ctdb;
1979 struct verify_recmaster_data *rmdata;
1980 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1981 struct ctdb_client_control_state *state;
1982 enum monitor_result status;
1985 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1986 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1990 rmdata->status = MONITOR_OK;
1992 /* loop over all active nodes and send an async getrecmaster call to
/* Skip the recmaster itself (us) and inactive nodes. */
1994 for (j=0; j<nodemap->num; j++) {
1995 if (nodemap->nodes[j].pnn == rec->recmaster) {
1998 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2001 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2003 nodemap->nodes[j].pnn);
2004 if (state == NULL) {
2005 /* we failed to send the control, treat this as
2006 an error and try again next iteration
2008 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2009 talloc_free(mem_ctx);
2010 return MONITOR_FAILED;
2013 /* set up the callback functions */
2014 state->async.fn = verify_recmaster_callback;
2015 state->async.private_data = rmdata;
2017 /* one more control to wait for to complete */
2022 /* now wait for up to the maximum number of seconds allowed
2023 or until all nodes we expect a response from has replied
/* Pump the event loop until every outstanding control completes;
 * rmdata->count is decremented by the callback (on elided lines). */
2025 while (rmdata->count > 0) {
2026 tevent_loop_once(ctdb->ev);
/* Capture status before freeing mem_ctx, which owns rmdata. */
2029 status = rmdata->status;
2030 talloc_free(mem_ctx);
/* Compare the local node's current interface list against the cached
 * copy in rec->ifaces.  A change in count, name-per-slot, or link state
 * counts as "changed"; a fetch failure is conservatively treated as
 * changed too.  The fresh list replaces the cached one before
 * returning.  (The boolean result variable and several early returns
 * are on lines elided from this listing.) */
2034 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2035 struct ctdb_recoverd *rec)
2037 struct ctdb_iface_list_old *ifaces = NULL;
2038 TALLOC_CTX *mem_ctx;
2041 mem_ctx = talloc_new(NULL);
2043 /* Read the interfaces from the local node */
2044 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2045 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2046 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2047 /* We could return an error. However, this will be
2048 * rare so we'll decide that the interfaces have
2049 * actually changed, just in case.
2051 talloc_free(mem_ctx);
2056 /* We haven't been here before so things have changed */
2057 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2059 } else if (rec->ifaces->num != ifaces->num) {
2060 /* Number of interfaces has changed */
2061 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2062 rec->ifaces->num, ifaces->num));
2065 /* See if interface names or link states have changed */
2067 for (i = 0; i < rec->ifaces->num; i++) {
2068 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2069 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2071 ("Interface in slot %d changed: %s => %s\n",
2072 i, iface->name, ifaces->ifaces[i].name));
2076 if (iface->link_state != ifaces->ifaces[i].link_state) {
2078 ("Interface %s changed state: %d => %d\n",
2079 iface->name, iface->link_state,
2080 ifaces->ifaces[i].link_state));
/* Cache the freshly fetched list on rec for the next comparison. */
2087 talloc_free(rec->ifaces);
2088 rec->ifaces = talloc_steal(rec, ifaces);
2090 talloc_free(mem_ctx);
/* Check that the local allocation of public IP addresses is correct
 * and do some house-keeping.  Non-recmaster nodes only do cleanup of
 * stale request state.  If any inconsistency is found (interface
 * change, unhosted-but-hostable IPs, IPs missing from or wrongly
 * present on interfaces), a takeover run is requested by broadcasting
 * CTDB_SRVID_TAKEOVER_RUN to all connected nodes.  (Several early
 * returns and the rd message setup are on lines elided from this
 * listing.) */
2094 /* Check that the local allocation of public IP addresses is correct
2095 * and do some house-keeping */
2096 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2097 struct ctdb_recoverd *rec,
2099 struct ctdb_node_map_old *nodemap)
2101 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2104 bool need_takeover_run = false;
2105 struct ctdb_public_ip_list_old *ips = NULL;
2107 /* If we are not the recmaster then do some housekeeping */
2108 if (rec->recmaster != pnn) {
2109 /* Ignore any IP reallocate requests - only recmaster
2112 TALLOC_FREE(rec->reallocate_requests);
2113 /* Clear any nodes that should be force rebalanced in
2114 * the next takeover run. If the recovery master role
2115 * has moved then we don't want to process these some
2116 * time in the future.
2118 TALLOC_FREE(rec->force_rebalance_nodes);
2121 /* Return early if disabled... */
2122 if (ctdb_config.failover_disabled ||
2123 ctdb_op_is_disabled(rec->takeover_run)) {
2124 talloc_free(mem_ctx);
2128 if (interfaces_have_changed(ctdb, rec)) {
2129 need_takeover_run = true;
2132 /* If there are unhosted IPs but this node can host them then
2133 * trigger an IP reallocation */
2135 /* Read *available* IPs from local node */
2136 ret = ctdb_ctrl_get_public_ips_flags(
2137 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2138 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2140 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2141 talloc_free(mem_ctx);
/* flags == 0 means this node is fully healthy and can host the IP. */
2145 for (j=0; j<ips->num; j++) {
2146 if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN &&
2147 nodemap->nodes[pnn].flags == 0) {
2148 DEBUG(DEBUG_WARNING,
2149 ("Unassigned IP %s can be served by this node\n",
2150 ctdb_addr_to_str(&ips->ips[j].addr)));
2151 need_takeover_run = true;
/* Skip the interface-level verification when public IP checking is
 * switched off. */
2157 if (!ctdb->do_checkpublicip) {
2161 /* Validate the IP addresses that this node has on network
2162 * interfaces. If there is an inconsistency between reality
2163 * and the state expected by CTDB then try to fix it by
2164 * triggering an IP reallocation or releasing extraneous IP
2167 /* Read *known* IPs from local node */
2168 ret = ctdb_ctrl_get_public_ips_flags(
2169 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2171 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2172 talloc_free(mem_ctx);
/* IPs assigned to us must be present on an interface; IPs assigned
 * elsewhere must not be (the else-branch context for the second check
 * is on elided lines). */
2176 for (j=0; j<ips->num; j++) {
2177 if (ips->ips[j].pnn == pnn) {
2178 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2180 ("Assigned IP %s not on an interface\n",
2181 ctdb_addr_to_str(&ips->ips[j].addr)));
2182 need_takeover_run = true;
2185 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2187 ("IP %s incorrectly on an interface\n",
2188 ctdb_addr_to_str(&ips->ips[j].addr)));
2189 need_takeover_run = true;
2195 if (need_takeover_run) {
2196 struct ctdb_srvid_message rd;
2199 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2204 data.dptr = (uint8_t *)&rd;
2205 data.dsize = sizeof(rd);
2207 ret = ctdb_client_send_message(ctdb,
2208 CTDB_BROADCAST_CONNECTED,
2209 CTDB_SRVID_TAKEOVER_RUN,
2212 D_ERR("Failed to send takeover run request\n");
2215 talloc_free(mem_ctx);
/* Callback state for the async GET_NODEMAP fan-out in
 * get_remote_nodemaps(): the per-node result array plus the recoverd
 * context for culprit tracking on errors. */
2220 struct remote_nodemaps_state {
2221 struct ctdb_node_map_old **remote_nodemaps;
2222 struct ctdb_recoverd *rec;
/* Success callback for one node's GET_NODEMAP control: locate the
 * responding node's slot in our local nodemap and steal the returned
 * map into the result array at that index. */
2225 static void async_getnodemap_callback(struct ctdb_context *ctdb,
2229 void *callback_data)
2231 struct remote_nodemaps_state *state =
2232 (struct remote_nodemaps_state *)callback_data;
2233 struct ctdb_node_map_old **remote_nodemaps = state->remote_nodemaps;
2234 struct ctdb_node_map_old *nodemap = state->rec->nodemap;
2237 for (i = 0; i < nodemap->num; i++) {
2238 if (nodemap->nodes[i].pnn == node_pnn) {
/* Responder not found in our nodemap — drop the result. */
2243 if (i >= nodemap->num) {
2244 DBG_ERR("Invalid PNN %"PRIu32"\n", node_pnn);
/* Reparent the reply buffer onto the result array so it outlives the
 * control state. */
2248 remote_nodemaps[i] = (struct ctdb_node_map_old *)talloc_steal(
2249 remote_nodemaps, outdata.dptr);
/* Error callback for one node's GET_NODEMAP control: log it and charge
 * the failing node a culprit mark. */
2253 static void async_getnodemap_error(struct ctdb_context *ctdb,
2257 void *callback_data)
2259 struct remote_nodemaps_state *state =
2260 (struct remote_nodemaps_state *)callback_data;
2261 struct ctdb_recoverd *rec = state->rec;
2263 DBG_ERR("Failed to retrieve nodemap from node %u\n", node_pnn);
2264 ctdb_set_culprit(rec, node_pnn);
/* Fetch the nodemap from every connected node in parallel via
 * CTDB_CONTROL_GET_NODEMAP.  On success *remote_nodemaps receives an
 * array (parented on mem_ctx) indexed like rec->nodemap, filled in by
 * async_getnodemap_callback.  Returns 0 on success, nonzero on failure
 * (error paths are on lines elided from this listing). */
2267 static int get_remote_nodemaps(struct ctdb_recoverd *rec,
2268 TALLOC_CTX *mem_ctx,
2269 struct ctdb_node_map_old ***remote_nodemaps)
2271 struct ctdb_context *ctdb = rec->ctdb;
2272 struct ctdb_node_map_old **t;
2274 struct remote_nodemaps_state state;
/* One result slot per node in the local nodemap. */
2277 t = talloc_zero_array(mem_ctx,
2278 struct ctdb_node_map_old *,
2281 DBG_ERR("Memory allocation error\n");
2285 nodes = list_of_connected_nodes(ctdb, rec->nodemap, mem_ctx, false);
2287 state.remote_nodemaps = t;
2290 ret = ctdb_client_async_control(ctdb,
2291 CTDB_CONTROL_GET_NODEMAP,
2297 async_getnodemap_callback,
2298 async_getnodemap_error,
2307 *remote_nodemaps = t;
/* Sanity-check the current recovery master and force an election when
 * it is invalid: unknown (daemon just started), lacking
 * CTDB_CAP_RECMASTER while we have it, deleted from the nodemap,
 * disconnected, or inactive while we are active.  Returns bool —
 * presumably false when an election was forced or the recmaster's
 * nodemap could not be fetched, so the caller restarts its loop; the
 * return statements are on lines elided from this listing. */
2311 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2312 TALLOC_CTX *mem_ctx)
2314 struct ctdb_context *ctdb = rec->ctdb;
2315 uint32_t pnn = ctdb_get_pnn(ctdb);
2316 struct ctdb_node_map_old *nodemap = rec->nodemap;
2317 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2320 /* When recovery daemon is started, recmaster is set to
2321 * "unknown" so it knows to start an election.
2323 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2325 ("Initial recovery master set - forcing election\n"));
2326 force_election(rec, pnn, nodemap);
2331 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2332 * but we have, then force an election and try to become the new
2335 if (!ctdb_node_has_capabilities(rec->caps,
2337 CTDB_CAP_RECMASTER) &&
2338 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2339 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2341 (" Current recmaster node %u does not have CAP_RECMASTER,"
2342 " but we (node %u) have - force an election\n",
2343 rec->recmaster, pnn));
2344 force_election(rec, pnn, nodemap);
2348 /* Verify that the master node has not been deleted. This
2349 * should not happen because a node should always be shutdown
2350 * before being deleted, causing a new master to be elected
2351 * before now. However, if something strange has happened
2352 * then checking here will ensure we don't index beyond the
2353 * end of the nodemap array. */
2354 if (rec->recmaster >= nodemap->num) {
2356 ("Recmaster node %u has been deleted. Force election\n",
2358 force_election(rec, pnn, nodemap);
2362 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2363 if (nodemap->nodes[rec->recmaster].flags &
2364 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2366 ("Recmaster node %u is disconnected/deleted. Force election\n",
2368 force_election(rec, pnn, nodemap);
2372 /* get nodemap from the recovery master to check if it is inactive */
2373 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2374 mem_ctx, &recmaster_nodemap);
2378 " Unable to get nodemap from recovery master %u\n",
2380 /* No election, just error */
/* Recmaster says it is inactive while we are active: adopt its view
 * of its own flags and elect a replacement. */
2385 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2386 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2388 ("Recmaster node %u is inactive. Force election\n",
2391 * update our nodemap to carry the recmaster's notion of
2392 * its own flags, so that we don't keep freezing the
2393 * inactive recmaster node...
2395 nodemap->nodes[rec->recmaster].flags =
2396 recmaster_nodemap->nodes[rec->recmaster].flags;
2397 force_election(rec, pnn, nodemap);
2404 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2405 TALLOC_CTX *mem_ctx)
2408 struct ctdb_node_map_old *nodemap=NULL;
2409 struct ctdb_node_map_old **remote_nodemaps=NULL;
2410 struct ctdb_vnn_map *vnnmap=NULL;
2411 struct ctdb_vnn_map *remote_vnnmap=NULL;
2412 uint32_t num_lmasters;
2413 int32_t debug_level;
2419 /* verify that the main daemon is still running */
2420 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2421 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2425 /* ping the local daemon to tell it we are alive */
2426 ctdb_ctrl_recd_ping(ctdb);
2428 if (rec->election_timeout) {
2429 /* an election is in progress */
2433 /* read the debug level from the parent and update locally */
2434 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2436 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2439 debuglevel_set(debug_level);
2441 /* get relevant tunables */
2442 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2444 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2449 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2450 CTDB_CURRENT_NODE, &ctdb->runstate);
2452 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2456 pnn = ctdb_get_pnn(ctdb);
2459 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &nodemap);
2461 DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", pnn);
2464 talloc_free(rec->nodemap);
2465 rec->nodemap = nodemap;
2467 /* remember our own node flags */
2468 rec->node_flags = nodemap->nodes[pnn].flags;
2470 ban_misbehaving_nodes(rec, &self_ban);
2472 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2476 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2477 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2479 D_ERR("Failed to read recmode from local node\n");
2483 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2484 also frozen and that the recmode is set to active.
2486 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2487 /* If this node has become inactive then we want to
2488 * reduce the chances of it taking over the recovery
2489 * master role when it becomes active again. This
2490 * helps to stabilise the recovery master role so that
2491 * it stays on the most stable node.
2493 rec->priority_time = timeval_current();
2495 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2496 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2498 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2500 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2505 if (! rec->frozen_on_inactive) {
2506 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2510 (__location__ " Failed to freeze node "
2511 "in STOPPED or BANNED state\n"));
2515 rec->frozen_on_inactive = true;
2518 /* If this node is stopped or banned then it is not the recovery
2519 * master, so don't do anything. This prevents stopped or banned
2520 * node from starting election and sending unnecessary controls.
2525 rec->frozen_on_inactive = false;
2527 /* Retrieve capabilities from all connected nodes */
2528 ret = update_capabilities(rec, nodemap);
2530 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2534 if (! validate_recovery_master(rec, mem_ctx)) {
2538 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2539 /* Check if an IP takeover run is needed and trigger one if
2541 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2544 /* if we are not the recmaster then we do not need to check
2545 if recovery is needed
2547 if (pnn != rec->recmaster) {
2552 /* Get the nodemaps for all connected remote nodes */
2553 ret = get_remote_nodemaps(rec, mem_ctx, &remote_nodemaps);
2555 DBG_ERR("Failed to read remote nodemaps\n");
2559 /* Ensure our local and remote flags are correct */
2560 ret = update_flags(rec, nodemap, remote_nodemaps);
2562 D_ERR("Unable to update flags\n");
2566 if (ctdb->num_nodes != nodemap->num) {
2567 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2568 ctdb_load_nodes_file(ctdb);
2572 /* verify that all active nodes agree that we are the recmaster */
2573 switch (verify_recmaster(rec, nodemap, pnn)) {
2574 case MONITOR_RECOVERY_NEEDED:
2575 /* can not happen */
2577 case MONITOR_ELECTION_NEEDED:
2578 force_election(rec, pnn, nodemap);
2582 case MONITOR_FAILED:
2587 /* get the vnnmap */
2588 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2590 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2594 if (rec->need_recovery) {
2595 /* a previous recovery didn't finish */
2596 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2600 /* verify that all active nodes are in normal mode
2601 and not in recovery mode
2603 switch (verify_recmode(ctdb, nodemap)) {
2604 case MONITOR_RECOVERY_NEEDED:
2605 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2607 case MONITOR_FAILED:
2609 case MONITOR_ELECTION_NEEDED:
2610 /* can not happen */
2616 if (ctdb->recovery_lock != NULL) {
2617 /* We must already hold the recovery lock */
2618 if (!ctdb_recovery_have_lock(rec)) {
2619 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2620 ctdb_set_culprit(rec, ctdb->pnn);
2621 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2627 /* If recoveries are disabled then there is no use doing any
2628 * nodemap or flags checks. Recoveries might be disabled due
2629 * to "reloadnodes", so doing these checks might cause an
2630 * unnecessary recovery. */
2631 if (ctdb_op_is_disabled(rec->recovery)) {
2632 goto takeover_run_checks;
2635 /* verify that all other nodes have the same nodemap as we have
2637 for (j=0; j<nodemap->num; j++) {
2638 if (nodemap->nodes[j].pnn == ctdb->pnn) {
2641 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2645 /* if the nodes disagree on how many nodes there are
2646 then this is a good reason to try recovery
2648 if (remote_nodemaps[j]->num != nodemap->num) {
2649 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2650 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2651 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2652 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2656 /* if the nodes disagree on which nodes exist and are
2657 active, then that is also a good reason to do recovery
2659 for (i=0;i<nodemap->num;i++) {
2660 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2661 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2662 nodemap->nodes[j].pnn, i,
2663 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2664 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2665 do_recovery(rec, mem_ctx, pnn, nodemap,
2673 * Update node flags obtained from each active node. This ensure we have
2674 * up-to-date information for all the nodes.
2676 for (j=0; j<nodemap->num; j++) {
2677 if (nodemap->nodes[j].pnn == ctdb->pnn) {
2680 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2683 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2686 for (j=0; j<nodemap->num; j++) {
2687 if (nodemap->nodes[j].pnn == ctdb->pnn) {
2690 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2694 /* verify the flags are consistent
2696 for (i=0; i<nodemap->num; i++) {
2697 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2701 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2702 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2703 nodemap->nodes[j].pnn,
2704 nodemap->nodes[i].pnn,
2705 remote_nodemaps[j]->nodes[i].flags,
2706 nodemap->nodes[i].flags));
2708 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2709 update_flags_on_all_nodes(
2711 nodemap->nodes[i].pnn,
2712 remote_nodemaps[j]->nodes[i].flags);
2713 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2714 do_recovery(rec, mem_ctx, pnn, nodemap,
2718 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2719 update_flags_on_all_nodes(
2721 nodemap->nodes[i].pnn,
2722 nodemap->nodes[i].flags);
2723 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2724 do_recovery(rec, mem_ctx, pnn, nodemap,
2733 /* count how many active nodes there are */
2735 for (i=0; i<nodemap->num; i++) {
2736 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2737 if (ctdb_node_has_capabilities(rec->caps,
2738 ctdb->nodes[i]->pnn,
2739 CTDB_CAP_LMASTER)) {
2746 /* There must be the same number of lmasters in the vnn map as
2747 * there are active nodes with the lmaster capability... or
2750 if (vnnmap->size != num_lmasters) {
2751 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2752 vnnmap->size, num_lmasters));
2753 ctdb_set_culprit(rec, ctdb->pnn);
2754 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2759 * Verify that all active lmaster nodes in the nodemap also
2760 * exist in the vnnmap
2762 for (j=0; j<nodemap->num; j++) {
2763 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2766 if (! ctdb_node_has_capabilities(rec->caps,
2767 nodemap->nodes[j].pnn,
2768 CTDB_CAP_LMASTER)) {
2771 if (nodemap->nodes[j].pnn == pnn) {
2775 for (i=0; i<vnnmap->size; i++) {
2776 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2780 if (i == vnnmap->size) {
2781 D_ERR("Active LMASTER node %u is not in the vnnmap\n",
2782 nodemap->nodes[j].pnn);
2783 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2784 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2790 /* verify that all other nodes have the same vnnmap
2791 and are from the same generation
2793 for (j=0; j<nodemap->num; j++) {
2794 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2797 if (nodemap->nodes[j].pnn == pnn) {
2801 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2802 mem_ctx, &remote_vnnmap);
2804 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2805 nodemap->nodes[j].pnn));
2809 /* verify the vnnmap generation is the same */
2810 if (vnnmap->generation != remote_vnnmap->generation) {
2811 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2812 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2813 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2814 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2818 /* verify the vnnmap size is the same */
2819 if (vnnmap->size != remote_vnnmap->size) {
2820 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2821 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2822 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2823 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2827 /* verify the vnnmap is the same */
2828 for (i=0;i<vnnmap->size;i++) {
2829 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2830 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2831 nodemap->nodes[j].pnn));
2832 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2833 do_recovery(rec, mem_ctx, pnn, nodemap,
2840 /* FIXME: Add remote public IP checking to ensure that nodes
2841 * have the IP addresses that are allocated to them. */
2843 takeover_run_checks:
2845 /* If there are IP takeover runs requested or the previous one
2846 * failed then perform one and notify the waiters */
2847 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2848 (rec->reallocate_requests || rec->need_takeover_run)) {
2849 process_ipreallocate_requests(ctdb, rec);
2853 static void recd_sig_term_handler(struct tevent_context *ev,
2854 struct tevent_signal *se, int signum,
2855 int count, void *dont_care,
2858 struct ctdb_recoverd *rec = talloc_get_type_abort(
2859 private_data, struct ctdb_recoverd);
2861 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
2862 ctdb_recovery_unlock(rec);
2867 * Periodically log elements of the cluster state
2869 * This can be used to confirm a split brain has occurred
2871 static void maybe_log_cluster_state(struct tevent_context *ev,
2872 struct tevent_timer *te,
2873 struct timeval current_time,
2876 struct ctdb_recoverd *rec = talloc_get_type_abort(
2877 private_data, struct ctdb_recoverd);
2878 struct ctdb_context *ctdb = rec->ctdb;
2879 struct tevent_timer *tt;
2881 static struct timeval start_incomplete = {
2889 unsigned int minutes;
2890 unsigned int num_connected;
2892 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2896 if (rec->nodemap == NULL) {
2902 for (i = 0; i < rec->nodemap->num; i++) {
2903 struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
2905 if (n->pnn == ctdb_get_pnn(ctdb)) {
2908 if ((n->flags & NODE_FLAGS_DELETED) != 0) {
2911 if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
2912 is_complete = false;
2919 was_complete = timeval_is_zero(&start_incomplete);
2922 if (! was_complete) {
2923 D_WARNING("Cluster complete with master=%u\n",
2925 start_incomplete = timeval_zero();
2930 /* Cluster is newly incomplete... */
2932 start_incomplete = current_time;
2938 * Cluster has been incomplete since previous check, so figure
2939 * out how long (in minutes) and decide whether to log anything
2941 seconds = timeval_elapsed2(&start_incomplete, ¤t_time);
2942 minutes = (unsigned int)seconds / 60;
2943 if (minutes >= 60) {
2944 /* Over an hour, log every hour */
2945 if (minutes % 60 != 0) {
2948 } else if (minutes >= 10) {
2949 /* Over 10 minutes, log every 10 minutes */
2950 if (minutes % 10 != 0) {
2956 D_WARNING("Cluster incomplete with master=%u, elapsed=%u minutes, "
2963 tt = tevent_add_timer(ctdb->ev,
2965 timeval_current_ofs(60, 0),
2966 maybe_log_cluster_state,
2969 DBG_WARNING("Failed to set up cluster state timer\n");
2974 the main monitoring loop
2976 static void monitor_cluster(struct ctdb_context *ctdb)
2978 struct tevent_signal *se;
2979 struct ctdb_recoverd *rec;
2981 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2983 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2984 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2987 rec->recmaster = CTDB_UNKNOWN_PNN;
2988 rec->recovery_lock_handle = NULL;
2990 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
2991 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
2993 rec->recovery = ctdb_op_init(rec, "recoveries");
2994 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
2996 rec->priority_time = timeval_current();
2997 rec->frozen_on_inactive = false;
2999 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3000 recd_sig_term_handler, rec);
3002 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3006 if (ctdb->recovery_lock == NULL) {
3007 struct tevent_timer *tt;
3009 tt = tevent_add_timer(ctdb->ev,
3011 timeval_current_ofs(60, 0),
3012 maybe_log_cluster_state,
3015 DBG_WARNING("Failed to set up cluster state timer\n");
3019 /* register a message port for sending memory dumps */
3020 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3022 /* when a node is assigned banning credits */
3023 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3024 banning_handler, rec);
3026 /* register a message port for recovery elections */
3027 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3029 /* when nodes are disabled/enabled */
3030 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3032 /* when we are asked to puch out a flag change */
3033 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3035 /* register a message port for reloadnodes */
3036 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3038 /* register a message port for performing a takeover run */
3039 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3041 /* register a message port for disabling the ip check for a short while */
3042 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3044 /* register a message port for forcing a rebalance of a node next
3046 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3048 /* Register a message port for disabling takeover runs */
3049 ctdb_client_set_message_handler(ctdb,
3050 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3051 disable_takeover_runs_handler, rec);
3053 /* Register a message port for disabling recoveries */
3054 ctdb_client_set_message_handler(ctdb,
3055 CTDB_SRVID_DISABLE_RECOVERIES,
3056 disable_recoveries_handler, rec);
3059 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3060 struct timeval start;
3064 DEBUG(DEBUG_CRIT,(__location__
3065 " Failed to create temp context\n"));
3069 start = timeval_current();
3070 main_loop(ctdb, rec, mem_ctx);
3071 talloc_free(mem_ctx);
3073 /* we only check for recovery once every second */
3074 elapsed = timeval_elapsed(&start);
3075 if (elapsed < ctdb->tunable.recover_interval) {
3076 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3083 event handler for when the main ctdbd dies
3085 static void ctdb_recoverd_parent(struct tevent_context *ev,
3086 struct tevent_fd *fde,
3087 uint16_t flags, void *private_data)
3089 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3094 called regularly to verify that the recovery daemon is still running
3096 static void ctdb_check_recd(struct tevent_context *ev,
3097 struct tevent_timer *te,
3098 struct timeval yt, void *p)
3100 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3102 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3103 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3105 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3106 ctdb_restart_recd, ctdb);
3111 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3112 timeval_current_ofs(30, 0),
3113 ctdb_check_recd, ctdb);
3116 static void recd_sig_child_handler(struct tevent_context *ev,
3117 struct tevent_signal *se, int signum,
3118 int count, void *dont_care,
3121 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3126 pid = waitpid(-1, &status, WNOHANG);
3128 if (errno != ECHILD) {
3129 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3134 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3140 startup the recovery daemon as a child of the main ctdb daemon
3142 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3145 struct tevent_signal *se;
3146 struct tevent_fd *fde;
3149 if (pipe(fd) != 0) {
3153 ctdb->recoverd_pid = ctdb_fork(ctdb);
3154 if (ctdb->recoverd_pid == -1) {
3158 if (ctdb->recoverd_pid != 0) {
3159 talloc_free(ctdb->recd_ctx);
3160 ctdb->recd_ctx = talloc_new(ctdb);
3161 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3164 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3165 timeval_current_ofs(30, 0),
3166 ctdb_check_recd, ctdb);
3172 srandom(getpid() ^ time(NULL));
3174 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3179 prctl_set_comment("ctdb_recoverd");
3180 if (switch_from_server_to_client(ctdb) != 0) {
3181 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3185 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3187 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3188 ctdb_recoverd_parent, &fd[0]);
3189 tevent_fd_set_auto_close(fde);
3191 /* set up a handler to pick up sigchld */
3192 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3193 recd_sig_child_handler, ctdb);
3195 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3199 monitor_cluster(ctdb);
3201 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3206 shutdown the recovery daemon
3208 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3210 if (ctdb->recoverd_pid == 0) {
3214 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3215 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3217 TALLOC_FREE(ctdb->recd_ctx);
3218 TALLOC_FREE(ctdb->recd_ping_count);
3221 static void ctdb_restart_recd(struct tevent_context *ev,
3222 struct tevent_timer *te,
3223 struct timeval t, void *private_data)
3225 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3227 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3228 ctdb_stop_recoverd(ctdb);
3229 ctdb_start_recoverd(ctdb);