4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
30 #include "dlinklist.h"
/* Most recent "reload all IPs" request; it is deferred and serviced
 * after the current takeover run has completed.  NULL when no reload
 * is pending. */
33 /* most recent reload all ips request we need to perform during the
36 struct reloadips_all_reply *reload_all_ips_request = NULL;
/* One entry per "ctdb ipreallocate" client waiting to be called back
 * when the takeover run finishes; rd identifies the requester. */
38 /* list of "ctdb ipreallocate" processes to call back when we have
39 finished the takeover run.
41 struct ip_reallocate_list {
42 struct ip_reallocate_list *next;
43 struct rd_memdump_reply *rd;
/* Per-node misbehaviour record used for ban-credit accounting
 * (see ctdb_set_culprit_count); last_reported_time is when the node
 * last accumulated culprit credits. */
46 struct ctdb_banning_state {
48 struct timeval last_reported_time;
/* Private state of the recovery daemon.  Holds the cached node map,
 * election timers, pending IP-reallocation callers and per-recovery
 * bookkeeping.  NOTE(review): this extract is missing some members
 * (e.g. the node_flags field referenced by ctdb_set_culprit_count). */
52 private state of recovery daemon
54 struct ctdb_recoverd {
55 struct ctdb_context *ctdb;
58 uint32_t num_connected;
59 uint32_t last_culprit_node;
60 struct ctdb_node_map *nodemap;
61 struct timeval priority_time;
62 bool need_takeover_run;
65 struct timed_event *send_election_te;
66 struct timed_event *election_timeout;
67 struct vacuum_info *vacuum_info;
68 TALLOC_CTX *ip_reallocate_ctx;
69 struct ip_reallocate_list *reallocate_callers;
70 bool takeover_run_in_progress;
71 TALLOC_CTX *ip_check_disable_ctx;
72 struct ctdb_control_get_ifaces *ifaces;
73 TALLOC_CTX *deferred_rebalance_ctx;
/* Control/monitor timeouts derived from the recovery tunables; both
 * expand in a scope where a local "ctdb" pointer is visible. */
76 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
77 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration: timed-event handler that restarts the recovery
 * daemon (definition not visible in this extract). */
79 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
/* Ban node `pnn` for `ban_time` seconds via the SET_BAN control.
 * An invalid pnn is logged and rejected before any control is sent.
 * NOTE(review): extract is incomplete — early-return lines and the
 * bantime.pnn assignment are not visible here. */
82 ban a node for a period of time
84 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
87 struct ctdb_context *ctdb = rec->ctdb;
88 struct ctdb_ban_time bantime;
90 if (!ctdb_validate_pnn(ctdb, pnn)) {
91 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
95 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
98 bantime.time = ban_time;
100 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
102 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Outcome of one monitoring pass over the cluster. */
108 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/* Charge `count` culprit credits to node `culprit`.  Credits decay:
 * if the node has not misbehaved within recovery_grace_period seconds
 * the counter is reset before the new credits are added.  A node that
 * is itself inactive (banned/stopped) never blames others.  Also
 * records the culprit as rec->last_culprit_node.  The per-node
 * ban_state is lazily allocated on the node structure. */
112 remember the trouble maker
114 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
116 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
117 struct ctdb_banning_state *ban_state;
119 if (culprit > ctdb->num_nodes) {
120 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
124 /* If we are banned or stopped, do not set other nodes as culprits */
125 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
126 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
130 if (ctdb->nodes[culprit]->ban_state == NULL) {
131 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
132 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
136 ban_state = ctdb->nodes[culprit]->ban_state;
137 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
138 /* this was the first time in a long while this node
139 misbehaved so we will forgive any old transgressions.
141 ban_state->count = 0;
144 ban_state->count += count;
145 ban_state->last_reported_time = timeval_current();
146 rec->last_culprit_node = culprit;
/* Convenience wrapper: charge a single culprit credit to `culprit`. */
150 remember the trouble maker
152 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
154 ctdb_set_culprit_count(rec, culprit, 1);
/* Async-control fail callback: a node failed the "recovered" event,
 * so mark it as a recovery-failure culprit (one credit). */
158 /* this callback is called for every node that failed to execute the
161 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
163 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
165 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
167 ctdb_set_culprit(rec, node_pnn);
/* Broadcast the END_RECOVERY control to all active nodes, which runs
 * the "recovered" eventscript on each.  `caller` is only used in the
 * error log message.  Failing nodes are charged culprit credits via
 * recovered_fail_callback.  Returns 0 on success (return statements
 * are among the lines missing from this extract). */
173 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
177 struct ctdb_context *ctdb = rec->ctdb;
179 tmp_ctx = talloc_new(ctdb);
180 CTDB_NO_MEMORY(ctdb, tmp_ctx);
182 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
183 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
185 CONTROL_TIMEOUT(), false, tdb_null,
186 NULL, recovered_fail_callback,
188 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
190 talloc_free(tmp_ctx);
194 talloc_free(tmp_ctx);
/* Async-control fail callback: a node failed the "startrecovery"
 * event, so mark it as a recovery-failure culprit (one credit). */
198 /* this callback is called for every node that failed to execute the
201 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
203 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
205 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
207 ctdb_set_culprit(rec, node_pnn);
/* Broadcast the START_RECOVERY control to all active nodes, which
 * runs the "startrecovery" eventscript on each.  Nodes that fail are
 * charged a culprit credit via startrecovery_fail_callback. */
211 run the "startrecovery" eventscript on all nodes
213 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
217 struct ctdb_context *ctdb = rec->ctdb;
219 tmp_ctx = talloc_new(ctdb);
220 CTDB_NO_MEMORY(ctdb, tmp_ctx);
222 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
223 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
225 CONTROL_TIMEOUT(), false, tdb_null,
227 startrecovery_fail_callback,
229 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
230 talloc_free(tmp_ctx);
234 talloc_free(tmp_ctx);
/* Per-node success callback for GET_CAPABILITIES: validate the reply
 * payload (must be exactly one uint32_t), cache the capabilities on
 * the node structure, and mirror them into ctdb->capabilities when
 * the reply is for the local node. */
238 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
240 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
241 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
244 if (node_pnn < ctdb->num_nodes) {
245 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
248 if (node_pnn == ctdb->pnn) {
249 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
/* Query every connected node for its capabilities via an async
 * GET_CAPABILITIES broadcast; results are cached per node by
 * async_getcap_callback. */
254 update the node capabilities for all connected nodes
256 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
261 tmp_ctx = talloc_new(ctdb);
262 CTDB_NO_MEMORY(ctdb, tmp_ctx);
264 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
265 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
269 async_getcap_callback, NULL,
271 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
272 talloc_free(tmp_ctx);
276 talloc_free(tmp_ctx);
/* Fail callback for the FREEZE control during recovery: charge the
 * failing node culprit credits equal to the cluster size so repeated
 * failures quickly lead to banning. */
280 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
282 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
284 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
285 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Fail callback for TRANSACTION_START: same heavy culprit charge as
 * the freeze failure above. */
288 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
290 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
292 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
293 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Set the recovery mode on all active nodes.  When entering recovery
 * (CTDB_RECOVERY_ACTIVE) first freeze every database priority band
 * (1..NUM_DB_PRIORITIES) on all nodes, then broadcast SET_RECMODE
 * with the requested mode.  Freeze failures charge culprit credits
 * via set_recmode_fail_callback. */
297 change recovery mode on all nodes
299 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
305 tmp_ctx = talloc_new(ctdb);
306 CTDB_NO_MEMORY(ctdb, tmp_ctx);
308 /* freeze all nodes */
309 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
310 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
313 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
314 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
319 set_recmode_fail_callback,
321 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
322 talloc_free(tmp_ctx);
/* payload for SET_RECMODE is the 32-bit mode value itself */
329 data.dsize = sizeof(uint32_t);
330 data.dptr = (unsigned char *)&rec_mode;
332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
338 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
339 talloc_free(tmp_ctx);
343 talloc_free(tmp_ctx);
/* Tell every active node that `pnn` is the recovery master by
 * broadcasting SET_RECMASTER with the pnn as a uint32_t payload. */
348 change recovery master on all node
350 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
356 tmp_ctx = talloc_new(ctdb);
357 CTDB_NO_MEMORY(ctdb, tmp_ctx);
359 data.dsize = sizeof(uint32_t);
360 data.dptr = (unsigned char *)&pnn;
362 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
363 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
365 CONTROL_TIMEOUT(), false, data,
368 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
369 talloc_free(tmp_ctx);
373 talloc_free(tmp_ctx);
/* Push the local database priorities out to all active remote nodes.
 * For each local database: read its priority from the local node,
 * then broadcast SET_DB_PRIORITY with a ctdb_db_priority payload.
 * Per the original comment, remote nodes may predate this control,
 * so failures here are logged but never fail a recovery. */
377 /* update all remote nodes to use the same db priority that we have
378 this can fail if the remove node has not yet been upgraded to
379 support this function, so we always return success and never fail
380 a recovery if this call fails.
382 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
383 struct ctdb_node_map *nodemap,
384 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
389 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
391 /* step through all local databases */
392 for (db=0; db<dbmap->num;db++) {
394 struct ctdb_db_priority db_prio;
397 db_prio.db_id = dbmap->dbs[db].dbid;
398 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
400 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
404 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
406 data.dptr = (uint8_t *)&db_prio;
407 data.dsize = sizeof(db_prio);
409 if (ctdb_client_async_control(ctdb,
410 CTDB_CONTROL_SET_DB_PRIORITY,
412 CONTROL_TIMEOUT(), false, data,
415 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
/* Ensure every other (available) node is attached to every database
 * we have locally.  For each remote node: fetch its dbmap, compare
 * against our dbmap, and for any database the remote lacks, look up
 * the name locally and create it remotely (preserving the PERSISTENT
 * flag).  The node being ourselves or inactive is skipped. */
423 ensure all other nodes have attached to any databases that we have
425 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
426 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
429 struct ctdb_dbid_map *remote_dbmap;
431 /* verify that all other nodes have all our databases */
432 for (j=0; j<nodemap->num; j++) {
433 /* we dont need to ourself ourselves */
434 if (nodemap->nodes[j].pnn == pnn) {
437 /* dont check nodes that are unavailable */
438 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
442 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
443 mem_ctx, &remote_dbmap);
445 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
449 /* step through all local databases */
450 for (db=0; db<dbmap->num;db++) {
/* linear scan: does the remote node already have this dbid? */
454 for (i=0;i<remote_dbmap->num;i++) {
455 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
459 /* the remote node already have this database */
460 if (i!=remote_dbmap->num) {
463 /* ok so we need to create this database */
464 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
467 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
470 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
472 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
474 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
/* Mirror of create_missing_remote_databases: ensure we are attached
 * locally to every database any other available node has.  For each
 * missing database we fetch its name from the remote node, create it
 * locally (preserving the PERSISTENT flag), and finally re-read the
 * local dbmap through the dbmap out-parameter. */
485 ensure we are attached to any databases that anyone else is attached to
487 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
488 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
491 struct ctdb_dbid_map *remote_dbmap;
493 /* verify that we have all database any other node has */
494 for (j=0; j<nodemap->num; j++) {
495 /* we dont need to ourself ourselves */
496 if (nodemap->nodes[j].pnn == pnn) {
499 /* dont check nodes that are unavailable */
500 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
504 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
505 mem_ctx, &remote_dbmap);
507 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
511 /* step through all databases on the remote node */
512 for (db=0; db<remote_dbmap->num;db++) {
/* linear scan: do we already have this dbid locally? */
515 for (i=0;i<(*dbmap)->num;i++) {
516 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
520 /* we already have this db locally */
521 if (i!=(*dbmap)->num) {
524 /* ok so we need to create this database and
527 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
528 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
530 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
531 nodemap->nodes[j].pnn));
534 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
535 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
537 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
540 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
542 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
/* Pull the full contents of database `dbid` from node `srcnode` via
 * PULLDB and merge the records into the temporary recovery tdb.
 * Merge rule per record: keep the incoming copy unless an existing
 * recdb record has a higher RSN (or equal RSN with the existing
 * dmaster not being the recovery master).  Record payloads carry a
 * ctdb_ltdb_header prefix which is validated before use. */
553 pull the remote database contents from one node into the recdb
555 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
556 struct tdb_wrap *recdb, uint32_t dbid)
560 struct ctdb_marshall_buffer *reply;
561 struct ctdb_rec_data *rec;
563 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
565 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
566 CONTROL_TIMEOUT(), &outdata);
568 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
569 talloc_free(tmp_ctx);
573 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity: reply must at least cover the marshall buffer header */
575 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
576 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
577 talloc_free(tmp_ctx);
581 rec = (struct ctdb_rec_data *)&reply->data[0];
/* walk the marshalled records; each is rec->length bytes long */
585 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
587 struct ctdb_ltdb_header *hdr;
590 key.dptr = &rec->data[0];
591 key.dsize = rec->keylen;
592 data.dptr = &rec->data[key.dsize];
593 data.dsize = rec->datalen;
595 hdr = (struct ctdb_ltdb_header *)data.dptr;
597 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
598 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
599 talloc_free(tmp_ctx);
603 /* fetch the existing record, if any */
604 existing = tdb_fetch(recdb->tdb, key);
606 if (existing.dptr != NULL) {
607 struct ctdb_ltdb_header header;
608 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
609 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
610 (unsigned)existing.dsize, srcnode));
612 talloc_free(tmp_ctx);
615 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing recdb copy unless the incoming record wins
 * the RSN comparison below */
617 if (!(header.rsn < hdr->rsn ||
618 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
623 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
624 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
625 talloc_free(tmp_ctx);
630 talloc_free(tmp_ctx);
/* Accumulator for the GET_DB_SEQNUM scan: tracks the highest seqnum
 * seen, the node that reported it, and a failure flag.  NOTE(review):
 * the member declarations are among the lines missing here. */
636 struct pull_seqnum_cbdata {
/* Success callback for GET_DB_SEQNUM: validate the 8-byte reply and
 * remember the node if its seqnum beats the best seen so far.  A
 * prior failure short-circuits further accumulation. */
642 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
644 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
647 if (cb_data->failed != 0) {
648 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
653 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
658 if (outdata.dsize != sizeof(uint64_t)) {
659 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
660 cb_data->failed = -1;
664 seqnum = *((uint64_t *)outdata.dptr);
666 if (seqnum > cb_data->seqnum) {
667 cb_data->seqnum = seqnum;
668 cb_data->pnn = node_pnn;
/* Fail callback: any node failing the seqnum pull fails the whole
 * scan (the failed-flag assignment is not visible in this extract). */
672 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
674 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
676 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
/* Recover a persistent database whole-file by sequence number:
 * broadcast GET_DB_SEQNUM to all active nodes, find the node holding
 * the highest seqnum, and pull the entire database from that single
 * node into the recdb.  Fails if the scan errored or no node reported
 * a usable (non-zero) seqnum. */
680 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
681 struct ctdb_recoverd *rec,
682 struct ctdb_node_map *nodemap,
683 struct tdb_wrap *recdb, uint32_t dbid)
685 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
689 struct pull_seqnum_cbdata *cb_data;
691 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
/* control payload: the dbid packed into a small outdata buffer */
696 data.dsize = sizeof(outdata);
697 data.dptr = (uint8_t *)&outdata[0];
699 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
700 if (cb_data == NULL) {
701 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
702 talloc_free(tmp_ctx);
710 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
711 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
713 CONTROL_TIMEOUT(), false, data,
717 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
719 talloc_free(tmp_ctx);
723 if (cb_data->failed != 0) {
724 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
725 talloc_free(tmp_ctx);
729 if (cb_data->seqnum == 0 || cb_data->pnn == -1) {
730 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
731 talloc_free(tmp_ctx);
735 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
737 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
738 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
739 talloc_free(tmp_ctx);
743 talloc_free(tmp_ctx);
/* Pull database `dbid` from the cluster into the recdb.  Persistent
 * databases may use the whole-file highest-seqnum strategy (when the
 * recover_pdb_by_seqnum tunable is set); otherwise records are pulled
 * from every available node and merged record-by-record on RSN by
 * pull_one_remote_database.  A node failing the pull is charged
 * culprit credits equal to the cluster size. */
749 pull all the remote database contents into the recdb
751 static int pull_remote_database(struct ctdb_context *ctdb,
752 struct ctdb_recoverd *rec,
753 struct ctdb_node_map *nodemap,
754 struct tdb_wrap *recdb, uint32_t dbid,
759 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
761 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
767 /* pull all records from all other nodes across onto this node
768 (this merges based on rsn)
770 for (j=0; j<nodemap->num; j++) {
771 /* dont merge from nodes that are unavailable */
772 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
775 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
776 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
777 nodemap->nodes[j].pnn));
778 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
/* Push `flags` for node `pnn` out to the cluster via MODFLAGS
 * (set = flags, clear = ~flags). */
788 update flags on all active nodes
790 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
794 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
796 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Push our vnnmap to every available node via SETVNNMAP so the whole
 * cluster agrees on the virtual-node mapping. */
804 ensure all nodes have the same vnnmap we do
806 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
807 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
811 /* push the new vnn map out to all the nodes */
812 for (j=0; j<nodemap->num; j++) {
813 /* dont push to nodes that are unavailable */
814 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
818 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
820 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* In-flight vacuum-fetch job: doubly-linked into rec->vacuum_info,
 * holding the target database and the marshalled record list being
 * migrated back (r walks through recs).  NOTE(review): the
 * "struct vacuum_info {" opening line is missing from this extract. */
830 struct vacuum_info *next, *prev;
831 struct ctdb_recoverd *rec;
833 struct ctdb_db_context *ctdb_db;
834 struct ctdb_marshall_buffer *recs;
835 struct ctdb_rec_data *r;
838 static void vacuum_fetch_next(struct vacuum_info *v);
/* Completion callback for one vacuum fetch call: simply advance to
 * the next record in the job's list. */
841 called when a vacuum fetch has completed - just free it and do the next one
843 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
845 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
847 vacuum_fetch_next(v);
/* Process the next record in a vacuum-fetch job.  For each record:
 * take a non-blocking chainlock (skip the record if unavailable),
 * skip records that are missing, too short, or already dmastered
 * locally, otherwise issue a NULL_FUNC call with IMMEDIATE_MIGRATION
 * to migrate the record here; the callback resumes this loop when
 * the call completes. */
852 process the next element from the vacuum list
854 static void vacuum_fetch_next(struct vacuum_info *v)
856 struct ctdb_call call;
857 struct ctdb_rec_data *r;
859 while (v->recs->count) {
860 struct ctdb_client_call_state *state;
862 struct ctdb_ltdb_header *hdr;
865 call.call_id = CTDB_NULL_FUNC;
866 call.flags = CTDB_IMMEDIATE_MIGRATION;
867 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
/* advance the cursor past the current record before issuing it */
870 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
873 call.key.dptr = &r->data[0];
874 call.key.dsize = r->keylen;
876 /* ensure we don't block this daemon - just skip a record if we can't get
878 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
882 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
883 if (data.dptr == NULL) {
884 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
888 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
890 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
894 hdr = (struct ctdb_ltdb_header *)data.dptr;
895 if (hdr->dmaster == v->rec->ctdb->pnn) {
896 /* its already local */
898 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
904 state = ctdb_call_send(v->ctdb_db, &call);
905 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
907 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
/* resume this loop asynchronously when the migration completes */
911 state->async.fn = vacuum_fetch_callback;
912 state->async.private_data = v;
/* talloc destructor: unlink the job from rec->vacuum_info when it is
 * freed. */
921 destroy a vacuum info structure
923 static int vacuum_info_destructor(struct vacuum_info *v)
925 DLIST_REMOVE(v->rec->vacuum_info, v);
/* Message handler for vacuum-fetch requests.  The payload is a
 * marshalled record buffer from another node.  Dedupes against jobs
 * already running for the same (srcnode, db), resolves the database
 * (persistent or not) from the local dbmap, attaches to it, copies
 * the record buffer into a new vacuum_info job, links it into
 * rec->vacuum_info and kicks off vacuum_fetch_next. */
931 handler for vacuum fetch
933 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
934 TDB_DATA data, void *private_data)
936 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
937 struct ctdb_marshall_buffer *recs;
939 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
941 struct ctdb_dbid_map *dbmap=NULL;
942 bool persistent = false;
943 struct ctdb_db_context *ctdb_db;
944 struct ctdb_rec_data *r;
946 struct vacuum_info *v;
948 recs = (struct ctdb_marshall_buffer *)data.dptr;
949 r = (struct ctdb_rec_data *)&recs->data[0];
/* nothing to do for an empty record buffer */
951 if (recs->count == 0) {
952 talloc_free(tmp_ctx);
/* dedupe: ignore the request if a job for this source node and
 * database is already in flight */
958 for (v=rec->vacuum_info;v;v=v->next) {
959 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
960 /* we're already working on records from this node */
961 talloc_free(tmp_ctx);
966 /* work out if the database is persistent */
967 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
969 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
970 talloc_free(tmp_ctx);
974 for (i=0;i<dbmap->num;i++) {
975 if (dbmap->dbs[i].dbid == recs->db_id) {
976 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
980 if (i == dbmap->num) {
981 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
982 talloc_free(tmp_ctx);
986 /* find the name of this database */
987 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
988 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
989 talloc_free(tmp_ctx);
/* attach to (or open) the database locally */
994 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
995 if (ctdb_db == NULL) {
996 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
997 talloc_free(tmp_ctx);
1001 v = talloc_zero(rec, struct vacuum_info);
1003 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1004 talloc_free(tmp_ctx);
1009 v->srcnode = srcnode;
1010 v->ctdb_db = ctdb_db;
/* take a private copy of the record buffer; the message data is
 * owned by the caller */
1011 v->recs = talloc_memdup(v, recs, data.dsize);
1012 if (v->recs == NULL) {
1013 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1015 talloc_free(tmp_ctx);
1018 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1020 DLIST_ADD(rec->vacuum_info, v);
1022 talloc_set_destructor(v, vacuum_info_destructor);
1024 vacuum_fetch_next(v);
1025 talloc_free(tmp_ctx);
/* Timed-event handler for ctdb_wait_timeout: flips the caller's
 * timed_out flag (the assignment line is not visible in this
 * extract). */
1030 called when ctdb_wait_timeout should finish
1032 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1033 struct timeval yt, void *p)
1035 uint32_t *timed_out = (uint32_t *)p;
/* Block for `secs` seconds (fractional allowed) by spinning the
 * event loop until the one-shot timer above fires. */
1040 wait for a given number of seconds
1042 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1044 uint32_t timed_out = 0;
1045 time_t usecs = (secs - (time_t)secs) * 1000000;
1046 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1047 while (!timed_out) {
1048 event_loop_once(ctdb->ev);
/* Timed-event handler: the election window has expired; clear
 * rec->election_timeout so ctdb_wait_election can return. */
1053 called when an election times out (ends)
1055 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1056 struct timeval t, void *p)
1058 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1059 rec->election_timeout = NULL;
1062 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
/* Spin the event loop until the election timer (reset on every
 * received election packet) finally expires. */
1067 wait for an election to finish. It finished election_timeout seconds after
1068 the last election packet is received
1070 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1072 struct ctdb_context *ctdb = rec->ctdb;
1073 while (rec->election_timeout) {
1074 event_loop_once(ctdb->ev);
/* Recmaster-only: reconcile our cached node flags with what each
 * connected remote node reports about itself.  A node we cannot
 * query is charged a culprit credit and the pass fails with
 * MONITOR_FAILED.  On mismatch we push the remote node's own view of
 * its flags to the whole cluster via MODFLAGS and update our local
 * nodemap copy. */
1079 Update our local flags from all remote connected nodes.
1080 This is only run when we are or we belive we are the recovery master
1082 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1085 struct ctdb_context *ctdb = rec->ctdb;
1086 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1088 /* get the nodemap for all active remote nodes and verify
1089 they are the same as for this node
1091 for (j=0; j<nodemap->num; j++) {
1092 struct ctdb_node_map *remote_nodemap=NULL;
/* skip disconnected nodes and ourselves */
1095 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1098 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1102 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1103 mem_ctx, &remote_nodemap);
1105 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1106 nodemap->nodes[j].pnn));
1107 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1108 talloc_free(mem_ctx);
1109 return MONITOR_FAILED;
1111 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1112 /* We should tell our daemon about this so it
1113 updates its flags or else we will log the same
1114 message again in the next iteration of recovery.
1115 Since we are the recovery master we can just as
1116 well update the flags on all nodes.
1118 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1120 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1124 /* Update our local copy of the flags in the recovery
1127 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1128 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1129 nodemap->nodes[j].flags));
1130 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1132 talloc_free(remote_nodemap);
1134 talloc_free(mem_ctx);
/* Generate a random recovery generation id, retrying until the value
 * differs from INVALID_GENERATION. */
1139 /* Create a new random generation ip.
1140 The generation id can not be the INVALID_GENERATION id
1142 static uint32_t new_generation(void)
1144 uint32_t generation;
1147 generation = random();
1149 if (generation != INVALID_GENERATION) {
/* Create the temporary recovery database (recdb.tdb.<pnn-ish suffix>)
 * under the state directory.  Opened O_EXCL with NOLOCK (single
 * writer), INCOMPATIBLE_HASH and DISALLOW_NESTING; NOMMAP is added
 * under valgrind.  Returns the tdb_wrap handle or NULL on failure. */
1159 create a temporary working database
1161 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1164 struct tdb_wrap *recdb;
1167 /* open up the temporary recovery database */
1168 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1169 ctdb->db_directory_state,
1176 tdb_flags = TDB_NOLOCK;
1177 if (ctdb->valgrinding) {
1178 tdb_flags |= TDB_NOMMAP;
1180 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1182 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1183 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1184 if (recdb == NULL) {
1185 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
/* Traverse state for traverse_recdb: the growing marshall buffer,
 * its current and allocated lengths, plus persistent/failed flags.
 * NOTE(review): the "struct recdb_data {" opening line and several
 * members are missing from this extract. */
1195 a traverse function for pulling all relevant records from recdb
1198 struct ctdb_context *ctdb;
1199 struct ctdb_marshall_buffer *recdata;
1201 uint32_t allocated_len;
/* tdb_traverse_read callback: append one recdb record to the marshall
 * buffer being built for PUSH_DB.  Non-persistent databases skip
 * header-only (deleted) records and get their dmaster rewritten to
 * this node; persistent databases keep records verbatim (the long
 * comment below explains why deleting empty persistent records would
 * corrupt data).  The buffer is grown in pulldb_preallocation_size
 * chunks as needed; allocation failure sets params->failed. */
1206 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1208 struct recdb_data *params = (struct recdb_data *)p;
1209 struct ctdb_rec_data *rec;
1210 struct ctdb_ltdb_header *hdr;
1213 * skip empty records - but NOT for persistent databases:
1215 * The record-by-record mode of recovery deletes empty records.
1216 * For persistent databases, this can lead to data corruption
1217 * by deleting records that should be there:
1219 * - Assume the cluster has been running for a while.
1221 * - A record R in a persistent database has been created and
1222 * deleted a couple of times, the last operation being deletion,
1223 * leaving an empty record with a high RSN, say 10.
1225 * - Now a node N is turned off.
1227 * - This leaves the local database copy of D on N with the empty
1228 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1229 * the copy of record R.
1231 * - Now the record is created again while node N is turned off.
1232 * This creates R with RSN = 1 on all nodes except for N.
1234 * - Now node N is turned on again. The following recovery will chose
1235 * the older empty copy of R due to RSN 10 > RSN 1.
1237 * ==> Hence the record is gone after the recovery.
1239 * On databases like Samba's registry, this can damage the higher-level
1240 * data structures built from the various tdb-level records.
1242 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1246 /* update the dmaster field to point to us */
1247 hdr = (struct ctdb_ltdb_header *)data.dptr;
1248 if (!params->persistent) {
1249 hdr->dmaster = params->ctdb->pnn;
1250 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1253 /* add the record to the blob ready to send to the nodes */
1254 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1256 params->failed = true;
/* grow the marshall buffer in preallocation-sized chunks */
1259 if (params->len + rec->length >= params->allocated_len) {
1260 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1261 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1263 if (params->recdata == NULL) {
1264 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1265 rec->length + params->len));
1266 params->failed = true;
1269 params->recdata->count++;
1270 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1271 params->len += rec->length;
/* Push the assembled recdb out to all active nodes: traverse the
 * recdb with traverse_recdb to build a marshall buffer, then
 * broadcast it via PUSH_DB.  Traverse or marshalling failure aborts
 * with the buffer freed. */
1278 push the recdb database out to all nodes
1280 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1282 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1284 struct recdb_data params;
1285 struct ctdb_marshall_buffer *recdata;
1287 TALLOC_CTX *tmp_ctx;
1290 tmp_ctx = talloc_new(ctdb);
1291 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1293 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1294 CTDB_NO_MEMORY(ctdb, recdata);
1296 recdata->db_id = dbid;
1299 params.recdata = recdata;
1300 params.len = offsetof(struct ctdb_marshall_buffer, data);
1301 params.allocated_len = params.len;
1302 params.failed = false;
1303 params.persistent = persistent;
/* NOTE(review): "¶ms" below is mis-encoded "&params" — a source
 * encoding artefact that should be repaired in the canonical file. */
1305 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1306 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1307 talloc_free(params.recdata);
1308 talloc_free(tmp_ctx);
1312 if (params.failed) {
1313 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1314 talloc_free(params.recdata);
1315 talloc_free(tmp_ctx);
/* traverse_recdb may have reallocated the buffer; pick up the
 * current pointer */
1319 recdata = params.recdata;
1321 outdata.dptr = (void *)recdata;
1322 outdata.dsize = params.len;
1324 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1325 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1327 CONTROL_TIMEOUT(), false, outdata,
1330 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1331 talloc_free(recdata);
1332 talloc_free(tmp_ctx);
1336 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1337 dbid, recdata->count));
1339 talloc_free(recdata);
1340 talloc_free(tmp_ctx);
1347 go through a full recovery on one database
/*
 * Recover a single database cluster-wide:
 *   1. create a temporary local recovery db (create_recdb),
 *   2. pull every remote node's copy into it (pull_remote_database),
 *   3. wipe the database on all active nodes — safe because the caller
 *      has already started a cluster-wide transaction (transaction_id),
 *   4. push the merged copy back out (push_recdb_database), which also
 *      rewrites dmaster and drops empty records.
 * Error-path return statements fall in lines elided from this extract.
 */
1349 static int recover_database(struct ctdb_recoverd *rec,
1350 TALLOC_CTX *mem_ctx,
1354 struct ctdb_node_map *nodemap,
1355 uint32_t transaction_id)
1357 struct tdb_wrap *recdb;
1359 struct ctdb_context *ctdb = rec->ctdb;
1361 struct ctdb_control_wipe_database w;
1364 recdb = create_recdb(ctdb, mem_ctx);
1365 if (recdb == NULL) {
1369 /* pull all remote databases onto the recdb */
1370 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1372 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1376 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1378 /* wipe all the remote databases. This is safe as we are in a transaction */
1380 w.transaction_id = transaction_id;
1382 data.dptr = (void *)&w;
1383 data.dsize = sizeof(w);
/* broadcast the wipe to every active node; recdb doubles as talloc parent */
1385 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1386 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1388 CONTROL_TIMEOUT(), false, data,
1391 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1396 /* push out the correct database. This sets the dmaster and skips
1397 the empty records */
1398 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1404 /* all done with this database */
1411 reload the nodes file
/* Thin wrapper: re-read the on-disk nodes file into the ctdb context. */
1413 static void reload_nodes_file(struct ctdb_context *ctdb)
1416 ctdb_load_nodes_file(ctdb);
/*
 * Refresh the cached public IP information for every node in nodemap:
 * for each active node, fetch both its list of known public IPs and its
 * list of currently-available public IPs (CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE)
 * into ctdb->nodes[j]->known_public_ips / ->available_public_ips.
 * Inactive nodes just get their stale lists freed.
 * On any failure, *culprit is set to the offending node's pnn so the
 * caller can assign blame; return statements fall in elided lines.
 * Side effect: if IP checking is enabled and the node's actual allocation
 * disagrees with expectation, rec->need_takeover_run is set.
 */
1419 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1420 struct ctdb_recoverd *rec,
1421 struct ctdb_node_map *nodemap,
/* sanity: local node list and the supplied nodemap must be the same size */
1427 if (ctdb->num_nodes != nodemap->num) {
1428 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1429 ctdb->num_nodes, nodemap->num));
1431 *culprit = ctdb->pnn;
1436 for (j=0; j<nodemap->num; j++) {
1437 /* For readability */
1438 struct ctdb_node *node = ctdb->nodes[j];
1440 /* release any existing data */
1441 if (node->known_public_ips) {
1442 talloc_free(node->known_public_ips);
1443 node->known_public_ips = NULL;
1445 if (node->available_public_ips) {
1446 talloc_free(node->available_public_ips);
1447 node->available_public_ips = NULL;
/* inactive nodes keep NULL lists — nothing to fetch */
1450 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1454 /* Retrieve the list of known public IPs from the node */
1455 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1460 &node->known_public_ips);
1463 ("Failed to read known public IPs from node: %u\n",
1466 *culprit = node->pnn;
/* compare the node's real IP allocation against what we expect;
 * a mismatch schedules a takeover run rather than fixing it inline */
1471 if (ctdb->do_checkpublicip &&
1472 (rec->ip_check_disable_ctx == NULL) &&
1473 verify_remote_ip_allocation(ctdb,
1474 node->known_public_ips,
1476 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1477 rec->need_takeover_run = true;
1480 /* Retrieve the list of available public IPs from the node */
1481 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1485 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1486 &node->available_public_ips);
1489 ("Failed to read available public IPs from node: %u\n",
1492 *culprit = node->pnn;
1501 /* when we start a recovery, make sure all nodes use the same reclock file
/*
 * Broadcast the local recovery-lock file path (as a NUL-terminated string)
 * to every active node via CTDB_CONTROL_SET_RECLOCK_FILE so the whole
 * cluster agrees on one reclock file.  No-op branch when no reclock file
 * is configured (body of that branch is in elided lines).
 */
1504 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1506 struct ctdb_context *ctdb = rec->ctdb;
1507 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1511 if (ctdb->recovery_lock_file == NULL) {
/* include the trailing NUL so receivers get a proper C string */
1515 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1516 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1519 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1520 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1526 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1527 talloc_free(tmp_ctx);
1531 talloc_free(tmp_ctx);
1537 * this callback is called for every node that failed to execute ctdb_takeover_run()
1538 * and set flag to re-run takeover run.
/*
 * Async-control failure callback for takeover runs.  If callback_data is
 * a ctdb_recoverd (i.e. the caller asked for banning credits on failure),
 * the failing node is marked as recovery culprit; with NULL callback_data
 * only the error is logged.
 */
1540 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1542 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1544 if (callback_data != NULL) {
1545 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1547 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1549 ctdb_set_culprit(rec, node_pnn);
/*
 * Ban any node whose accumulated banning credits have reached the
 * threshold (2 * number of nodes).  Banned nodes get recovery_ban_period
 * seconds and their credit counter is reset.  *self_ban is presumably set
 * when the local node bans itself (the assignment is in elided lines) —
 * TODO confirm against full source.
 */
1554 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1556 struct ctdb_context *ctdb = rec->ctdb;
1558 struct ctdb_banning_state *ban_state;
1561 for (i=0; i<ctdb->num_nodes; i++) {
/* nodes with no ban state have never misbehaved — skip */
1562 if (ctdb->nodes[i]->ban_state == NULL) {
1565 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* below threshold: leave the credits accumulating */
1566 if (ban_state->count < 2*ctdb->num_nodes) {
1570 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1571 ctdb->nodes[i]->pnn, ban_state->count,
1572 ctdb->tunable.recovery_ban_period));
1573 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1574 ban_state->count = 0;
1576 /* Banning ourself? */
1577 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/*
 * Run a public-IP takeover (reallocation) across the cluster.
 * Guards against re-entrancy via rec->takeover_run_in_progress, and
 * temporarily disables remote IP checking (CTDB_SRVID_DISABLE_IP_CHECK
 * broadcast with takeover_timeout, re-broadcast with 0 afterwards) so
 * other nodes don't trigger competing takeover runs mid-flight.
 * banning_credits_on_fail decides whether failing nodes are blamed via
 * takeover_fail_callback.  Returns the success flag `ok` (final return
 * is in elided lines); rec->need_takeover_run is left set on failure.
 */
1583 static bool do_takeover_run(struct ctdb_recoverd *rec,
1584 struct ctdb_node_map *nodemap,
1585 bool banning_credits_on_fail)
1587 uint32_t disable_timeout;
1592 if (rec->takeover_run_in_progress) {
1593 DEBUG(DEBUG_ERR, (__location__
1594 " takeover run already in progress \n"));
1599 /* Disable IP checks while doing this takeover run. This will
1600 * stop those other nodes from triggering takeover runs when
1601 * think they should be hosting an IP but it isn't yet on an
1604 data.dptr = (uint8_t*)&disable_timeout;
1605 data.dsize = sizeof(disable_timeout);
1607 disable_timeout = rec->ctdb->tunable.takeover_timeout;
1608 if (ctdb_client_send_message(rec->ctdb, CTDB_BROADCAST_CONNECTED,
1609 CTDB_SRVID_DISABLE_IP_CHECK,
/* failure to disable is only informational — the run proceeds anyway */
1611 DEBUG(DEBUG_INFO,("Failed to disable IP check\n"));
1614 rec->takeover_run_in_progress = true;
1616 ret = ctdb_takeover_run(rec->ctdb, nodemap, takeover_fail_callback,
1617 banning_credits_on_fail ? rec : NULL);
1619 /* Reenable IP checks */
1620 disable_timeout = 0;
1621 if (ctdb_client_send_message(rec->ctdb, CTDB_BROADCAST_CONNECTED,
1622 CTDB_SRVID_DISABLE_IP_CHECK,
1624 DEBUG(DEBUG_INFO,("Failed to reenable IP check\n"));
1628 DEBUG(DEBUG_ERR, ("IP reallocation failed\n"));
/* remember whether another run is still needed, clear the in-progress flag */
1635 rec->need_takeover_run = !ok;
1636 rec->takeover_run_in_progress = false;
1642 we are the recmaster, and recovery is needed - start a recovery run
/*
 * Main recovery driver, executed only on the recovery master.
 * Visible phases, in order:
 *   1. ban misbehaving nodes / take the recovery lock (if configured),
 *   2. enumerate databases and create any missing ones locally/remotely,
 *   3. sync reclock file setting, set CTDB_RECOVERY_ACTIVE everywhere,
 *      run the "startrecovery" event,
 *   4. propagate node flags, bump the generation locally only (so an
 *      aborted recovery simply re-triggers),
 *   5. start a cluster-wide transaction, recover_database() each db,
 *      commit (or cancel on failure),
 *   6. rebuild and distribute a new vnnmap of lmaster-capable active
 *      nodes, re-assert recmaster, drop back to CTDB_RECOVERY_NORMAL,
 *   7. refresh public-IP state, do a takeover run, run the "recovered"
 *      event, broadcast CTDB_SRVID_RECONFIGURE to clients,
 *   8. reset all ban counters and sleep rerecovery_timeout to suppress
 *      immediate re-recovery.
 * Many error-path `return` statements and closing braces fall in lines
 * elided from this extract.
 */
1644 static int do_recovery(struct ctdb_recoverd *rec,
1645 TALLOC_CTX *mem_ctx, uint32_t pnn,
1646 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1648 struct ctdb_context *ctdb = rec->ctdb;
1650 uint32_t generation;
1651 struct ctdb_dbid_map *dbmap;
1654 struct timeval start_time;
1655 uint32_t culprit = (uint32_t)-1;
1658 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1660 /* if recovery fails, force it again */
1661 rec->need_recovery = true;
1663 ban_misbehaving_nodes(rec, &self_ban);
1665 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
/* optionally serialize recoveries cluster-wide via the reclock file */
1669 if (ctdb->tunable.verify_recovery_lock != 0) {
1670 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1671 start_time = timeval_current();
1672 if (!ctdb_recovery_lock(ctdb, true)) {
1673 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1674 "and ban ourself for %u seconds\n",
1675 ctdb->tunable.recovery_ban_period));
1676 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1679 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1680 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1683 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1685 /* get a list of all databases */
1686 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1688 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1692 /* we do the db creation before we set the recovery mode, so the freeze happens
1693 on all databases we will be dealing with. */
1695 /* verify that we have all the databases any other node has */
1696 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1698 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1702 /* verify that all other nodes have all our databases */
1703 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1705 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1708 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1710 /* update the database priority for all remote databases */
1711 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1713 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1715 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1718 /* update all other nodes to use the same setting for reclock files
1719 as the local recovery master.
1721 sync_recovery_lock_file_across_cluster(rec);
1723 /* set recovery mode to active on all nodes */
1724 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1726 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1730 /* execute the "startrecovery" event script on all nodes */
1731 ret = run_startrecovery_eventscript(rec, nodemap);
1733 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1738 update all nodes to have the same flags that we have
1740 for (i=0;i<nodemap->num;i++) {
1741 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1745 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1747 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1752 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1754 /* pick a new generation number */
1755 generation = new_generation();
1757 /* change the vnnmap on this node to use the new generation
1758 number but not on any other nodes.
1759 this guarantees that if we abort the recovery prematurely
1760 for some reason (a node stops responding?)
1761 that we can just return immediately and we will reenter
1762 recovery shortly again.
1763 I.e. we deliberately leave the cluster with an inconsistent
1764 generation id to allow us to abort recovery at any stage and
1765 just restart it from scratch.
1767 vnnmap->generation = generation;
1768 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1770 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* the transaction id for start/commit is the new generation number */
1774 data.dptr = (void *)&generation;
1775 data.dsize = sizeof(uint32_t);
1777 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1778 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1780 CONTROL_TIMEOUT(), false, data,
1782 transaction_start_fail_callback,
1784 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
/* best-effort rollback: cancel the transaction on whichever nodes started it */
1785 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1787 CONTROL_TIMEOUT(), false, tdb_null,
1791 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1796 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* recover every database inside the cluster-wide transaction */
1798 for (i=0;i<dbmap->num;i++) {
1799 ret = recover_database(rec, mem_ctx,
1801 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1802 pnn, nodemap, generation);
1804 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1809 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1811 /* commit all the changes */
1812 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1814 CONTROL_TIMEOUT(), false, data,
1817 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1821 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1824 /* update the capabilities for all nodes */
1825 ret = update_capabilities(ctdb, nodemap);
1827 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1831 /* build a new vnn map with all the currently active and
1833 generation = new_generation();
1834 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1835 CTDB_NO_MEMORY(ctdb, vnnmap);
1836 vnnmap->generation = generation;
1838 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1839 CTDB_NO_MEMORY(ctdb, vnnmap->map);
/* only active nodes with the LMASTER capability join the new vnnmap */
1840 for (i=j=0;i<nodemap->num;i++) {
1841 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1844 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1845 /* this node can not be an lmaster */
1846 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1851 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1852 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1853 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* degenerate case: no lmaster-capable node — the recmaster serves alone */
1856 if (vnnmap->size == 0) {
1857 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1859 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1860 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1861 vnnmap->map[0] = pnn;
1864 /* update to the new vnnmap on all nodes */
1865 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1867 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1871 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1873 /* update recmaster to point to us for all nodes */
1874 ret = set_recovery_master(ctdb, nodemap, pnn);
1876 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1880 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1883 update all nodes to have the same flags that we have
1885 for (i=0;i<nodemap->num;i++) {
1886 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1890 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1892 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1897 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1899 /* disable recovery mode */
1900 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1902 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1906 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1908 /* Fetch known/available public IPs from each active node */
1909 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1911 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1913 rec->need_takeover_run = true;
/* false => takeover failures here do not hand out banning credits */
1917 do_takeover_run(rec, nodemap, false);
1919 /* execute the "recovered" event script on all nodes */
1920 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
1922 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1926 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1928 /* send a message to all clients telling them that the cluster
1929 has been reconfigured */
1930 ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1932 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1934 rec->need_recovery = false;
1936 /* we managed to complete a full recovery, make sure to forgive
1937 any past sins by the nodes that could now participate in the
1940 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1941 for (i=0;i<nodemap->num;i++) {
1942 struct ctdb_banning_state *ban_state;
1944 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1948 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1949 if (ban_state == NULL) {
1953 ban_state->count = 0;
1957 /* We just finished a recovery successfully.
1958 We now wait for rerecovery_timeout before we allow
1959 another recovery to take place.
1961 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1962 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1963 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
1970 elections are won by first checking the number of connected nodes, then
1971 the priority time, then the pnn
/* Payload broadcast on CTDB_SRVID_RECOVERY during recmaster elections;
 * compared field-by-field in ctdb_election_win(). */
1973 struct election_message {
1974 uint32_t num_connected;
/* daemon start time: older (smaller) wins the second tie-break */
1975 struct timeval priority_time;
/* sender's node flags; BANNED/STOPPED disqualify the sender outright */
1977 uint32_t node_flags;
1981 form this nodes election data
/*
 * Fill *em with this node's election credentials: pnn, daemon start
 * time, node flags (also cached into rec->node_flags) and the count of
 * currently-connected nodes.  A node lacking CTDB_CAP_RECMASTER zeroes
 * its connection count and resets priority_time to "now" so it loses
 * every comparison.
 */
1983 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1986 struct ctdb_node_map *nodemap;
1987 struct ctdb_context *ctdb = rec->ctdb;
1991 em->pnn = rec->ctdb->pnn;
1992 em->priority_time = rec->priority_time;
1994 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1996 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
2000 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2001 em->node_flags = rec->node_flags;
2003 for (i=0;i<nodemap->num;i++) {
2004 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2005 em->num_connected++;
2009 /* we shouldnt try to win this election if we cant be a recmaster */
2010 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2011 em->num_connected = 0;
2012 em->priority_time = timeval_current();
2015 talloc_free(nodemap);
2019 see if the given election data wins
/*
 * Decide whether the local node beats the election data in *em.
 * Disqualifiers first: we lose if we lack CTDB_CAP_RECMASTER or are
 * banned/stopped; we win automatically if the other node is banned or
 * stopped.  Otherwise compare, in order: connected-node count (more
 * wins), priority_time (longer-running, i.e. earlier time, wins), and
 * finally pnn as the last tie-break.  The returns for each comparison
 * are in lines elided from this extract.
 */
2021 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2023 struct election_message myem;
2026 ctdb_election_data(rec, &myem);
2028 /* we cant win if we dont have the recmaster capability */
2029 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2033 /* we cant win if we are banned */
2034 if (rec->node_flags & NODE_FLAGS_BANNED) {
2038 /* we cant win if we are stopped */
2039 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2043 /* we will automatically win if the other node is banned */
2044 if (em->node_flags & NODE_FLAGS_BANNED) {
2048 /* we will automatically win if the other node is banned */
2049 if (em->node_flags & NODE_FLAGS_STOPPED) {
2053 /* try to use the most connected node */
2055 cmp = (int)myem.num_connected - (int)em->num_connected;
2058 /* then the longest running node */
2060 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break on node number */
2064 cmp = (int)myem.pnn - (int)em->pnn;
2071 send out an election request
/*
 * Broadcast our election_message to all nodes on CTDB_SRVID_RECOVERY.
 * When update_recmaster is true we also optimistically record ourselves
 * as recmaster on the local node (pnn) — assuming we will win — so that
 * a newly joined, already-frozen node sees a recmaster immediately.
 */
2073 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
2076 TDB_DATA election_data;
2077 struct election_message emsg;
2079 struct ctdb_context *ctdb = rec->ctdb;
2081 srvid = CTDB_SRVID_RECOVERY;
2083 ctdb_election_data(rec, &emsg);
2085 election_data.dsize = sizeof(struct election_message);
2086 election_data.dptr = (unsigned char *)&emsg;
2089 /* send an election message to all active nodes */
2090 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2091 ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2094 /* A new node that is already frozen has entered the cluster.
2095 The existing nodes are not frozen and dont need to be frozen
2096 until the election has ended and we start the actual recovery
2098 if (update_recmaster == true) {
2099 /* first we assume we will win the election and set
2100 recoverymaster to be ourself on the current node
2102 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2104 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2114 this function will unban all nodes in the cluster
/*
 * Clear the NODE_FLAGS_BANNED flag (via ctdb_ctrl_modflags) on every
 * node that is connected and currently banned.  Used when recmaster
 * duties move, so stale bans don't outlive the master that issued them.
 */
2116 static void unban_all_nodes(struct ctdb_context *ctdb)
2119 struct ctdb_node_map *nodemap;
2120 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2122 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2124 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2128 for (i=0;i<nodemap->num;i++) {
2129 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2130 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2131 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
2135 talloc_free(tmp_ctx);
2140 we think we are winning the election - send a broadcast election request
/*
 * Timed-event callback (rec->send_election_te): re-broadcast our election
 * request without touching the recmaster setting (update_recmaster=false),
 * then free the one-shot timer handle.
 */
2142 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2144 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2147 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
2149 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2152 talloc_free(rec->send_election_te);
2153 rec->send_election_te = NULL;
2157 handler for memory dumps
/*
 * Message handler: a client asked the recovery daemon for a memory dump.
 * data carries a rd_memdump_reply (pnn + srvid return address); we
 * produce the talloc report via ctdb_dump_memory() and send the blob
 * back to that address.  All temporaries hang off tmp_ctx, freed on
 * every exit path.
 */
2159 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2160 TDB_DATA data, void *private_data)
2162 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2165 struct rd_memdump_reply *rd;
2167 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2168 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2169 talloc_free(tmp_ctx);
2172 rd = (struct rd_memdump_reply *)data.dptr;
2174 dump = talloc_zero(tmp_ctx, TDB_DATA);
2176 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2177 talloc_free(tmp_ctx);
2180 ret = ctdb_dump_memory(ctdb, dump);
2182 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2183 talloc_free(tmp_ctx);
2187 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* reply straight to the requester's (pnn, srvid) return address */
2189 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2191 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2192 talloc_free(tmp_ctx);
2196 talloc_free(tmp_ctx);
/*
 * Message handler: collect the in-memory log ringbuffer for a requester.
 * data carries a ctdb_get_log_addr return address.  The collection is
 * done in a forked child (fork variant that preserves the ringbuffer),
 * which switches itself into client mode before calling
 * ctdb_collect_log(); the child's exit is in lines elided here.
 */
2202 static void getlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2203 TDB_DATA data, void *private_data)
2205 struct ctdb_get_log_addr *log_addr;
2208 if (data.dsize != sizeof(struct ctdb_get_log_addr)) {
2209 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2212 log_addr = (struct ctdb_get_log_addr *)data.dptr;
2214 child = ctdb_fork_no_free_ringbuffer(ctdb);
2215 if (child == (pid_t)-1) {
2216 DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
/* child-only path from here down */
2221 ctdb_set_process_name("ctdb_rec_log_collector");
2222 if (switch_from_server_to_client(ctdb, "recoverd-log-collector") != 0) {
2223 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
2226 ctdb_collect_log(ctdb, log_addr);
2232 handler for clearlog
/* Message handler: wipe the in-memory log ringbuffer on request. */
2234 static void clearlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2235 TDB_DATA data, void *private_data)
2237 ctdb_clear_log(ctdb);
2241 handler for reload_nodes
/* Message handler: re-read the nodes file when asked (e.g. by
 * "ctdb reloadnodes"); message payload is unused. */
2243 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2244 TDB_DATA data, void *private_data)
2246 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2248 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2250 reload_nodes_file(rec->ctdb);
/* Timed-event callback: the IP-check disable window expired — free the
 * context whose non-NULL-ness is what suppresses IP verification. */
2254 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
2255 struct timeval yt, void *p)
2257 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2259 talloc_free(rec->ip_check_disable_ctx);
2260 rec->ip_check_disable_ctx = NULL;
/* Timed-event callback: the deferred-rebalance delay elapsed — run a
 * takeover (no banning credits) and drop the deferral context. */
2264 static void ctdb_rebalance_timeout(struct event_context *ev, struct timed_event *te,
2265 struct timeval t, void *p)
2267 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2270 ("Rebalance all nodes that have had ip assignment changes.\n"));
2272 do_takeover_run(rec, rec->nodemap, false);
2274 talloc_free(rec->deferred_rebalance_ctx);
2275 rec->deferred_rebalance_ctx = NULL;
/*
 * Message handler: a node (identified by the uint32_t pnn in data)
 * joined and wants IP rebalancing.  Forces LCP2 rebalancing state for
 * that node, then (re)arms a deferred takeover-run timer of
 * deferred_rebalance_on_node_add seconds — replacing any timer already
 * pending, so repeated joins coalesce into one rebalance.
 */
2279 static void recd_node_rebalance_handler(struct ctdb_context *ctdb, uint64_t srvid,
2280 TDB_DATA data, void *private_data)
2283 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2285 if (data.dsize != sizeof(uint32_t)) {
2286 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
/* feature disabled via tunable — ignore the message */
2290 if (ctdb->tunable.deferred_rebalance_on_node_add == 0) {
2294 pnn = *(uint32_t *)&data.dptr[0];
2296 lcp2_forcerebalance(ctdb, pnn);
2297 DEBUG(DEBUG_NOTICE,("Received message to perform node rebalancing for node %d\n", pnn));
/* restart the deferral window if one is already running */
2299 if (rec->deferred_rebalance_ctx != NULL) {
2300 talloc_free(rec->deferred_rebalance_ctx);
2302 rec->deferred_rebalance_ctx = talloc_new(rec);
2303 event_add_timed(ctdb->ev, rec->deferred_rebalance_ctx,
2304 timeval_current_ofs(ctdb->tunable.deferred_rebalance_on_node_add, 0),
2305 ctdb_rebalance_timeout, rec);
/*
 * Message handler: a node reports a public-IP assignment change
 * (ctdb_public_ip payload).  Only the recmaster records it, into the
 * IP assignment tree used by later takeover calculations.
 */
2310 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2311 TDB_DATA data, void *private_data)
2313 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2314 struct ctdb_public_ip *ip;
2316 if (rec->recmaster != rec->ctdb->pnn) {
2317 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2321 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2322 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2326 ip = (struct ctdb_public_ip *)data.dptr;
2328 update_ip_assignment_tree(rec->ctdb, ip);
/*
 * Message handler for CTDB_SRVID_DISABLE_IP_CHECK: suspend local IP
 * verification for the uint32_t timeout (seconds) in data.  Any existing
 * disable window is cancelled first; timeout 0 just re-enables checking.
 * A non-zero timeout allocates rec->ip_check_disable_ctx (whose presence
 * is the "disabled" flag) and arms reenable_ip_check() to clear it.
 */
2332 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2333 TDB_DATA data, void *private_data)
2335 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2338 if (rec->ip_check_disable_ctx != NULL) {
2339 talloc_free(rec->ip_check_disable_ctx);
2340 rec->ip_check_disable_ctx = NULL;
2343 if (data.dsize != sizeof(uint32_t)) {
2344 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2345 "expexting %lu\n", (long unsigned)data.dsize,
2346 (long unsigned)sizeof(uint32_t)));
2349 if (data.dptr == NULL) {
2350 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
2354 timeout = *((uint32_t *)data.dptr);
2357 DEBUG(DEBUG_NOTICE,("Reenabling ip check\n"));
2361 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
2363 rec->ip_check_disable_ctx = talloc_new(rec);
2364 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
2366 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
2371 handler for reload all ips.
/*
 * Message handler: queue a "reload all public IPs" request.  The payload
 * (a reloadips_all_reply return address) is stolen onto rec and stashed
 * in the file-global reload_all_ips_request, to be serviced later from
 * the monitoring loop rather than inside this handler.
 */
2373 static void ip_reloadall_handler(struct ctdb_context *ctdb, uint64_t srvid,
2374 TDB_DATA data, void *private_data)
2376 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2378 if (data.dsize != sizeof(struct reloadips_all_reply)) {
2379 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2383 reload_all_ips_request = (struct reloadips_all_reply *)talloc_steal(rec, data.dptr);
2385 DEBUG(DEBUG_NOTICE,("RELOAD_ALL_IPS message received from node:%d srvid:%d\n", reload_all_ips_request->pnn, (int)reload_all_ips_request->srvid));
/* Per-node failure callback for the RELOAD_PUBLIC_IPS broadcast in
 * reload_all_ips(); records failure into the shared status word. */
2389 static void async_reloadips_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2391 uint32_t *status = callback_data;
2394 DEBUG(DEBUG_ERR,("Reload ips all failed on node %d\n", node_pnn));
/*
 * Service a queued RELOAD_ALL_IPS request (rips is its return address).
 * Refuses to run unless every node is fully up and healthy (flags == 0),
 * then broadcasts CTDB_CONTROL_RELOAD_PUBLIC_IPS to all connected nodes
 * and finally acks the original requester with an empty message to
 * rips->pnn / rips->srvid.  Error-path returns are in elided lines.
 * NOTE(review): the return type and some parameters of this function
 * fall in lines elided from this extract.
 */
2400 reload_all_ips(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, struct reloadips_all_reply *rips)
2402 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2407 DEBUG(DEBUG_ERR,("RELOAD ALL IPS on all active nodes\n"));
/* require a completely healthy cluster before touching IP config */
2408 for (i = 0; i< nodemap->num; i++) {
2409 if (nodemap->nodes[i].flags != 0) {
2410 DEBUG(DEBUG_ERR, ("Can not reload ips on all nodes. Node %d is not up and healthy\n", i));
2411 talloc_free(tmp_ctx);
2416 /* send the flags update to all connected nodes */
2417 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2419 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RELOAD_PUBLIC_IPS,
2423 async_reloadips_callback, NULL,
2425 DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
2426 talloc_free(tmp_ctx);
2431 DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
2432 talloc_free(tmp_ctx);
/* tell the requester we are done (empty payload = ack) */
2436 ctdb_client_send_message(ctdb, rips->pnn, rips->srvid, tdb_null);
2438 talloc_free(tmp_ctx);
2444 handler for ip reallocate, just add it to the list of callers and
2445 handle this later in the monitor_cluster loop so we do not recurse
2446 with other callers to takeover_run()
/*
 * Queue an "ipreallocate" request: validate the rd_memdump_reply return
 * address, lazily create rec->ip_reallocate_ctx, and push a new
 * ip_reallocate_list entry onto rec->reallocate_callers (LIFO).  The
 * queue is drained by process_ipreallocate_requests().
 */
2448 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2449 TDB_DATA data, void *private_data)
2451 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2452 struct ip_reallocate_list *caller;
2454 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2455 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2459 if (rec->ip_reallocate_ctx == NULL) {
2460 rec->ip_reallocate_ctx = talloc_new(rec);
2461 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
2464 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
2465 CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* take ownership of the return address and link at list head */
2467 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
2468 caller->next = rec->reallocate_callers;
2469 rec->reallocate_callers = caller;
/*
 * Drain the queue built by ip_reallocate_handler(): refresh remote
 * public-IP state, run one takeover for all queued callers, then send
 * each caller (except those with srvid==0) a reply carrying the int32
 * result.  Finally frees the whole queue by destroying its talloc
 * context.  Called from the monitor loop to avoid recursive
 * takeover_run() invocations.
 */
2474 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
2478 struct ip_reallocate_list *callers;
2481 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2483 /* update the list of public ips that a node can handle for
2486 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2488 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2490 rec->need_takeover_run = true;
2493 if (do_takeover_run(rec, rec->nodemap, false)) {
/* all queued callers get the same shared result value */
2500 result.dsize = sizeof(int32_t);
2501 result.dptr = (uint8_t *)&ret;
2503 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2505 /* Someone that sent srvid==0 does not want a reply */
2506 if (callers->rd->srvid == 0) {
2509 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2510 "%u:%llu\n", (unsigned)callers->rd->pnn,
2511 (unsigned long long)callers->rd->srvid));
2512 ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2514 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2515 "message to %u:%llu\n",
2516 (unsigned)callers->rd->pnn,
2517 (unsigned long long)callers->rd->srvid));
/* freeing the context releases every queued entry at once */
2521 talloc_free(rec->ip_reallocate_ctx);
2522 rec->ip_reallocate_ctx = NULL;
2523 rec->reallocate_callers = NULL;
2528 handler for recovery master elections
/*
 * Message handler for incoming election packets (election_message).
 * Every packet restarts the election timeout.  If we beat the sender
 * (ctdb_election_win) we schedule a (re-)broadcast of our own request
 * after 500ms; otherwise we concede: cancel any pending broadcast,
 * release the recovery lock if the winner is another node (also
 * clearing bans we issued), and record the sender as recmaster locally.
 */
2530 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2531 TDB_DATA data, void *private_data)
2533 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2535 struct election_message *em = (struct election_message *)data.dptr;
2536 TALLOC_CTX *mem_ctx;
2538 /* we got an election packet - update the timeout for the election */
2539 talloc_free(rec->election_timeout);
2540 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2542 timeval_current_ofs(0, 500000) :
2543 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2544 ctdb_election_timeout, rec);
2546 mem_ctx = talloc_new(ctdb);
2548 /* someone called an election. check their election data
2549 and if we disagree and we would rather be the elected node,
2550 send a new election message to all other nodes
2552 if (ctdb_election_win(rec, em)) {
2553 if (!rec->send_election_te) {
2554 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2555 timeval_current_ofs(0, 500000),
2556 election_send_request, rec);
2558 talloc_free(mem_ctx);
2559 /*unban_all_nodes(ctdb);*/
/* we lost: stop advertising ourselves */
2564 talloc_free(rec->send_election_te);
2565 rec->send_election_te = NULL;
2567 if (ctdb->tunable.verify_recovery_lock != 0) {
2568 /* release the recmaster lock */
2569 if (em->pnn != ctdb->pnn &&
2570 ctdb->recovery_lock_fd != -1) {
2571 close(ctdb->recovery_lock_fd);
2572 ctdb->recovery_lock_fd = -1;
2573 unban_all_nodes(ctdb);
2577 /* ok, let that guy become recmaster then */
2578 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2580 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2581 talloc_free(mem_ctx);
2585 talloc_free(mem_ctx);
2591 force the start of the election process
/* Kick off a recmaster election: put the whole cluster into recovery mode
 * (stops internode traffic), arm/refresh the election timeout, broadcast
 * our election request and then block in ctdb_wait_election() collecting
 * responses.
 * NOTE(review): `int ret;` and the error-path braces are elided in this
 * view of the file. */
2593 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2594 			   struct ctdb_node_map *nodemap)
2597 	struct ctdb_context *ctdb = rec->ctdb;
2599 	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2601 	/* set all nodes to recovery mode to stop all internode traffic */
2602 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2604 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* same dual timeout as election_handler: short fast-path vs tunable value */
2608 	talloc_free(rec->election_timeout);
2609 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2611 			timeval_current_ofs(0, 500000) :
2612 			timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2613 			ctdb_election_timeout, rec);
2615 	ret = send_election_request(rec, pnn, true);
2617 		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2621 	/* wait for a few seconds to collect all responses */
2622 	ctdb_wait_election(rec);
2628 handler for when a node changes its flags
/* Message handler invoked when a node's flags change.  Validates the
 * payload, locates the node in a freshly fetched nodemap, applies the new
 * flags locally, and — when we are the recmaster and in normal mode —
 * schedules a takeover run if the DISABLED/unhealthy bits changed (banned/
 * disconnected transitions are handled by the recovery path instead).
 * NOTE(review): elided lines in this view include `int i, ret;`, several
 * early `return`s after talloc_free(tmp_ctx), and the if-conditions guarding
 * the recmaster/recmode checks — confirm against the full source. */
2630 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2631 			    TDB_DATA data, void *private_data)
2634 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2635 	struct ctdb_node_map *nodemap=NULL;
2636 	TALLOC_CTX *tmp_ctx;
2638 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2639 	int disabled_flag_changed;
/* reject malformed payloads before touching any field */
2641 	if (data.dsize != sizeof(*c)) {
2642 		DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2646 	tmp_ctx = talloc_new(ctdb);
2647 	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2649 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2651 		DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2652 		talloc_free(tmp_ctx);
/* find the slot of the node whose flags changed */
2657 	for (i=0;i<nodemap->num;i++) {
2658 		if (nodemap->nodes[i].pnn == c->pnn) break;
2661 	if (i == nodemap->num) {
2662 		DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2663 		talloc_free(tmp_ctx);
2667 	if (c->old_flags != c->new_flags) {
2668 		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* nonzero iff the DISABLED bit differs between our view and the update */
2671 	disabled_flag_changed =  (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2673 	nodemap->nodes[i].flags = c->new_flags;
2675 	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2676 		     CTDB_CURRENT_NODE, &ctdb->recovery_master);
2679 		ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2680 			     CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2684 	    ctdb->recovery_master == ctdb->pnn &&
2685 	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2686 		/* Only do the takeover run if the perm disabled or unhealthy
2687 		   flags changed since these will cause an ip failover but not
2689 		   If the node became disconnected or banned this will also
2690 		   lead to an ip address failover but that is handled
2693 		if (disabled_flag_changed) {
2694 			rec->need_takeover_run = true;
2698 	talloc_free(tmp_ctx);
2702   handler for when we need to push out flag changes to all other nodes
/* Message handler that propagates a node's flag change to the whole
 * cluster: fetch the recmaster, pull the authoritative nodemap from it,
 * sanity-check the target pnn, then broadcast CTDB_CONTROL_MODIFY_FLAGS to
 * all connected nodes.
 * NOTE(review): elided in this view: `int ret; uint32_t recmaster;
 * uint32_t *nodes;`, early returns, and the data/async arguments of the
 * ctdb_client_async_control() call. */
2704 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2705 			       TDB_DATA data, void *private_data)
2708 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2709 	struct ctdb_node_map *nodemap=NULL;
2710 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2714 	/* find the recovery master */
2715 	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2717 		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2718 		talloc_free(tmp_ctx);
2722 	/* read the node flags from the recmaster */
2723 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2725 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2726 		talloc_free(tmp_ctx);
2729 	if (c->pnn >= nodemap->num) {
2730 		DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2731 		talloc_free(tmp_ctx);
2735 	/* send the flags update to all connected nodes */
2736 	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2738 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2739 				      nodes, 0, CONTROL_TIMEOUT(),
2743 		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2745 		talloc_free(tmp_ctx);
2749 	talloc_free(tmp_ctx);
/* Shared state for the async recmode poll in verify_recmode(): the callback
 * decrements an outstanding-reply counter and records the worst status seen.
 * NOTE(review): the counter member (line between these two) is elided in
 * this view of the file. */
2753 struct verify_recmode_normal_data {
2755 	enum monitor_result status;
/* Async completion callback for one getrecmode control.  A transport-level
 * failure demotes status to MONITOR_FAILED (unless a stronger status is
 * already set); a node reporting anything other than CTDB_RECOVERY_NORMAL
 * demotes it to MONITOR_RECOVERY_NEEDED.
 * NOTE(review): the counter decrement and `return`s are elided here. */
2758 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2760 	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2763 	/* one more node has responded with recmode data*/
2766 	/* if we failed to get the recmode, then return an error and let
2767 	   the main loop try again.
2769 	if (state->state != CTDB_CONTROL_DONE) {
2770 		if (rmdata->status == MONITOR_OK) {
2771 			rmdata->status = MONITOR_FAILED;
2776 	/* if we got a response, then the recmode will be stored in the
2779 	if (state->status != CTDB_RECOVERY_NORMAL) {
2780 		DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2781 		rmdata->status = MONITOR_RECOVERY_NEEDED;
2788 /* verify that all nodes are in normal recovery mode */
/* Poll every active node's recovery mode asynchronously and pump the event
 * loop until all replies arrive; returns MONITOR_OK only if all nodes are
 * in normal mode, otherwise the status recorded by the callback.
 * NOTE(review): elided in this view: `uint32_t j;` initialisation of the
 * outstanding counter, the `continue` for inactive nodes, the timeout
 * argument of getrecmode_send, and the counter increment. */
2789 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2791 	struct verify_recmode_normal_data *rmdata;
2792 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2793 	struct ctdb_client_control_state *state;
2794 	enum monitor_result status;
2797 	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2798 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2800 	rmdata->status = MONITOR_OK;
2802 	/* loop over all active nodes and send an async getrecmode call to
2804 	for (j=0; j<nodemap->num; j++) {
2805 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2808 		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2810 					nodemap->nodes[j].pnn);
2811 		if (state == NULL) {
2812 			/* we failed to send the control, treat this as
2813 			   an error and try again next iteration
2815 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2816 			talloc_free(mem_ctx);
2817 			return MONITOR_FAILED;
2820 		/* set up the callback functions */
2821 		state->async.fn = verify_recmode_normal_callback;
2822 		state->async.private_data = rmdata;
2824 		/* one more control to wait for to complete */
2829 	/* now wait for up to the maximum number of seconds allowed
2830 	   or until all nodes we expect a response from has replied
2832 	while (rmdata->count > 0) {
2833 		event_loop_once(ctdb->ev);
/* copy status out before freeing rmdata together with mem_ctx */
2836 	status = rmdata->status;
2837 	talloc_free(mem_ctx);
/* Shared state for the async recmaster poll in verify_recmaster(); also
 * carries the recoverd context (for culprit accounting) and our own pnn to
 * compare each node's answer against.
 * NOTE(review): the outstanding-reply counter and pnn members are elided in
 * this view of the file. */
2842 struct verify_recmaster_data {
2843 	struct ctdb_recoverd *rec;
2846 	enum monitor_result status;
/* Async completion callback for one getrecmaster control.  Transport
 * failure -> MONITOR_FAILED (if not already worse); a node naming a
 * different recmaster than us -> mark that node as culprit and request a
 * new election via MONITOR_ELECTION_NEEDED.
 * NOTE(review): the counter decrement and `return`s are elided here. */
2849 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2851 	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2854 	/* one more node has responded with recmaster data*/
2857 	/* if we failed to get the recmaster, then return an error and let
2858 	   the main loop try again.
2860 	if (state->state != CTDB_CONTROL_DONE) {
2861 		if (rmdata->status == MONITOR_OK) {
2862 			rmdata->status = MONITOR_FAILED;
2867 	/* if we got a response, then the recmaster will be stored in the
2870 	if (state->status != rmdata->pnn) {
2871 		DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2872 		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2873 		rmdata->status = MONITOR_ELECTION_NEEDED;
2880 /* verify that all nodes agree that we are the recmaster */
/* Ask every active node who it believes the recmaster is and verify they
 * all answer `pnn` (us).  Same async fan-out/event-loop-drain pattern as
 * verify_recmode(); returns the aggregated monitor_result.
 * NOTE(review): elided here: `uint32_t j;`, rmdata->rec / rmdata->pnn /
 * counter initialisation, the `continue` for inactive nodes, the timeout
 * argument of getrecmaster_send, and the counter increment. */
2881 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2883 	struct ctdb_context *ctdb = rec->ctdb;
2884 	struct verify_recmaster_data *rmdata;
2885 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2886 	struct ctdb_client_control_state *state;
2887 	enum monitor_result status;
2890 	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2891 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2895 	rmdata->status = MONITOR_OK;
2897 	/* loop over all active nodes and send an async getrecmaster call to
2899 	for (j=0; j<nodemap->num; j++) {
2900 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2903 		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2905 					nodemap->nodes[j].pnn);
2906 		if (state == NULL) {
2907 			/* we failed to send the control, treat this as
2908 			   an error and try again next iteration
2910 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2911 			talloc_free(mem_ctx);
2912 			return MONITOR_FAILED;
2915 		/* set up the callback functions */
2916 		state->async.fn = verify_recmaster_callback;
2917 		state->async.private_data = rmdata;
2919 		/* one more control to wait for to complete */
2924 	/* now wait for up to the maximum number of seconds allowed
2925 	   or until all nodes we expect a response from has replied
2927 	while (rmdata->count > 0) {
2928 		event_loop_once(ctdb->ev);
/* copy status out before freeing rmdata together with mem_ctx */
2931 	status = rmdata->status;
2932 	talloc_free(mem_ctx);
/* Compare the local node's current interface list against the snapshot
 * cached in rec->ifaces and report whether anything changed (count, names
 * in each slot, or link state).  The fresh list always replaces the cached
 * one before returning.  A failure to fetch interfaces is conservatively
 * treated as "changed".
 * NOTE(review): elided in this view: the `bool ret` / `int i` declarations,
 * the `ret = true/false` assignments after each DEBUG, the rec->ifaces ==
 * NULL test, and the final return — verify against the full source. */
2936 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2937 				    struct ctdb_recoverd *rec)
2939 	struct ctdb_control_get_ifaces *ifaces = NULL;
2940 	TALLOC_CTX *mem_ctx;
2943 	mem_ctx = talloc_new(NULL);
2945 	/* Read the interfaces from the local node */
2946 	if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2947 				 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2948 		DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2949 		/* We could return an error.  However, this will be
2950 		 * rare so we'll decide that the interfaces have
2951 		 * actually changed, just in case.
2953 		talloc_free(mem_ctx);
2958 		/* We haven't been here before so things have changed */
2959 		DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2961 	} else if (rec->ifaces->num != ifaces->num) {
2962 		/* Number of interfaces has changed */
2963 		DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2964 				     rec->ifaces->num, ifaces->num));
2967 		/* See if interface names or link states have changed */
2969 		for (i = 0; i < rec->ifaces->num; i++) {
2970 			struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
2971 			if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2973 				      ("Interface in slot %d changed: %s => %s\n",
2974 				       i, iface->name, ifaces->ifaces[i].name));
2978 			if (iface->link_state != ifaces->ifaces[i].link_state) {
2980 				      ("Interface %s changed state: %d => %d\n",
2981 				       iface->name, iface->link_state,
2982 				       ifaces->ifaces[i].link_state));
/* cache the fresh snapshot for the next comparison; steal it off mem_ctx */
2989 	talloc_free(rec->ifaces);
2990 	rec->ifaces = talloc_steal(rec, ifaces);
2992 	talloc_free(mem_ctx);
2996 /* called to check that the local allocation of public ip addresses is ok.
/* Verify that this node holds exactly the public IPs it should: IPs
 * assigned to us must be present on an interface, IPs not assigned to us
 * must not be held (and are released if they are), and unassigned IPs we
 * could host trigger a takeover request to the recmaster.  Uptime is
 * sampled before and after the interface check so the whole verification
 * is skipped if a recovery started/finished in between (the IP layout is
 * in flux then).
 * NOTE(review): elided in this view: `int ret, j;` declarations, several
 * `return 0/-1` statements after talloc_free(mem_ctx), the rd.pnn/rd.srvid
 * initialisation before sending CTDB_SRVID_TAKEOVER_RUN, and the TDB_DATA
 * declaration — verify against the full source. */
2998 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3000 	TALLOC_CTX *mem_ctx = talloc_new(NULL);
3001 	struct ctdb_uptime *uptime1 = NULL;
3002 	struct ctdb_uptime *uptime2 = NULL;
3004 	bool need_takeover_run = false;
3006 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3007 				CTDB_CURRENT_NODE, &uptime1);
3009 		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3010 		talloc_free(mem_ctx);
3014 	if (interfaces_have_changed(ctdb, rec)) {
3015 		DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3016 				     "local node %u - force takeover run\n",
3018 		need_takeover_run = true;
3021 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3022 				CTDB_CURRENT_NODE, &uptime2);
3024 		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3025 		talloc_free(mem_ctx);
3029 	/* skip the check if the startrecovery time has changed */
3030 	if (timeval_compare(&uptime1->last_recovery_started,
3031 			    &uptime2->last_recovery_started) != 0) {
3032 		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3033 		talloc_free(mem_ctx);
3037 	/* skip the check if the endrecovery time has changed */
3038 	if (timeval_compare(&uptime1->last_recovery_finished,
3039 			    &uptime2->last_recovery_finished) != 0) {
3040 		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3041 		talloc_free(mem_ctx);
3045 	/* skip the check if we have started but not finished recovery */
3046 	if (timeval_compare(&uptime1->last_recovery_finished,
3047 			    &uptime1->last_recovery_started) != 1) {
3048 		DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3049 		talloc_free(mem_ctx);
3054 	/* verify that we have the ip addresses we should have
3055 	   and we dont have ones we shouldnt have.
3056 	   if we find an inconsistency we set recmode to
3057 	   active on the local node and wait for the recmaster
3058 	   to do a full blown recovery.
3059 	   also if the pnn is -1 and we are healthy and can host the ip
3060 	   we also request a ip reallocation.
3062 	if (ctdb->tunable.disable_ip_failover == 0) {
3063 		struct ctdb_all_public_ips *ips = NULL;
3065 		/* read the *available* IPs from the local node */
3066 		ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3068 			DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3069 			talloc_free(mem_ctx);
/* an unassigned IP that a healthy node (flags == 0) could serve */
3073 		for (j=0; j<ips->num; j++) {
3074 			if (ips->ips[j].pnn == -1 &&
3075 			    nodemap->nodes[pnn].flags == 0) {
3076 				DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3077 					ctdb_addr_to_str(&ips->ips[j].addr)));
3078 				need_takeover_run = true;
3084 		/* read the *known* IPs from the local node */
3085 		ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3087 			DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3088 			talloc_free(mem_ctx);
3092 		for (j=0; j<ips->num; j++) {
3093 			if (ips->ips[j].pnn == pnn) {
3094 				if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3095 					DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3096 						ctdb_addr_to_str(&ips->ips[j].addr)));
3097 					need_takeover_run = true;
3100 				if (ctdb->do_checkpublicip &&
3101 				    ctdb_sys_have_ip(&ips->ips[j].addr)) {
3103 					DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3104 						ctdb_addr_to_str(&ips->ips[j].addr)));
3106 					if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3107 						DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3114 	if (need_takeover_run) {
3115 		struct takeover_run_reply rd;
3118 		DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3122 		data.dptr = (uint8_t *)&rd;
3123 		data.dsize = sizeof(rd);
/* best-effort: a failed send is only logged, not treated as fatal */
3125 		ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3127 			DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3130 	talloc_free(mem_ctx);
/* Async callback for CTDB_CONTROL_GET_NODEMAP: stash the remote node's
 * nodemap into the caller-supplied array, indexed by pnn; out-of-range
 * pnns are logged and dropped. */
3135 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3137 	struct ctdb_node_map **remote_nodemaps = callback_data;
3139 	if (node_pnn >= ctdb->num_nodes) {
3140 		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
/* take ownership of the reply buffer so it lives as long as the array */
3144 	remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Fan out CTDB_CONTROL_GET_NODEMAP to all active nodes and collect each
 * reply into remote_nodemaps[] (indexed by pnn) via
 * async_getnodemap_callback.  Returns nonzero if any node failed to reply.
 * NOTE(review): the `uint32_t *nodes;` declaration and the success
 * `return 0;` are elided in this view. */
3148 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3149 	struct ctdb_node_map *nodemap,
3150 	struct ctdb_node_map **remote_nodemaps)
3154 	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3155 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3157 					CONTROL_TIMEOUT(), false, tdb_null,
3158 					async_getnodemap_callback,
3160 					remote_nodemaps) != 0) {
3161 		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3169 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
/* Per-check state for check_recovery_lock(): the forked child's pid/pipe,
 * timeout and fd events, and the current status flag the event loop spins
 * on.  NOTE(review): the pipe fd pair and child pid members are elided in
 * this view of the file. */
3170 struct ctdb_check_reclock_state {
3171 	struct ctdb_context *ctdb;
3172 	struct timeval start_time;
3175 	struct timed_event *te;
3176 	struct fd_event *fde;
3177 	enum reclock_child_status status;
3180 /* when we free the reclock state we must kill any child process.
/* talloc destructor for ctdb_check_reclock_state: report the lock-check
 * latency, close both pipe ends if still open, and SIGKILL the child so it
 * can never outlive the state object.
 * NOTE(review): the `state->fd[x] = -1;` resets and `return 0;` are elided
 * in this view. */
3182 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
3184 	struct ctdb_context *ctdb = state->ctdb;
3186 	ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
3188 	if (state->fd[0] != -1) {
3189 		close(state->fd[0]);
3192 	if (state->fd[1] != -1) {
3193 		close(state->fd[1]);
3196 	ctdb_kill(ctdb, state->child, SIGKILL);
3201 called if our check_reclock child times out. this would happen if
3202 i/o to the reclock file blocks.
/* Timeout event: the reclock child has not answered within the window
 * (typically blocked i/o on the cluster filesystem); flag RECLOCK_TIMEOUT
 * so the event loop in check_recovery_lock() can give up. */
3204 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
3205 					struct timeval t, void *private_data)
3207 	struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
3208 					   struct ctdb_check_reclock_state);
3210 	DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3211 	state->status = RECLOCK_TIMEOUT;
3214 /* this is called when the child process has completed checking the reclock
3215 file and has written data back to us through the pipe.
/* fd event: the reclock child wrote its one-byte verdict down the pipe.
 * Cancel the timeout, read the byte, and translate it into RECLOCK_OK or
 * RECLOCK_FAILED for the waiting event loop.
 * NOTE(review): the `return;` after the failure branch is elided in this
 * view. */
3217 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
3218 			     uint16_t flags, void *private_data)
3220 	struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
3221 					     struct ctdb_check_reclock_state);
3225 	/* we got a response from our child process so we can abort the
3228 	talloc_free(state->te);
3231 	ret = read(state->fd[0], &c, 1);
3232 	if (ret != 1 || c != RECLOCK_OK) {
3233 		DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
3234 		state->status = RECLOCK_FAILED;
3239 	state->status = RECLOCK_OK;
/* Verify our held recovery lock is not stale.  Forks a child that pread()s
 * one byte from recovery_lock_fd (a blocking CFS read must not be done in
 * the daemon itself) and reports a one-byte verdict over a pipe; the parent
 * waits in the event loop with a 15s timeout.  On failure the lock fd is
 * closed so a recovery will re-acquire it.
 * NOTE(review): elided in this view: `int ret;` and error `return -1;`
 * paths, state->fd initialisation, the child's `_exit()`/sleep loop body,
 * the fd-event flags argument, and the final return of the
 * RECLOCK_OK/TIMEOUT outcome — verify against the full source. */
3243 static int check_recovery_lock(struct ctdb_context *ctdb)
3246 	struct ctdb_check_reclock_state *state;
3247 	pid_t parent = getpid();
3249 	if (ctdb->recovery_lock_fd == -1) {
3250 		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
3254 	state = talloc(ctdb, struct ctdb_check_reclock_state);
3255 	CTDB_NO_MEMORY(ctdb, state);
3258 	state->start_time = timeval_current();
3259 	state->status = RECLOCK_CHECKING;
3263 	ret = pipe(state->fd);
3266 		DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
3270 	state->child = ctdb_fork(ctdb);
3271 	if (state->child == (pid_t)-1) {
3272 		DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
3273 		close(state->fd[0]);
3275 		close(state->fd[1]);
/* ---- child: do the (possibly blocking) read and report one byte ---- */
3281 	if (state->child == 0) {
3282 		char cc = RECLOCK_OK;
3283 		close(state->fd[0]);
3286 		ctdb_set_process_name("ctdb_rec_reclock");
3287 		debug_extra = talloc_asprintf(NULL, "recovery-lock:");
3288 		if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
3289 			DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
3290 			cc = RECLOCK_FAILED;
3293 		write(state->fd[1], &cc, 1);
3294 		/* make sure we die when our parent dies */
3295 		while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
/* ---- parent: keep only the read end and wait for the verdict ---- */
3300 	close(state->fd[1]);
3302 	set_close_on_exec(state->fd[0]);
3304 	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
3306 	talloc_set_destructor(state, check_reclock_destructor);
3308 	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
3309 				    ctdb_check_reclock_timeout, state);
3310 	if (state->te == NULL) {
3311 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
3316 	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
3318 				reclock_child_handler,
3321 	if (state->fde == NULL) {
3322 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
3326 	tevent_fd_set_auto_close(state->fde);
/* spin the event loop until the child replies or the 15s timer fires */
3328 	while (state->status == RECLOCK_CHECKING) {
3329 		event_loop_once(ctdb->ev);
3332 	if (state->status == RECLOCK_FAILED) {
3333 		DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
3334 		close(ctdb->recovery_lock_fd);
3335 		ctdb->recovery_lock_fd = -1;
/* Re-read the reclock file path from the main daemon and reconcile our
 * cached copy: handle the reclock being disabled (NULL), being set for the
 * first time, unchanged (common fast path), or changed to a new path.  Any
 * change closes the currently held recovery_lock_fd so the lock is
 * re-taken against the new file; disabling/changing also clears
 * verify_recovery_lock.
 * NOTE(review): the `return 0;` statements ending each branch and the
 * "new reclock set" DEBUG line are elided in this view. */
3344 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3346 	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3347 	const char *reclockfile;
3349 	if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3350 		DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3351 		talloc_free(tmp_ctx);
/* daemon reports no reclock file: tear down whatever we were using */
3355 	if (reclockfile == NULL) {
3356 		if (ctdb->recovery_lock_file != NULL) {
3357 			DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3358 			talloc_free(ctdb->recovery_lock_file);
3359 			ctdb->recovery_lock_file = NULL;
3360 			if (ctdb->recovery_lock_fd != -1) {
3361 				close(ctdb->recovery_lock_fd);
3362 				ctdb->recovery_lock_fd = -1;
3365 		ctdb->tunable.verify_recovery_lock = 0;
3366 		talloc_free(tmp_ctx);
/* first time we learn of a reclock file */
3370 	if (ctdb->recovery_lock_file == NULL) {
3371 		ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3372 		if (ctdb->recovery_lock_fd != -1) {
3373 			close(ctdb->recovery_lock_fd);
3374 			ctdb->recovery_lock_fd = -1;
3376 		talloc_free(tmp_ctx);
/* unchanged path: nothing to do */
3381 	if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3382 		talloc_free(tmp_ctx);
/* path changed: swap in the new name and drop the old fd */
3386 	talloc_free(ctdb->recovery_lock_file);
3387 	ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3388 	ctdb->tunable.verify_recovery_lock = 0;
3389 	if (ctdb->recovery_lock_fd != -1) {
3390 		close(ctdb->recovery_lock_fd);
3391 		ctdb->recovery_lock_fd = -1;
3394 	talloc_free(tmp_ctx);
3398 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3399 TALLOC_CTX *mem_ctx)
3402 struct ctdb_node_map *nodemap=NULL;
3403 struct ctdb_node_map *recmaster_nodemap=NULL;
3404 struct ctdb_node_map **remote_nodemaps=NULL;
3405 struct ctdb_vnn_map *vnnmap=NULL;
3406 struct ctdb_vnn_map *remote_vnnmap=NULL;
3407 int32_t debug_level;
3412 /* verify that the main daemon is still running */
3413 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3414 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3418 /* ping the local daemon to tell it we are alive */
3419 ctdb_ctrl_recd_ping(ctdb);
3421 if (rec->election_timeout) {
3422 /* an election is in progress */
3426 /* read the debug level from the parent and update locally */
3427 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3429 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3432 LogLevel = debug_level;
3434 /* get relevant tunables */
3435 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3437 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3441 /* get the current recovery lock file from the server */
3442 if (update_recovery_lock_file(ctdb) != 0) {
3443 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3447 /* Make sure that if recovery lock verification becomes disabled when
3450 if (ctdb->tunable.verify_recovery_lock == 0) {
3451 if (ctdb->recovery_lock_fd != -1) {
3452 close(ctdb->recovery_lock_fd);
3453 ctdb->recovery_lock_fd = -1;
3457 pnn = ctdb_get_pnn(ctdb);
3459 /* get the vnnmap */
3460 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3462 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3467 /* get number of nodes */
3469 talloc_free(rec->nodemap);
3470 rec->nodemap = NULL;
3473 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3475 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3478 nodemap = rec->nodemap;
3480 /* remember our own node flags */
3481 rec->node_flags = nodemap->nodes[pnn].flags;
3483 ban_misbehaving_nodes(rec, &self_ban);
3485 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3489 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3490 also frozen and that the recmode is set to active.
3492 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3493 /* If this node has become inactive then we want to
3494 * reduce the chances of it taking over the recovery
3495 * master role when it becomes active again. This
3496 * helps to stabilise the recovery master role so that
3497 * it stays on the most stable node.
3499 rec->priority_time = timeval_current();
3501 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3503 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3505 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3506 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3508 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3510 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3513 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3515 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3521 /* If this node is stopped or banned then it is not the recovery
3522 * master, so don't do anything. This prevents stopped or banned
3523 * node from starting election and sending unnecessary controls.
3528 /* check which node is the recovery master */
3529 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3531 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3535 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
3536 if (rec->recmaster != pnn) {
3537 if (rec->ip_reallocate_ctx != NULL) {
3538 talloc_free(rec->ip_reallocate_ctx);
3539 rec->ip_reallocate_ctx = NULL;
3540 rec->reallocate_callers = NULL;
3544 /* This is a special case. When recovery daemon is started, recmaster
3545 * is set to -1. If a node is not started in stopped state, then
3546 * start election to decide recovery master
3548 if (rec->recmaster == (uint32_t)-1) {
3549 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3550 force_election(rec, pnn, nodemap);
3554 /* update the capabilities for all nodes */
3555 ret = update_capabilities(ctdb, nodemap);
3557 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3562 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3563 * but we have, then force an election and try to become the new
3566 if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3567 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3568 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3569 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3570 " but we (node %u) have - force an election\n",
3571 rec->recmaster, pnn));
3572 force_election(rec, pnn, nodemap);
3576 /* count how many active nodes there are */
3577 rec->num_active = 0;
3578 rec->num_connected = 0;
3579 for (i=0; i<nodemap->num; i++) {
3580 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3583 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3584 rec->num_connected++;
3589 /* verify that the recmaster node is still active */
3590 for (j=0; j<nodemap->num; j++) {
3591 if (nodemap->nodes[j].pnn==rec->recmaster) {
3596 if (j == nodemap->num) {
3597 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3598 force_election(rec, pnn, nodemap);
3602 /* if recovery master is disconnected we must elect a new recmaster */
3603 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3604 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3605 force_election(rec, pnn, nodemap);
3609 /* get nodemap from the recovery master to check if it is inactive */
3610 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3611 mem_ctx, &recmaster_nodemap);
3613 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3614 nodemap->nodes[j].pnn));
3619 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3620 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3621 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3623 * update our nodemap to carry the recmaster's notion of
3624 * its own flags, so that we don't keep freezing the
3625 * inactive recmaster node...
3627 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3628 force_election(rec, pnn, nodemap);
3632 /* verify that we have all ip addresses we should have and we dont
3633 * have addresses we shouldnt have.
3635 if (ctdb->tunable.disable_ip_failover == 0) {
3636 if (rec->ip_check_disable_ctx == NULL) {
3637 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3638 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3644 /* if we are not the recmaster then we do not need to check
3645 if recovery is needed
3647 if (pnn != rec->recmaster) {
3652 /* ensure our local copies of flags are right */
3653 ret = update_local_flags(rec, nodemap);
3654 if (ret == MONITOR_ELECTION_NEEDED) {
3655 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3656 force_election(rec, pnn, nodemap);
3659 if (ret != MONITOR_OK) {
3660 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3664 if (ctdb->num_nodes != nodemap->num) {
3665 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3666 reload_nodes_file(ctdb);
3670 /* verify that all active nodes agree that we are the recmaster */
3671 switch (verify_recmaster(rec, nodemap, pnn)) {
3672 case MONITOR_RECOVERY_NEEDED:
3673 /* can not happen */
3675 case MONITOR_ELECTION_NEEDED:
3676 force_election(rec, pnn, nodemap);
3680 case MONITOR_FAILED:
3685 if (rec->need_recovery) {
3686 /* a previous recovery didn't finish */
3687 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3691 /* verify that all active nodes are in normal mode
3692 and not in recovery mode
3694 switch (verify_recmode(ctdb, nodemap)) {
3695 case MONITOR_RECOVERY_NEEDED:
3696 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3698 case MONITOR_FAILED:
3700 case MONITOR_ELECTION_NEEDED:
3701 /* can not happen */
3707 if (ctdb->tunable.verify_recovery_lock != 0) {
3708 /* we should have the reclock - check its not stale */
3709 ret = check_recovery_lock(ctdb);
3711 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3712 ctdb_set_culprit(rec, ctdb->pnn);
3713 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3719 /* is there a pending reload all ips ? */
3720 if (reload_all_ips_request != NULL) {
3721 reload_all_ips(ctdb, rec, nodemap, reload_all_ips_request);
3722 talloc_free(reload_all_ips_request);
3723 reload_all_ips_request = NULL;
3726 /* if there are takeovers requested, perform it and notify the waiters */
3727 if (rec->reallocate_callers) {
3728 process_ipreallocate_requests(ctdb, rec);
3731 /* get the nodemap for all active remote nodes
3733 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3734 if (remote_nodemaps == NULL) {
3735 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3738 for(i=0; i<nodemap->num; i++) {
3739 remote_nodemaps[i] = NULL;
3741 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3742 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3746 /* verify that all other nodes have the same nodemap as we have
3748 for (j=0; j<nodemap->num; j++) {
3749 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3753 if (remote_nodemaps[j] == NULL) {
3754 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3755 ctdb_set_culprit(rec, j);
3760 /* if the nodes disagree on how many nodes there are
3761 then this is a good reason to try recovery
3763 if (remote_nodemaps[j]->num != nodemap->num) {
3764 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3765 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3766 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3767 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3771 /* if the nodes disagree on which nodes exist and are
3772 active, then that is also a good reason to do recovery
3774 for (i=0;i<nodemap->num;i++) {
3775 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3776 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3777 nodemap->nodes[j].pnn, i,
3778 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3779 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3780 do_recovery(rec, mem_ctx, pnn, nodemap,
3788 * Update node flags obtained from each active node. This ensure we have
3789 * up-to-date information for all the nodes.
3791 for (j=0; j<nodemap->num; j++) {
3792 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3795 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3798 for (j=0; j<nodemap->num; j++) {
3799 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3803 /* verify the flags are consistent
3805 for (i=0; i<nodemap->num; i++) {
3806 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3810 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3811 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3812 nodemap->nodes[j].pnn,
3813 nodemap->nodes[i].pnn,
3814 remote_nodemaps[j]->nodes[i].flags,
3815 nodemap->nodes[i].flags));
3817 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3818 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3819 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3820 do_recovery(rec, mem_ctx, pnn, nodemap,
3824 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3825 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3826 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3827 do_recovery(rec, mem_ctx, pnn, nodemap,
3836 /* there better be the same number of lmasters in the vnn map
3837 as there are active nodes or we will have to do a recovery
3839 if (vnnmap->size != rec->num_active) {
3840 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3841 vnnmap->size, rec->num_active));
3842 ctdb_set_culprit(rec, ctdb->pnn);
3843 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3847 /* verify that all active nodes in the nodemap also exist in
3850 for (j=0; j<nodemap->num; j++) {
3851 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3854 if (nodemap->nodes[j].pnn == pnn) {
3858 for (i=0; i<vnnmap->size; i++) {
3859 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3863 if (i == vnnmap->size) {
3864 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3865 nodemap->nodes[j].pnn));
3866 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3867 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3873 /* verify that all other nodes have the same vnnmap
3874 and are from the same generation
3876 for (j=0; j<nodemap->num; j++) {
3877 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3880 if (nodemap->nodes[j].pnn == pnn) {
3884 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3885 mem_ctx, &remote_vnnmap);
3887 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3888 nodemap->nodes[j].pnn));
3892 /* verify the vnnmap generation is the same */
3893 if (vnnmap->generation != remote_vnnmap->generation) {
3894 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3895 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3896 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3897 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3901 /* verify the vnnmap size is the same */
3902 if (vnnmap->size != remote_vnnmap->size) {
3903 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3904 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3905 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3906 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3910 /* verify the vnnmap is the same */
3911 for (i=0;i<vnnmap->size;i++) {
3912 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3913 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3914 nodemap->nodes[j].pnn));
3915 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3916 do_recovery(rec, mem_ctx, pnn, nodemap,
3923 /* we might need to change who has what IP assigned */
3924 if (rec->need_takeover_run) {
3925 uint32_t culprit = (uint32_t)-1;
3927 rec->need_takeover_run = false;
3929 /* update the list of public ips that a node can handle for
3932 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3934 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3936 rec->need_takeover_run = true;
3940 /* execute the "startrecovery" event script on all nodes */
3941 ret = run_startrecovery_eventscript(rec, nodemap);
3943 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3944 ctdb_set_culprit(rec, ctdb->pnn);
3945 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3949 /* If takeover run fails, then the offending nodes are
3950 * assigned ban culprit counts. And we re-try takeover.
3951 * If takeover run fails repeatedly, the node would get
3954 * If rec->need_takeover_run is not set to true at this
3955 * failure, monitoring is disabled cluster-wide (via
3956 * startrecovery eventscript) and will not get enabled.
3958 if (!do_takeover_run(rec, nodemap, true)) {
3962 /* execute the "recovered" event script on all nodes */
3963 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
3965 // we cant check whether the event completed successfully
3966 // since this script WILL fail if the node is in recovery mode
3967 // and if that race happens, the code here would just cause a second
3968 // cascading recovery.
3970 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3971 ctdb_set_culprit(rec, ctdb->pnn);
3972 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3979 the main monitoring loop
/*
  the main monitoring loop of the recovery daemon: allocate the
  ctdb_recoverd state, register all SRVID message handlers, then run
  main_loop() repeatedly, rate-limited to once per recover_interval.
  NOTE(review): this chunk is a sampled extraction — several original
  lines (error-path bodies, closing braces) are missing between the
  numbered lines below.
*/
3981 static void monitor_cluster(struct ctdb_context *ctdb)
3983 	struct ctdb_recoverd *rec;
3985 	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
/* rec is allocated on the ctdb context; talloc_zero leaves all
   fields zeroed/NULL, CTDB_NO_MEMORY_FATAL aborts on OOM */
3987 	rec = talloc_zero(ctdb, struct ctdb_recoverd);
3988 	CTDB_NO_MEMORY_FATAL(ctdb, rec);
3992 	rec->takeover_run_in_progress = false;
3994 	rec->priority_time = timeval_current();
/* register one message handler per SRVID; every handler receives
   rec as its private_data */
3996 	/* register a message port for sending memory dumps */
3997 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3999 	/* register a message port for requesting logs */
4000 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_GETLOG, getlog_handler, rec);
4002 	/* register a message port for clearing logs */
4003 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_CLEARLOG, clearlog_handler, rec);
4005 	/* register a message port for recovery elections */
4006 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
4008 	/* when nodes are disabled/enabled */
4009 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
4011 	/* when we are asked to push out a flag change */
4012 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
4014 	/* register a message port for vacuum fetch */
4015 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
4017 	/* register a message port for reloadnodes */
4018 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
4020 	/* register a message port for performing a takeover run */
4021 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
4023 	/* register a message port for performing a reload all ips */
4024 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_ALL_IPS, ip_reloadall_handler, rec);
4026 	/* register a message port for disabling the ip check for a short while */
4027 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
4029 	/* register a message port for updating the recovery daemons node assignment for an ip */
4030 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
4032 	/* register a message port for forcing a rebalance of a node next
	   reallocation -- continuation line missing in this extraction */
4034 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
/* main loop body (enclosing for(;;)/while loop header missing from
   this extraction): a fresh talloc context per iteration, freed after
   main_loop() returns so nothing leaks across iterations */
4037 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4038 	struct timeval start;
4042 	DEBUG(DEBUG_CRIT,(__location__
4043 			  " Failed to create temp context\n"));
4047 	start = timeval_current();
4048 	main_loop(ctdb, rec, mem_ctx);
4049 	talloc_free(mem_ctx);
4051 	/* we only check for recovery once every second */
/* sleep out the remainder of recover_interval so iterations are
   spaced at least recover_interval seconds apart */
4052 	elapsed = timeval_elapsed(&start);
4053 	if (elapsed < ctdb->tunable.recover_interval) {
4054 		ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
/*
  event handler for when the main ctdbd dies: fires when the pipe fd
  shared with the parent becomes readable (i.e. the parent closed its
  end by exiting); logs and exits the recovery daemon.
  NOTE(review): the _exit() call after the DEBUG line is missing from
  this extraction -- confirm against the full source.
*/
4063 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4064 				 uint16_t flags, void *private_data)
4066 	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
/*
  called regularly (every 30s, self-rescheduling) from the main ctdbd
  to verify that the recovery daemon child is still running; if
  ctdb_kill(pid, 0) fails the child is gone and a restart is scheduled
  immediately via ctdb_restart_recd().
*/
4073 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4074 			      struct timeval yt, void *p)
4076 	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* signal 0 probes for existence without actually signalling */
4078 	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4079 		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
/* restart as soon as the event loop runs (timeval_zero = now);
   parented to ctdb since recd_ctx is about to be recreated */
4081 		event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4082 				ctdb_restart_recd, ctdb);
/* re-arm this check on recd_ctx so it is cancelled when the
   monitor context is freed in ctdb_stop_recoverd() */
4087 	event_add_timed(ctdb->ev, ctdb->recd_ctx,
4088 			timeval_current_ofs(30, 0),
4089 			ctdb_check_recd, ctdb);
/*
  SIGCHLD handler for the recovery daemon: reap any exited children
  with waitpid(-1, ..., WNOHANG) so no zombies accumulate.
  NOTE(review): the surrounding reap loop (while pid != 0/-1) is
  missing from this extraction -- confirm against the full source.
*/
4092 static void recd_sig_child_handler(struct event_context *ev,
4093 	struct signal_event *se, int signum, int count,
4097 //	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
/* WNOHANG: never block; ECHILD just means nothing left to reap and
   is not an error worth reporting */
4102 		pid = waitpid(-1, &status, WNOHANG);
4104 			if (errno != ECHILD) {
4105 				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4110 			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
/*
  startup the recovery daemon as a child of the main ctdb daemon.

  Parent path: remembers the child pid, (re)creates recd_ctx and arms
  the 30s ctdb_check_recd watchdog, then returns.
  Child path: reseeds the PRNG, clears the log ringbuffer, switches
  from server to client mode, watches the pipe fd to detect parent
  death, installs a SIGCHLD handler and enters monitor_cluster()
  (which never returns in normal operation).
*/
4118 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4121 	struct signal_event *se;
4122 	struct tevent_fd *fde;
/* the pipe's read end lets the child notice when the parent exits
   (parent's write end is closed implicitly on exit) */
4124 	if (pipe(fd) != 0) {
4128 	ctdb->ctdbd_pid = getpid();
4130 	ctdb->recoverd_pid = ctdb_fork_no_free_ringbuffer(ctdb);
4131 	if (ctdb->recoverd_pid == -1) {
/* ===== parent process ===== */
4135 	if (ctdb->recoverd_pid != 0) {
/* free(NULL-safe) any previous monitor context, then recreate it so
   old ctdb_check_recd timers are discarded on restart */
4136 		talloc_free(ctdb->recd_ctx);
4137 		ctdb->recd_ctx = talloc_new(ctdb);
4138 		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4141 		event_add_timed(ctdb->ev, ctdb->recd_ctx,
4142 				timeval_current_ofs(30, 0),
4143 				ctdb_check_recd, ctdb);
/* ===== child process (the recovery daemon itself) ===== */
/* distinct seed per child so random backoffs/elections differ */
4149 	srandom(getpid() ^ time(NULL));
4151 	/* Clear the log ringbuffer */
4152 	ctdb_clear_log(ctdb);
/* NOTE(review): "ctdb_recovered" looks like a typo for
   "ctdb_recoverd" (cf. the "recoverd" string on the next call) --
   cosmetic only, affects the process name shown in ps */
4154 	ctdb_set_process_name("ctdb_recovered");
4155 	if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4156 		DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4160 	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* readable read-end == parent died; auto-close ties fd lifetime to
   the event */
4162 	fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4163 		     ctdb_recoverd_parent, &fd[0]);
4164 	tevent_fd_set_auto_close(fde);
4166 	/* set up a handler to pick up sigchld */
4167 	se = event_add_signal(ctdb->ev, ctdb,
4169 				     recd_sig_child_handler,
4172 		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* does not return in normal operation */
4176 	monitor_cluster(ctdb);
4178 	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
/*
  shutdown the recovery daemon: no-op if no child was ever started
  (recoverd_pid == 0); otherwise send it SIGTERM and tear down the
  parent-side monitor state (recd_ctx frees the pending
  ctdb_check_recd timer along with it).
*/
4185 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4187 	if (ctdb->recoverd_pid == 0) {
4191 	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4192 	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
/* TALLOC_FREE frees and NULLs the pointers, making repeat calls safe */
4194 	TALLOC_FREE(ctdb->recd_ctx);
4195 	TALLOC_FREE(ctdb->recd_ping_count);
/*
  timed-event callback (scheduled by ctdb_check_recd when the child
  has died): restart the recovery daemon by doing a full stop (cleans
  up old state; SIGTERM to the dead pid is harmless) followed by a
  fresh start.
*/
4198 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4199 		       struct timeval t, void *private_data)
4201 	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4203 	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4204 	ctdb_stop_recoverd(ctdb);
4205 	ctdb_start_recoverd(ctdb);