/*
   Copyright (C) Ronnie Sahlberg  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
30 #include "dlinklist.h"
/* List of SRVID requests that need to be processed */
struct srvid_list {
	struct srvid_list *next, *prev;
	struct srvid_request *request;
};
/* Head of a queue of queued SRVID requests (see srvid_request_add) */
struct srvid_requests {
	struct srvid_list *requests;
};
43 static void srvid_request_reply(struct ctdb_context *ctdb,
44 struct srvid_request *request,
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request->srvid == 0) {
53 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
55 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request->pnn,
57 (unsigned long long)request->srvid));
59 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request->pnn,
61 (unsigned long long)request->srvid));
67 static void srvid_requests_reply(struct ctdb_context *ctdb,
68 struct srvid_requests **requests,
73 for (r = (*requests)->requests; r != NULL; r = r->next) {
74 srvid_request_reply(ctdb, r->request, result);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests);
81 static void srvid_request_add(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 struct srvid_request *request)
89 if (*requests == NULL) {
90 *requests = talloc_zero(ctdb, struct srvid_requests);
91 if (*requests == NULL) {
96 t = talloc_zero(*requests, struct srvid_list);
98 /* If *requests was just allocated above then free it */
99 if ((*requests)->requests == NULL) {
100 TALLOC_FREE(*requests);
105 t->request = (struct srvid_request *)talloc_steal(t, request);
106 DLIST_ADD((*requests)->requests, t);
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
115 result.dsize = sizeof(ret);
116 result.dptr = (uint8_t *)&ret;
117 srvid_request_reply(ctdb, request, result);
120 struct ctdb_banning_state {
122 struct timeval last_reported_time;
126 private state of recovery daemon
128 struct ctdb_recoverd {
129 struct ctdb_context *ctdb;
132 uint32_t num_lmasters;
133 uint32_t num_connected;
134 uint32_t last_culprit_node;
135 struct ctdb_node_map *nodemap;
136 struct timeval priority_time;
137 bool need_takeover_run;
140 struct timed_event *send_election_te;
141 struct timed_event *election_timeout;
142 struct vacuum_info *vacuum_info;
143 struct srvid_requests *reallocate_requests;
144 bool takeover_run_in_progress;
145 TALLOC_CTX *takeover_runs_disable_ctx;
146 struct ctdb_control_get_ifaces *ifaces;
147 uint32_t *force_rebalance_nodes;
/* Timeouts derived from tunables; both expect a local `ctdb` variable
 * in scope at the point of use. */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
156 ban a node for a period of time
158 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
161 struct ctdb_context *ctdb = rec->ctdb;
162 struct ctdb_ban_time bantime;
164 if (!ctdb_validate_pnn(ctdb, pnn)) {
165 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
169 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
172 bantime.time = ban_time;
174 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
176 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Result of a monitoring pass over the cluster */
enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
186 remember the trouble maker
188 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
190 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
191 struct ctdb_banning_state *ban_state;
193 if (culprit > ctdb->num_nodes) {
194 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
198 /* If we are banned or stopped, do not set other nodes as culprits */
199 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
200 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
204 if (ctdb->nodes[culprit]->ban_state == NULL) {
205 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
206 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
210 ban_state = ctdb->nodes[culprit]->ban_state;
211 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
212 /* this was the first time in a long while this node
213 misbehaved so we will forgive any old transgressions.
215 ban_state->count = 0;
218 ban_state->count += count;
219 ban_state->last_reported_time = timeval_current();
220 rec->last_culprit_node = culprit;
224 remember the trouble maker
226 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
228 ctdb_set_culprit_count(rec, culprit, 1);
232 /* this callback is called for every node that failed to execute the
235 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
237 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
239 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
241 ctdb_set_culprit(rec, node_pnn);
245 run the "recovered" eventscript on all nodes
247 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
251 struct ctdb_context *ctdb = rec->ctdb;
253 tmp_ctx = talloc_new(ctdb);
254 CTDB_NO_MEMORY(ctdb, tmp_ctx);
256 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
257 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
259 CONTROL_TIMEOUT(), false, tdb_null,
260 NULL, recovered_fail_callback,
262 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
264 talloc_free(tmp_ctx);
268 talloc_free(tmp_ctx);
272 /* this callback is called for every node that failed to execute the
275 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
277 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
279 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
281 ctdb_set_culprit(rec, node_pnn);
285 run the "startrecovery" eventscript on all nodes
287 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
291 struct ctdb_context *ctdb = rec->ctdb;
293 tmp_ctx = talloc_new(ctdb);
294 CTDB_NO_MEMORY(ctdb, tmp_ctx);
296 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
297 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
299 CONTROL_TIMEOUT(), false, tdb_null,
301 startrecovery_fail_callback,
303 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
304 talloc_free(tmp_ctx);
308 talloc_free(tmp_ctx);
312 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
314 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
315 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
318 if (node_pnn < ctdb->num_nodes) {
319 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
322 if (node_pnn == ctdb->pnn) {
323 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
328 update the node capabilities for all connected nodes
330 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
335 tmp_ctx = talloc_new(ctdb);
336 CTDB_NO_MEMORY(ctdb, tmp_ctx);
338 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
339 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
343 async_getcap_callback, NULL,
345 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
346 talloc_free(tmp_ctx);
350 talloc_free(tmp_ctx);
354 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
356 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
358 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
359 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
362 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
364 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
366 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
367 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
371 change recovery mode on all nodes
373 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
379 tmp_ctx = talloc_new(ctdb);
380 CTDB_NO_MEMORY(ctdb, tmp_ctx);
382 /* freeze all nodes */
383 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
384 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
387 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
388 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
393 set_recmode_fail_callback,
395 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
396 talloc_free(tmp_ctx);
403 data.dsize = sizeof(uint32_t);
404 data.dptr = (unsigned char *)&rec_mode;
406 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
412 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
413 talloc_free(tmp_ctx);
417 talloc_free(tmp_ctx);
422 change recovery master on all node
424 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
430 tmp_ctx = talloc_new(ctdb);
431 CTDB_NO_MEMORY(ctdb, tmp_ctx);
433 data.dsize = sizeof(uint32_t);
434 data.dptr = (unsigned char *)&pnn;
436 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
437 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
439 CONTROL_TIMEOUT(), false, data,
442 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
443 talloc_free(tmp_ctx);
447 talloc_free(tmp_ctx);
451 /* update all remote nodes to use the same db priority that we have
452 this can fail if the remove node has not yet been upgraded to
453 support this function, so we always return success and never fail
454 a recovery if this call fails.
456 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
457 struct ctdb_node_map *nodemap,
458 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
463 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
465 /* step through all local databases */
466 for (db=0; db<dbmap->num;db++) {
468 struct ctdb_db_priority db_prio;
471 db_prio.db_id = dbmap->dbs[db].dbid;
472 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
474 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
478 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
480 data.dptr = (uint8_t *)&db_prio;
481 data.dsize = sizeof(db_prio);
483 if (ctdb_client_async_control(ctdb,
484 CTDB_CONTROL_SET_DB_PRIORITY,
486 CONTROL_TIMEOUT(), false, data,
489 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
497 ensure all other nodes have attached to any databases that we have
499 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
500 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
503 struct ctdb_dbid_map *remote_dbmap;
505 /* verify that all other nodes have all our databases */
506 for (j=0; j<nodemap->num; j++) {
507 /* we dont need to ourself ourselves */
508 if (nodemap->nodes[j].pnn == pnn) {
511 /* dont check nodes that are unavailable */
512 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
516 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
517 mem_ctx, &remote_dbmap);
519 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
523 /* step through all local databases */
524 for (db=0; db<dbmap->num;db++) {
528 for (i=0;i<remote_dbmap->num;i++) {
529 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
533 /* the remote node already have this database */
534 if (i!=remote_dbmap->num) {
537 /* ok so we need to create this database */
538 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
541 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
544 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
546 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
548 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
559 ensure we are attached to any databases that anyone else is attached to
561 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
562 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
565 struct ctdb_dbid_map *remote_dbmap;
567 /* verify that we have all database any other node has */
568 for (j=0; j<nodemap->num; j++) {
569 /* we dont need to ourself ourselves */
570 if (nodemap->nodes[j].pnn == pnn) {
573 /* dont check nodes that are unavailable */
574 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
578 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
579 mem_ctx, &remote_dbmap);
581 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
585 /* step through all databases on the remote node */
586 for (db=0; db<remote_dbmap->num;db++) {
589 for (i=0;i<(*dbmap)->num;i++) {
590 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
594 /* we already have this db locally */
595 if (i!=(*dbmap)->num) {
598 /* ok so we need to create this database and
601 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
602 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
604 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
605 nodemap->nodes[j].pnn));
608 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
609 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
611 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
614 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
616 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
627 pull the remote database contents from one node into the recdb
629 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
630 struct tdb_wrap *recdb, uint32_t dbid)
634 struct ctdb_marshall_buffer *reply;
635 struct ctdb_rec_data *rec;
637 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
639 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
640 CONTROL_TIMEOUT(), &outdata);
642 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
643 talloc_free(tmp_ctx);
647 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
649 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
650 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
651 talloc_free(tmp_ctx);
655 rec = (struct ctdb_rec_data *)&reply->data[0];
659 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
661 struct ctdb_ltdb_header *hdr;
664 key.dptr = &rec->data[0];
665 key.dsize = rec->keylen;
666 data.dptr = &rec->data[key.dsize];
667 data.dsize = rec->datalen;
669 hdr = (struct ctdb_ltdb_header *)data.dptr;
671 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
672 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
673 talloc_free(tmp_ctx);
677 /* fetch the existing record, if any */
678 existing = tdb_fetch(recdb->tdb, key);
680 if (existing.dptr != NULL) {
681 struct ctdb_ltdb_header header;
682 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
683 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
684 (unsigned)existing.dsize, srcnode));
686 talloc_free(tmp_ctx);
689 header = *(struct ctdb_ltdb_header *)existing.dptr;
691 if (!(header.rsn < hdr->rsn ||
692 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
697 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
698 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
699 talloc_free(tmp_ctx);
704 talloc_free(tmp_ctx);
710 struct pull_seqnum_cbdata {
716 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
718 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
721 if (cb_data->failed != 0) {
722 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
727 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
732 if (outdata.dsize != sizeof(uint64_t)) {
733 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
734 cb_data->failed = -1;
738 seqnum = *((uint64_t *)outdata.dptr);
740 if (seqnum > cb_data->seqnum) {
741 cb_data->seqnum = seqnum;
742 cb_data->pnn = node_pnn;
746 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
748 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
750 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
754 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
755 struct ctdb_recoverd *rec,
756 struct ctdb_node_map *nodemap,
757 struct tdb_wrap *recdb, uint32_t dbid)
759 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
763 struct pull_seqnum_cbdata *cb_data;
765 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
770 data.dsize = sizeof(outdata);
771 data.dptr = (uint8_t *)&outdata[0];
773 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
774 if (cb_data == NULL) {
775 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
776 talloc_free(tmp_ctx);
784 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
785 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
787 CONTROL_TIMEOUT(), false, data,
791 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
793 talloc_free(tmp_ctx);
797 if (cb_data->failed != 0) {
798 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
799 talloc_free(tmp_ctx);
803 if (cb_data->seqnum == 0 || cb_data->pnn == -1) {
804 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
805 talloc_free(tmp_ctx);
809 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
811 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
812 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
813 talloc_free(tmp_ctx);
817 talloc_free(tmp_ctx);
823 pull all the remote database contents into the recdb
825 static int pull_remote_database(struct ctdb_context *ctdb,
826 struct ctdb_recoverd *rec,
827 struct ctdb_node_map *nodemap,
828 struct tdb_wrap *recdb, uint32_t dbid,
833 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
835 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
841 /* pull all records from all other nodes across onto this node
842 (this merges based on rsn)
844 for (j=0; j<nodemap->num; j++) {
845 /* dont merge from nodes that are unavailable */
846 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
849 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
850 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
851 nodemap->nodes[j].pnn));
852 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
862 update flags on all active nodes
864 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
868 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
870 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
878 ensure all nodes have the same vnnmap we do
880 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
881 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
885 /* push the new vnn map out to all the nodes */
886 for (j=0; j<nodemap->num; j++) {
887 /* dont push to nodes that are unavailable */
888 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
892 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
894 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
904 struct vacuum_info *next, *prev;
905 struct ctdb_recoverd *rec;
907 struct ctdb_db_context *ctdb_db;
908 struct ctdb_marshall_buffer *recs;
909 struct ctdb_rec_data *r;
912 static void vacuum_fetch_next(struct vacuum_info *v);
915 called when a vacuum fetch has completed - just free it and do the next one
917 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
919 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
921 vacuum_fetch_next(v);
926 process the next element from the vacuum list
928 static void vacuum_fetch_next(struct vacuum_info *v)
930 struct ctdb_call call;
931 struct ctdb_rec_data *r;
933 while (v->recs->count) {
934 struct ctdb_client_call_state *state;
936 struct ctdb_ltdb_header *hdr;
939 call.call_id = CTDB_NULL_FUNC;
940 call.flags = CTDB_IMMEDIATE_MIGRATION;
941 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
944 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
947 call.key.dptr = &r->data[0];
948 call.key.dsize = r->keylen;
950 /* ensure we don't block this daemon - just skip a record if we can't get
952 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
956 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
957 if (data.dptr == NULL) {
958 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
962 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
964 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
968 hdr = (struct ctdb_ltdb_header *)data.dptr;
969 if (hdr->dmaster == v->rec->ctdb->pnn) {
970 /* its already local */
972 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
978 state = ctdb_call_send(v->ctdb_db, &call);
979 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
981 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
985 state->async.fn = vacuum_fetch_callback;
986 state->async.private_data = v;
995 destroy a vacuum info structure
997 static int vacuum_info_destructor(struct vacuum_info *v)
999 DLIST_REMOVE(v->rec->vacuum_info, v);
1005 handler for vacuum fetch
1007 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
1008 TDB_DATA data, void *private_data)
1010 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1011 struct ctdb_marshall_buffer *recs;
1013 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1015 struct ctdb_dbid_map *dbmap=NULL;
1016 bool persistent = false;
1017 struct ctdb_db_context *ctdb_db;
1018 struct ctdb_rec_data *r;
1020 struct vacuum_info *v;
1022 recs = (struct ctdb_marshall_buffer *)data.dptr;
1023 r = (struct ctdb_rec_data *)&recs->data[0];
1025 if (recs->count == 0) {
1026 talloc_free(tmp_ctx);
1032 for (v=rec->vacuum_info;v;v=v->next) {
1033 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
1034 /* we're already working on records from this node */
1035 talloc_free(tmp_ctx);
1040 /* work out if the database is persistent */
1041 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1043 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1044 talloc_free(tmp_ctx);
1048 for (i=0;i<dbmap->num;i++) {
1049 if (dbmap->dbs[i].dbid == recs->db_id) {
1050 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1054 if (i == dbmap->num) {
1055 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1056 talloc_free(tmp_ctx);
1060 /* find the name of this database */
1061 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1062 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1063 talloc_free(tmp_ctx);
1068 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1069 if (ctdb_db == NULL) {
1070 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1071 talloc_free(tmp_ctx);
1075 v = talloc_zero(rec, struct vacuum_info);
1077 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1078 talloc_free(tmp_ctx);
1083 v->srcnode = srcnode;
1084 v->ctdb_db = ctdb_db;
1085 v->recs = talloc_memdup(v, recs, data.dsize);
1086 if (v->recs == NULL) {
1087 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1089 talloc_free(tmp_ctx);
1092 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1094 DLIST_ADD(rec->vacuum_info, v);
1096 talloc_set_destructor(v, vacuum_info_destructor);
1098 vacuum_fetch_next(v);
1099 talloc_free(tmp_ctx);
1104 called when ctdb_wait_timeout should finish
1106 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1107 struct timeval yt, void *p)
1109 uint32_t *timed_out = (uint32_t *)p;
1114 wait for a given number of seconds
1116 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1118 uint32_t timed_out = 0;
1119 time_t usecs = (secs - (time_t)secs) * 1000000;
1120 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1121 while (!timed_out) {
1122 event_loop_once(ctdb->ev);
1127 called when an election times out (ends)
1129 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1130 struct timeval t, void *p)
1132 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1133 rec->election_timeout = NULL;
1136 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
1141 wait for an election to finish. It finished election_timeout seconds after
1142 the last election packet is received
1144 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1146 struct ctdb_context *ctdb = rec->ctdb;
1147 while (rec->election_timeout) {
1148 event_loop_once(ctdb->ev);
1153 Update our local flags from all remote connected nodes.
1154 This is only run when we are or we belive we are the recovery master
1156 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1159 struct ctdb_context *ctdb = rec->ctdb;
1160 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1162 /* get the nodemap for all active remote nodes and verify
1163 they are the same as for this node
1165 for (j=0; j<nodemap->num; j++) {
1166 struct ctdb_node_map *remote_nodemap=NULL;
1169 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1172 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1176 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1177 mem_ctx, &remote_nodemap);
1179 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1180 nodemap->nodes[j].pnn));
1181 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1182 talloc_free(mem_ctx);
1183 return MONITOR_FAILED;
1185 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1186 /* We should tell our daemon about this so it
1187 updates its flags or else we will log the same
1188 message again in the next iteration of recovery.
1189 Since we are the recovery master we can just as
1190 well update the flags on all nodes.
1192 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1194 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1198 /* Update our local copy of the flags in the recovery
1201 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1202 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1203 nodemap->nodes[j].flags));
1204 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1206 talloc_free(remote_nodemap);
1208 talloc_free(mem_ctx);
1213 /* Create a new random generation ip.
1214 The generation id can not be the INVALID_GENERATION id
1216 static uint32_t new_generation(void)
1218 uint32_t generation;
1221 generation = random();
1223 if (generation != INVALID_GENERATION) {
1233 create a temporary working database
1235 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1238 struct tdb_wrap *recdb;
1241 /* open up the temporary recovery database */
1242 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1243 ctdb->db_directory_state,
1250 tdb_flags = TDB_NOLOCK;
1251 if (ctdb->valgrinding) {
1252 tdb_flags |= TDB_NOMMAP;
1254 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1256 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1257 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1258 if (recdb == NULL) {
1259 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1269 a traverse function for pulling all relevant records from recdb
1272 struct ctdb_context *ctdb;
1273 struct ctdb_marshall_buffer *recdata;
1275 uint32_t allocated_len;
/* tdb_traverse_read() callback: append one record from the temporary
 * recovery database to params->recdata (a ctdb_marshall_buffer), growing
 * the buffer when needed.  For non-persistent databases, empty records
 * are skipped and the record header's dmaster is rewritten to this node
 * (see the long rationale comment below for why persistent databases
 * must keep their empty records).  Sets params->failed on any error.
 * NOTE(review): some lines (braces, early returns) are elided from this
 * view, so the text below is not the complete function. */
1280 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1282 struct recdb_data *params = (struct recdb_data *)p;
1283 struct ctdb_rec_data *rec;
1284 struct ctdb_ltdb_header *hdr;
1287 * skip empty records - but NOT for persistent databases:
1289 * The record-by-record mode of recovery deletes empty records.
1290 * For persistent databases, this can lead to data corruption
1291 * by deleting records that should be there:
1293 * - Assume the cluster has been running for a while.
1295 * - A record R in a persistent database has been created and
1296 * deleted a couple of times, the last operation being deletion,
1297 * leaving an empty record with a high RSN, say 10.
1299 * - Now a node N is turned off.
1301 * - This leaves the local database copy of D on N with the empty
1302 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1303 * the copy of record R.
1305 * - Now the record is created again while node N is turned off.
1306 * This creates R with RSN = 1 on all nodes except for N.
1308 * - Now node N is turned on again. The following recovery will chose
1309 * the older empty copy of R due to RSN 10 > RSN 1.
1311 * ==> Hence the record is gone after the recovery.
1313 * On databases like Samba's registry, this can damage the higher-level
1314 * data structures built from the various tdb-level records.
1316 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1320 /* update the dmaster field to point to us */
1321 hdr = (struct ctdb_ltdb_header *)data.dptr;
1322 if (!params->persistent) {
1323 hdr->dmaster = params->ctdb->pnn;
1324 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1327 /* add the record to the blob ready to send to the nodes */
1328 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1330 params->failed = true;
/* Grow the marshall buffer; over-allocate by pulldb_preallocation_size
 * to amortise the cost of repeated talloc_realloc_size() calls. */
1333 if (params->len + rec->length >= params->allocated_len) {
1334 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1335 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1337 if (params->recdata == NULL) {
1338 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1339 rec->length + params->len));
1340 params->failed = true;
1343 params->recdata->count++;
/* params->len is a byte offset into the marshall buffer; append rec there */
1344 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1345 params->len += rec->length;
1352 push the recdb database out to all nodes
1354 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1356 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1358 struct recdb_data params;
1359 struct ctdb_marshall_buffer *recdata;
1361 TALLOC_CTX *tmp_ctx;
1364 tmp_ctx = talloc_new(ctdb);
1365 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1367 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1368 CTDB_NO_MEMORY(ctdb, recdata);
1370 recdata->db_id = dbid;
1373 params.recdata = recdata;
1374 params.len = offsetof(struct ctdb_marshall_buffer, data);
1375 params.allocated_len = params.len;
1376 params.failed = false;
1377 params.persistent = persistent;
1379 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1380 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1381 talloc_free(params.recdata);
1382 talloc_free(tmp_ctx);
1386 if (params.failed) {
1387 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1388 talloc_free(params.recdata);
1389 talloc_free(tmp_ctx);
1393 recdata = params.recdata;
1395 outdata.dptr = (void *)recdata;
1396 outdata.dsize = params.len;
1398 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1399 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1401 CONTROL_TIMEOUT(), false, outdata,
1404 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1405 talloc_free(recdata);
1406 talloc_free(tmp_ctx);
1410 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1411 dbid, recdata->count));
1413 talloc_free(recdata);
1414 talloc_free(tmp_ctx);
1421 go through a full recovery on one database
/* Perform a full recovery of one database (dbid):
 *  1. create a temporary local recovery tdb (create_recdb),
 *  2. pull the database from all remote nodes into it,
 *  3. wipe the database on all active nodes (safe: inside transaction
 *     transaction_id),
 *  4. push the merged contents back out via push_recdb_database().
 * Returns 0 on success; error paths (elided in this view) return
 * non-zero so the caller aborts the recovery. */
1423 static int recover_database(struct ctdb_recoverd *rec,
1424 TALLOC_CTX *mem_ctx,
1428 struct ctdb_node_map *nodemap,
1429 uint32_t transaction_id)
1431 struct tdb_wrap *recdb;
1433 struct ctdb_context *ctdb = rec->ctdb;
1435 struct ctdb_control_wipe_database w;
1438 recdb = create_recdb(ctdb, mem_ctx);
1439 if (recdb == NULL) {
1443 /* pull all remote databases onto the recdb */
1444 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1446 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1450 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1452 /* wipe all the remote databases. This is safe as we are in a transaction */
1454 w.transaction_id = transaction_id;
1456 data.dptr = (void *)&w;
1457 data.dsize = sizeof(w);
1459 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1460 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1462 CONTROL_TIMEOUT(), false, data,
1465 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1470 /* push out the correct database. This sets the dmaster and skips
1471 the empty records */
1472 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1478 /* all done with this database */
/* Refresh the known and available public IP lists cached on each
 * ctdb->nodes[] entry by querying every node in nodemap (inactive nodes
 * are skipped after their stale lists are freed).  On failure *culprit
 * is set to the PNN of the node that failed so the caller can assign
 * banning credits.  May also set rec->need_takeover_run if the remote
 * IP allocation fails verification. */
1484 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1485 struct ctdb_recoverd *rec,
1486 struct ctdb_node_map *nodemap,
/* Sanity check: the daemon's node array and the nodemap must agree */
1492 if (ctdb->num_nodes != nodemap->num) {
1493 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1494 ctdb->num_nodes, nodemap->num));
1496 *culprit = ctdb->pnn;
1501 for (j=0; j<nodemap->num; j++) {
1502 /* For readability */
1503 struct ctdb_node *node = ctdb->nodes[j];
1505 /* release any existing data */
1506 if (node->known_public_ips) {
1507 talloc_free(node->known_public_ips);
1508 node->known_public_ips = NULL;
1510 if (node->available_public_ips) {
1511 talloc_free(node->available_public_ips);
1512 node->available_public_ips = NULL;
1515 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1519 /* Retrieve the list of known public IPs from the node */
1520 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1525 &node->known_public_ips);
1528 ("Failed to read known public IPs from node: %u\n",
1531 *culprit = node->pnn;
/* Only the recmaster checks IP allocation, and only while takeover
 * runs are not disabled */
1536 if (ctdb->do_checkpublicip &&
1537 rec->takeover_runs_disable_ctx == NULL &&
1538 verify_remote_ip_allocation(ctdb,
1539 node->known_public_ips,
1541 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1542 rec->need_takeover_run = true;
1545 /* Retrieve the list of available public IPs from the node */
1546 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1550 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1551 &node->available_public_ips);
1554 ("Failed to read available public IPs from node: %u\n",
1557 *culprit = node->pnn;
1566 /* when we start a recovery, make sure all nodes use the same reclock file
/* Broadcast this node's recovery lock file setting to all active nodes
 * (CTDB_CONTROL_SET_RECLOCK_FILE) so the whole cluster agrees on the
 * reclock path before a recovery starts.  Returns non-zero if the
 * broadcast fails; early-returns (elided) when no reclock file is set. */
1569 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1571 struct ctdb_context *ctdb = rec->ctdb;
1572 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1576 if (ctdb->recovery_lock_file == NULL) {
/* +1 so the terminating NUL travels with the string */
1580 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1581 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1584 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1585 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1591 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1592 talloc_free(tmp_ctx);
1596 talloc_free(tmp_ctx);
1602 * this callback is called for every node that failed to execute ctdb_takeover_run()
1603 * and set flag to re-run takeover run.
/* ctdb_client_async_control() fail-callback for ctdb_takeover_run():
 * logs the failing node and, when callback_data carries the recoverd
 * context, marks that node as the recovery culprit (banning credit). */
1605 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1607 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1609 if (callback_data != NULL) {
1610 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1612 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1614 ctdb_set_culprit(rec, node_pnn);
/* Ban any node whose accumulated banning credits reach 2 * num_nodes,
 * for recovery_ban_period seconds, and reset its credit counter.
 * Sets *self_ban to true if this node banned itself so the caller can
 * abort the current recovery. */
1619 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1621 struct ctdb_context *ctdb = rec->ctdb;
1623 struct ctdb_banning_state *ban_state;
1626 for (i=0; i<ctdb->num_nodes; i++) {
1627 if (ctdb->nodes[i]->ban_state == NULL) {
1630 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* threshold: twice the cluster size worth of credits */
1631 if (ban_state->count < 2*ctdb->num_nodes) {
1635 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1636 ctdb->nodes[i]->pnn, ban_state->count,
1637 ctdb->tunable.recovery_ban_period));
1638 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1639 ban_state->count = 0;
1641 /* Banning ourself? */
1642 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/* Run a public-IP takeover run from the recovery master.
 * Guards against re-entry (takeover_run_in_progress) and against
 * explicitly disabled takeover runs, temporarily disables takeover runs
 * on the other connected nodes (fire-and-forget messages), invokes
 * ctdb_takeover_run(), then re-enables takeover runs on the peers.
 * Returns true on success; on failure need_takeover_run is left set so
 * the run is retried.  When banning_credits_on_fail is true, nodes that
 * fail the run accumulate banning credits via takeover_fail_callback(). */
1648 static bool do_takeover_run(struct ctdb_recoverd *rec,
1649 struct ctdb_node_map *nodemap,
1650 bool banning_credits_on_fail)
1652 uint32_t *nodes = NULL;
1653 struct srvid_request dtr;
1656 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1660 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1662 if (rec->takeover_run_in_progress) {
1663 DEBUG(DEBUG_ERR, (__location__
1664 " takeover run already in progress \n"));
1669 rec->takeover_run_in_progress = true;
1671 /* If takeover runs are in disabled then fail... */
1672 if (rec->takeover_runs_disable_ctx != NULL) {
1674 ("Takeover runs are disabled so refusing to run one\n"));
1679 /* Disable IP checks (takeover runs, really) on other nodes
1680 * while doing this takeover run. This will stop those other
1681 * nodes from triggering takeover runs when think they should
1682 * be hosting an IP but it isn't yet on an interface. Don't
1683 * wait for replies since a failure here might cause some
1684 * noise in the logs but will not actually cause a problem.
1686 dtr.srvid = 0; /* No reply */
1689 data.dptr = (uint8_t*)&dtr;
1690 data.dsize = sizeof(dtr);
1692 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1694 /* Disable for 60 seconds. This can be a tunable later if
1698 for (i = 0; i < talloc_array_length(nodes); i++) {
1699 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1700 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1702 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1706 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1707 rec->force_rebalance_nodes,
1708 takeover_fail_callback,
1709 banning_credits_on_fail ? rec : NULL);
1711 /* Reenable takeover runs and IP checks on other nodes */
1713 for (i = 0; i < talloc_array_length(nodes); i++) {
1714 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1715 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1717 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1722 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
/* Only clear the rebalance targets if nobody replaced the list while
 * the takeover run was executing */
1728 /* Takeover run was successful so clear force rebalance targets */
1729 if (rebalance_nodes == rec->force_rebalance_nodes) {
1730 TALLOC_FREE(rec->force_rebalance_nodes);
1732 DEBUG(DEBUG_WARNING,
1733 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1736 rec->need_takeover_run = !ok;
1738 rec->takeover_run_in_progress = false;
1740 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1746 we are the recmaster, and recovery is needed - start a recovery run
/* Main recovery procedure, executed on the recovery master when
 * recovery is needed:
 *   - ban misbehaving nodes (abort if we banned ourselves),
 *   - optionally take the cluster-wide recovery lock,
 *   - create any missing local/remote databases and sync db priorities,
 *   - set recovery mode ACTIVE and run the "startrecovery" event,
 *   - push a new generation/vnnmap, start transactions on all nodes,
 *   - recover every database via recover_database(), commit,
 *   - rebuild the vnnmap from lmaster-capable active nodes and push it,
 *   - set ourselves as recmaster everywhere, sync node flags,
 *   - return to NORMAL mode, reload public IPs, do a takeover run,
 *   - run the "recovered" event, broadcast CTDB_SRVID_RECONFIGURE,
 *   - forgive banning credits and suppress re-recovery for
 *     rerecovery_timeout seconds.
 * rec->need_recovery stays true on failure so recovery is retried.
 * NOTE(review): many lines (braces, returns, some declarations) are
 * elided from this view; the text below is not the complete function. */
1748 static int do_recovery(struct ctdb_recoverd *rec,
1749 TALLOC_CTX *mem_ctx, uint32_t pnn,
1750 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1752 struct ctdb_context *ctdb = rec->ctdb;
1754 uint32_t generation;
1755 struct ctdb_dbid_map *dbmap;
1758 struct timeval start_time;
1759 uint32_t culprit = (uint32_t)-1;
1762 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1764 /* if recovery fails, force it again */
1765 rec->need_recovery = true;
1767 ban_misbehaving_nodes(rec, &self_ban);
1769 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1773 if (ctdb->tunable.verify_recovery_lock != 0) {
1774 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1775 start_time = timeval_current();
1776 if (!ctdb_recovery_lock(ctdb, true)) {
1777 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1778 "and ban ourself for %u seconds\n",
1779 ctdb->tunable.recovery_ban_period));
1780 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1783 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1784 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1787 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1789 /* get a list of all databases */
1790 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1792 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1796 /* we do the db creation before we set the recovery mode, so the freeze happens
1797 on all databases we will be dealing with. */
1799 /* verify that we have all the databases any other node has */
1800 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1802 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1806 /* verify that all other nodes have all our databases */
1807 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1809 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1812 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1814 /* update the database priority for all remote databases */
1815 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1817 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1819 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1822 /* update all other nodes to use the same setting for reclock files
1823 as the local recovery master.
1825 sync_recovery_lock_file_across_cluster(rec);
1827 /* set recovery mode to active on all nodes */
1828 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1830 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1834 /* execute the "startrecovery" event script on all nodes */
1835 ret = run_startrecovery_eventscript(rec, nodemap);
1837 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1842 update all nodes to have the same flags that we have
1844 for (i=0;i<nodemap->num;i++) {
1845 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1849 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
/* A failure on an inactive node is only a warning - it cannot take
 * part in the recovery anyway */
1851 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1852 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1854 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1860 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1862 /* pick a new generation number */
1863 generation = new_generation();
1865 /* change the vnnmap on this node to use the new generation
1866 number but not on any other nodes.
1867 this guarantees that if we abort the recovery prematurely
1868 for some reason (a node stops responding?)
1869 that we can just return immediately and we will reenter
1870 recovery shortly again.
1871 I.e. we deliberately leave the cluster with an inconsistent
1872 generation id to allow us to abort recovery at any stage and
1873 just restart it from scratch.
1875 vnnmap->generation = generation;
1876 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1878 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1882 data.dptr = (void *)&generation;
1883 data.dsize = sizeof(uint32_t);
1885 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1886 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1888 CONTROL_TIMEOUT(), false, data,
1890 transaction_start_fail_callback,
1892 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
/* Best-effort rollback: cancel any transactions that did start */
1893 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1895 CONTROL_TIMEOUT(), false, tdb_null,
1899 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1904 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1906 for (i=0;i<dbmap->num;i++) {
1907 ret = recover_database(rec, mem_ctx,
1909 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1910 pnn, nodemap, generation);
1912 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1917 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1919 /* commit all the changes */
1920 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1922 CONTROL_TIMEOUT(), false, data,
1925 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1929 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1932 /* update the capabilities for all nodes */
1933 ret = update_capabilities(ctdb, nodemap);
1935 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1939 /* build a new vnn map with all the currently active and
1941 generation = new_generation();
1942 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1943 CTDB_NO_MEMORY(ctdb, vnnmap);
1944 vnnmap->generation = generation;
1946 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1947 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1948 for (i=j=0;i<nodemap->num;i++) {
1949 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1952 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1953 /* this node can not be an lmaster */
1954 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1959 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1960 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1961 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* Degenerate case: no lmaster-capable active node - fall back to the
 * recmaster (this node) so the vnnmap is never empty */
1964 if (vnnmap->size == 0) {
1965 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1967 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1968 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1969 vnnmap->map[0] = pnn;
1972 /* update to the new vnnmap on all nodes */
1973 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1975 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1979 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1981 /* update recmaster to point to us for all nodes */
1982 ret = set_recovery_master(ctdb, nodemap, pnn);
1984 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1988 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1991 update all nodes to have the same flags that we have
1993 for (i=0;i<nodemap->num;i++) {
1994 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1998 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
2000 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2005 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2007 /* disable recovery mode */
2008 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
2010 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2014 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2016 /* Fetch known/available public IPs from each active node */
2017 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2019 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2021 rec->need_takeover_run = true;
2025 do_takeover_run(rec, nodemap, false);
2027 /* execute the "recovered" event script on all nodes */
2028 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2030 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2034 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2036 /* send a message to all clients telling them that the cluster
2037 has been reconfigured */
2038 ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
2040 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2042 rec->need_recovery = false;
2044 /* we managed to complete a full recovery, make sure to forgive
2045 any past sins by the nodes that could now participate in the
2048 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2049 for (i=0;i<nodemap->num;i++) {
2050 struct ctdb_banning_state *ban_state;
2052 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2056 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2057 if (ban_state == NULL) {
2061 ban_state->count = 0;
2065 /* We just finished a recovery successfully.
2066 We now wait for rerecovery_timeout before we allow
2067 another recovery to take place.
2069 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout))
2070 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
2071 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"))
2078 elections are won by first checking the number of connected nodes, then
2079 the priority time, then the pnn
/* Payload broadcast during a recmaster election.  Winner is decided by
 * comparing, in order: number of connected nodes, priority (start)
 * time, then PNN - see ctdb_election_win().  node_flags lets peers
 * auto-win against banned/stopped candidates. */
2081 struct election_message {
2082 uint32_t num_connected;
2083 struct timeval priority_time;
2085 uint32_t node_flags;
2089 form this nodes election data
/* Fill *em with this node's election credentials: PNN, daemon start
 * (priority) time, node flags and the count of connected nodes.
 * A node lacking CTDB_CAP_RECMASTER zeroes its connected count and
 * resets priority_time so it loses to any capable candidate.
 * Also refreshes rec->node_flags from the current nodemap. */
2091 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2094 struct ctdb_node_map *nodemap;
2095 struct ctdb_context *ctdb = rec->ctdb;
2099 em->pnn = rec->ctdb->pnn;
2100 em->priority_time = rec->priority_time;
2102 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2104 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
2108 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2109 em->node_flags = rec->node_flags;
2111 for (i=0;i<nodemap->num;i++) {
2112 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2113 em->num_connected++;
2117 /* we shouldnt try to win this election if we cant be a recmaster */
2118 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2119 em->num_connected = 0;
2120 em->priority_time = timeval_current();
2123 talloc_free(nodemap);
2127 see if the given election data wins
/* Decide whether this node beats the election message *em sent by a
 * peer.  We lose outright if we lack the recmaster capability or are
 * banned/stopped; we win outright if the peer is banned/stopped.
 * Otherwise compare: most connected nodes, then earliest priority
 * time (longest-running daemon), then lowest PNN as tie-breaker.
 * Returns true if this node wins. */
2129 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2131 struct election_message myem;
2134 ctdb_election_data(rec, &myem);
2136 /* we cant win if we dont have the recmaster capability */
2137 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2141 /* we cant win if we are banned */
2142 if (rec->node_flags & NODE_FLAGS_BANNED) {
2146 /* we cant win if we are stopped */
2147 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2151 /* we will automatically win if the other node is banned */
2152 if (em->node_flags & NODE_FLAGS_BANNED) {
2156 /* we will automatically win if the other node is banned */
2157 if (em->node_flags & NODE_FLAGS_STOPPED) {
2161 /* try to use the most connected node */
2163 cmp = (int)myem.num_connected - (int)em->num_connected;
2166 /* then the longest running node */
2168 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2172 cmp = (int)myem.pnn - (int)em->pnn;
2179 send out an election request
/* Broadcast our recmaster election request (CTDB_SRVID_RECOVERY) to all
 * nodes.  Optimistically sets the recmaster to ourselves on the local
 * node first; returns non-zero if that local control fails. */
2181 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2184 TDB_DATA election_data;
2185 struct election_message emsg;
2187 struct ctdb_context *ctdb = rec->ctdb;
2189 srvid = CTDB_SRVID_RECOVERY;
2191 ctdb_election_data(rec, &emsg);
2193 election_data.dsize = sizeof(struct election_message);
2194 election_data.dptr = (unsigned char *)&emsg;
2197 /* first we assume we will win the election and set
2198 recoverymaster to be ourself on the current node
2200 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2202 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2207 /* send an election message to all active nodes */
2208 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2209 ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2215 this function will unban all nodes in the cluster
/* Clear the BANNED flag on every connected node in the cluster.
 * Best-effort: failures of the per-node modflags control are ignored. */
2217 static void unban_all_nodes(struct ctdb_context *ctdb)
2220 struct ctdb_node_map *nodemap;
2221 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2223 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2225 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2229 for (i=0;i<nodemap->num;i++) {
2230 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2231 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2232 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
2236 talloc_free(tmp_ctx);
2241 we think we are winning the election - send a broadcast election request
/* Timed-event callback: we believe we are winning the election, so
 * (re)broadcast our election request, then clear the one-shot timer
 * handle so it can be re-armed. */
2243 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2245 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2248 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2250 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2253 talloc_free(rec->send_election_te);
2254 rec->send_election_te = NULL;
2258 handler for memory dumps
/* SRVID message handler: produce a talloc memory-usage dump of this
 * recovery daemon and send it back to the requester identified by the
 * srvid_request (pnn + srvid) carried in the message payload. */
2260 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2261 TDB_DATA data, void *private_data)
2263 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2266 struct srvid_request *rd;
2268 if (data.dsize != sizeof(struct srvid_request)) {
2269 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2270 talloc_free(tmp_ctx);
2273 rd = (struct srvid_request *)data.dptr;
2275 dump = talloc_zero(tmp_ctx, TDB_DATA);
2277 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2278 talloc_free(tmp_ctx);
2281 ret = ctdb_dump_memory(ctdb, dump);
2283 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2284 talloc_free(tmp_ctx);
2288 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2290 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2292 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2293 talloc_free(tmp_ctx);
2297 talloc_free(tmp_ctx);
/* SRVID message handler: collect this daemon's log ringbuffer and ship
 * it to the address in the ctdb_get_log_addr payload.  The collection
 * runs in a forked child (switched into client mode) so the recovery
 * daemon itself is not blocked; the child's exit path is elided from
 * this view. */
2303 static void getlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2304 TDB_DATA data, void *private_data)
2306 struct ctdb_get_log_addr *log_addr;
2309 if (data.dsize != sizeof(struct ctdb_get_log_addr)) {
2310 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2313 log_addr = (struct ctdb_get_log_addr *)data.dptr;
/* fork variant that keeps the ringbuffer intact for the child to read */
2315 child = ctdb_fork_no_free_ringbuffer(ctdb);
2316 if (child == (pid_t)-1) {
2317 DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
2322 ctdb_set_process_name("ctdb_rec_log_collector");
2323 if (switch_from_server_to_client(ctdb, "recoverd-log-collector") != 0) {
2324 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
2327 ctdb_collect_log(ctdb, log_addr);
2333 handler for clearlog
/* SRVID message handler: clear this daemon's in-memory log ringbuffer. */
2335 static void clearlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2336 TDB_DATA data, void *private_data)
2338 ctdb_clear_log(ctdb);
2342 handler for reload_nodes
/* SRVID message handler: re-read the nodes file so topology changes
 * (added/removed nodes) become visible to the recovery daemon. */
2344 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2345 TDB_DATA data, void *private_data)
2347 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2349 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2351 ctdb_load_nodes_file(rec->ctdb);
/* Timed-event callback for deferred rebalancing: when the timer fires
 * and there are still force_rebalance_nodes pending, run a takeover run
 * to move IPs onto the newly added node(s). */
2355 static void ctdb_rebalance_timeout(struct event_context *ev,
2356 struct timed_event *te,
2357 struct timeval t, void *p)
2359 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2361 if (rec->force_rebalance_nodes == NULL) {
2363 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2368 ("Rebalance timeout occurred - do takeover run\n"));
2369 do_takeover_run(rec, rec->nodemap, false);
/* SRVID message handler (recmaster only): a node was added and its IPs
 * should be rebalanced.  Appends the PNN from the message to
 * rec->force_rebalance_nodes and (re)arms a deferred timer
 * (deferred_rebalance_on_node_add seconds) that triggers
 * ctdb_rebalance_timeout().  The timer hangs off the node array so
 * freeing/replacing the array also cancels the pending timer. */
2373 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2375 TDB_DATA data, void *private_data)
2380 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2382 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2386 if (data.dsize != sizeof(uint32_t)) {
2387 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
/* deferred_rebalance_on_node_add == 0 disables this feature */
2391 if (ctdb->tunable.deferred_rebalance_on_node_add == 0) {
2395 pnn = *(uint32_t *)&data.dptr[0];
2397 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2399 /* Copy any existing list of nodes. There's probably some
2400 * sort of realloc variant that will do this but we need to
2401 * make sure that freeing the old array also cancels the timer
2402 * event for the timeout... not sure if realloc will do that.
2404 len = (rec->force_rebalance_nodes != NULL) ?
2405 talloc_array_length(rec->force_rebalance_nodes) :
2408 /* This allows duplicates to be added but they don't cause
2409 * harm. A call to add a duplicate PNN arguably means that
2410 * the timeout should be reset, so this is the simplest
2413 t = talloc_zero_array(rec, uint32_t, len+1);
2414 CTDB_NO_MEMORY_VOID(ctdb, t);
2416 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2420 talloc_free(rec->force_rebalance_nodes);
2422 rec->force_rebalance_nodes = t;
2423 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2424 timeval_current_ofs(ctdb->tunable.deferred_rebalance_on_node_add, 0),
2425 ctdb_rebalance_timeout, rec);
/* SRVID message handler (recmaster only): a node reports that a public
 * IP moved; record the new assignment in the IP assignment tree. */
2430 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2431 TDB_DATA data, void *private_data)
2433 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2434 struct ctdb_public_ip *ip;
2436 if (rec->recmaster != rec->ctdb->pnn) {
2437 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2441 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2442 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2446 ip = (struct ctdb_public_ip *)data.dptr;
2448 update_ip_assignment_tree(rec->ctdb, ip);
/* Re-enable takeover runs: freeing takeover_runs_disable_ctx (the
 * "disabled" marker) also cancels the timeout timer parented on it. */
2452 static void clear_takeover_runs_disable(struct ctdb_recoverd *rec)
2454 TALLOC_FREE(rec->takeover_runs_disable_ctx);
/* Timed-event callback: the takeover-runs-disable period expired, so
 * re-enable takeover runs. */
2457 static void reenable_takeover_runs(struct event_context *ev,
2458 struct timed_event *te,
2459 struct timeval yt, void *p)
2461 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2463 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs after timeout\n"));
2464 clear_takeover_runs_disable(rec);
/* SRVID message handler: disable (or, with timeout 0, re-enable)
 * takeover runs for the number of seconds given in the srvid_request
 * payload.  The "disabled" state is represented by the allocation of
 * takeover_runs_disable_ctx, which also owns the re-enable timer.
 * Replies with this node's PNN on success (negative/other on failure -
 * the failure reply paths are elided from this view). */
2467 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2468 uint64_t srvid, TDB_DATA data,
2471 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2472 struct ctdb_recoverd);
2473 struct srvid_request *r;
2478 /* Validate input data */
2479 if (data.dsize != sizeof(struct srvid_request)) {
2480 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2481 "expecting %lu\n", (long unsigned)data.dsize,
2482 (long unsigned)sizeof(struct srvid_request)));
2486 if (data.dptr == NULL) {
2487 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2492 r = (struct srvid_request *)data.dptr;
/* A timeout of 0 means "re-enable now" */
2496 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs\n"));
2497 clear_takeover_runs_disable(rec);
2498 ret = ctdb_get_pnn(ctdb);
2502 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
2504 ("Refusing to disable takeover runs on inactive node\n"));
2509 if (rec->takeover_run_in_progress) {
2511 ("Unable to disable takeover runs - in progress\n"));
2516 DEBUG(DEBUG_NOTICE,("Disabling takeover runs for %u seconds\n", timeout));
2518 /* Clear any old timers */
2519 clear_takeover_runs_disable(rec);
2521 /* When this is non-NULL it indicates that takeover runs are
2522 * disabled. This context also holds the timeout timer.
2524 rec->takeover_runs_disable_ctx = talloc_new(rec);
2525 if (rec->takeover_runs_disable_ctx == NULL) {
2526 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate memory\n"));
2531 /* Arrange for the timeout to occur */
2532 event_add_timed(ctdb->ev, rec->takeover_runs_disable_ctx,
2533 timeval_current_ofs(timeout, 0),
2534 reenable_takeover_runs,
2537 /* Returning our PNN tells the caller that we succeeded */
2538 ret = ctdb_get_pnn(ctdb);
2540 result.dsize = sizeof(int32_t);
2541 result.dptr = (uint8_t *)&ret;
2542 srvid_request_reply(ctdb, r, result);
2545 /* Backward compatibility for this SRVID - call
2546 * disable_takeover_runs_handler() instead
/* Legacy SRVID handler: translate the old "disable IP check" message
 * (payload is a bare uint32_t timeout) into a srvid_request with
 * srvid==0 (no reply) and forward it to
 * disable_takeover_runs_handler(). */
2548 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2549 TDB_DATA data, void *private_data)
2551 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2552 struct ctdb_recoverd);
2554 struct srvid_request *req;
2556 if (data.dsize != sizeof(uint32_t)) {
2557 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2558 "expecting %lu\n", (long unsigned)data.dsize,
2559 (long unsigned)sizeof(uint32_t)));
2562 if (data.dptr == NULL) {
2563 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2567 req = talloc(ctdb, struct srvid_request);
2568 CTDB_NO_MEMORY_VOID(ctdb, req);
2570 req->srvid = 0; /* No reply */
2572 req->data = *((uint32_t *)data.dptr); /* Timeout */
2574 data2.dsize = sizeof(*req);
2575 data2.dptr = (uint8_t *)req;
2577 disable_takeover_runs_handler(rec->ctdb,
2578 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
/*
 * SRVID handler for IP reallocate requests: queue the request; it is
 * serviced later by process_ipreallocate_requests() from the monitor
 * loop, avoiding recursion into takeover_run().
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
2583 handler for ip reallocate, just add it to the list of requests and
2584 handle this later in the monitor_cluster loop so we do not recurse
2585 with other requests to takeover_run()
2587 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2588 TDB_DATA data, void *private_data)
2590 struct srvid_request *request;
2591 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2592 struct ctdb_recoverd);
/* The payload is the caller's reply address (struct srvid_request). */
2594 if (data.dsize != sizeof(struct srvid_request)) {
2595 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2599 request = (struct srvid_request *)data.dptr;
/* Deferred: appended to rec->reallocate_requests for the monitor loop. */
2601 srvid_request_add(ctdb, &rec->reallocate_requests, request);
/*
 * Service all queued IP reallocate requests: refresh the remote public
 * IP knowledge, run a takeover run, then reply to every queued
 * requester with our PNN (success) via srvid_requests_reply(), which
 * also frees the queue.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
2604 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2605 struct ctdb_recoverd *rec)
2611 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2613 /* update the list of public ips that a node can handle for
/* If a remote node's IP list cannot be read, remember that a takeover
 * run is still needed so the monitor loop retries. */
2616 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2618 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2620 rec->need_takeover_run = true;
2623 if (do_takeover_run(rec, rec->nodemap, false)) {
2624 ret = ctdb_get_pnn(ctdb);
/* Reply payload: our PNN on success (int32_t). */
2630 result.dsize = sizeof(int32_t);
2631 result.dptr = (uint8_t *)&ret;
2633 srvid_requests_reply(ctdb, &rec->reallocate_requests, result);
/*
 * SRVID handler for recovery-master election packets.  Resets the
 * election timeout, compares the sender's election data with our own,
 * and either schedules a counter-election message (we win) or accepts
 * the sender as recmaster (we lose), releasing the recovery lock if we
 * held it.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
2638 handler for recovery master elections
2640 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2641 TDB_DATA data, void *private_data)
2643 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2645 struct election_message *em = (struct election_message *)data.dptr;
2646 TALLOC_CTX *mem_ctx;
2648 /* we got an election packet - update the timeout for the election */
2649 talloc_free(rec->election_timeout);
2650 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2652 timeval_current_ofs(0, 500000) :
2653 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2654 ctdb_election_timeout, rec);
2656 mem_ctx = talloc_new(ctdb);
2658 /* someone called an election. check their election data
2659 and if we disagree and we would rather be the elected node,
2660 send a new election message to all other nodes
/* We would win: delay our counter-request 0.5s so replies can coalesce. */
2662 if (ctdb_election_win(rec, em)) {
2663 if (!rec->send_election_te) {
2664 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2665 timeval_current_ofs(0, 500000),
2666 election_send_request, rec);
2668 talloc_free(mem_ctx);
2669 /*unban_all_nodes(ctdb);*/
/* We lose: cancel any pending counter-election message. */
2674 talloc_free(rec->send_election_te);
2675 rec->send_election_te = NULL;
2677 if (ctdb->tunable.verify_recovery_lock != 0) {
2678 /* release the recmaster lock */
2679 if (em->pnn != ctdb->pnn &&
2680 ctdb->recovery_lock_fd != -1) {
2681 close(ctdb->recovery_lock_fd);
2682 ctdb->recovery_lock_fd = -1;
2683 unban_all_nodes(ctdb);
2687 /* ok, let that guy become recmaster then */
2688 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2690 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2691 talloc_free(mem_ctx);
2695 talloc_free(mem_ctx);
/*
 * Kick off a recmaster election: put the cluster into recovery mode
 * (quiescing internode traffic), arm the election timeout, broadcast
 * our election request and wait for responses.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
2701 force the start of the election process
2703 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2704 struct ctdb_node_map *nodemap)
2707 struct ctdb_context *ctdb = rec->ctdb;
2709 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2711 /* set all nodes to recovery mode to stop all internode traffic */
2712 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2714 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* Re-arm the election timeout; frees any previous timer first. */
2718 talloc_free(rec->election_timeout);
2719 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2721 timeval_current_ofs(0, 500000) :
2722 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2723 ctdb_election_timeout, rec);
2725 ret = send_election_request(rec, pnn);
2727 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2731 /* wait for a few seconds to collect all responses */
2732 ctdb_wait_election(rec);
/*
 * SRVID handler invoked when a node's flags change.  Refreshes the
 * local nodemap, records the new flags, and — if we are the recmaster
 * and in normal mode — schedules a takeover run when the DISABLED
 * (perm-disabled/unhealthy) bit flipped.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
2738 handler for when a node changes its flags
2740 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2741 TDB_DATA data, void *private_data)
2744 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2745 struct ctdb_node_map *nodemap=NULL;
2746 TALLOC_CTX *tmp_ctx;
2748 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2749 int disabled_flag_changed;
2751 if (data.dsize != sizeof(*c)) {
2752 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2756 tmp_ctx = talloc_new(ctdb);
2757 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2759 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2761 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2762 talloc_free(tmp_ctx);
/* Locate the changed node in the nodemap by PNN. */
2767 for (i=0;i<nodemap->num;i++) {
2768 if (nodemap->nodes[i].pnn == c->pnn) break;
2771 if (i == nodemap->num) {
2772 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2773 talloc_free(tmp_ctx);
2777 if (c->old_flags != c->new_flags) {
2778 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* XOR against the stored flags so only a DISABLED-bit transition counts. */
2781 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2783 nodemap->nodes[i].flags = c->new_flags;
2785 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2786 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2789 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2790 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2794 ctdb->recovery_master == ctdb->pnn &&
2795 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2796 /* Only do the takeover run if the perm disabled or unhealthy
2797 flags changed since these will cause an ip failover but not
2799 If the node became disconnected or banned this will also
2800 lead to an ip address failover but that is handled
2803 if (disabled_flag_changed) {
2804 rec->need_takeover_run = true;
2808 talloc_free(tmp_ctx);
/*
 * SRVID handler to propagate a node-flags change: fetch the nodemap
 * from the recmaster, validate the target PNN, then push a MODIFY_FLAGS
 * control to all connected nodes.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
2812 handler for when we need to push out flag changes ot all other nodes
2814 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2815 TDB_DATA data, void *private_data)
2818 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2819 struct ctdb_node_map *nodemap=NULL;
2820 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2824 /* find the recovery master */
2825 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2827 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2828 talloc_free(tmp_ctx);
2832 /* read the node flags from the recmaster */
2833 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2835 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2836 talloc_free(tmp_ctx);
2839 if (c->pnn >= nodemap->num) {
2840 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2841 talloc_free(tmp_ctx);
2845 /* send the flags update to all connected nodes */
2846 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2848 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2849 nodes, 0, CONTROL_TIMEOUT(),
2853 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2855 talloc_free(tmp_ctx);
2859 talloc_free(tmp_ctx);
/*
 * Shared state for the async recmode verification fan-out
 * (verify_recmode / verify_recmode_normal_callback).
 * NOTE(review): extract elides some members (e.g. the outstanding-reply
 * counter used as rmdata->count) — confirm against the full source.
 */
2863 struct verify_recmode_normal_data {
/* Aggregated outcome; stays MONITOR_OK unless a callback downgrades it. */
2865 enum monitor_result status;
/*
 * Completion callback for one GET_RECMODE control: marks the shared
 * status MONITOR_FAILED on control failure, or MONITOR_RECOVERY_NEEDED
 * if the replying node is not in normal recovery mode.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
2868 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2870 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2873 /* one more node has responded with recmode data*/
2876 /* if we failed to get the recmode, then return an error and let
2877 the main loop try again.
2879 if (state->state != CTDB_CONTROL_DONE) {
/* Only downgrade from OK; never overwrite an earlier worse status. */
2880 if (rmdata->status == MONITOR_OK) {
2881 rmdata->status = MONITOR_FAILED;
2886 /* if we got a response, then the recmode will be stored in the
2889 if (state->status != CTDB_RECOVERY_NORMAL) {
2890 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2891 rmdata->status = MONITOR_RECOVERY_NEEDED;
/*
 * Fan out an async GET_RECMODE to every active node, pump the event
 * loop until all replies arrive, and return the aggregated
 * monitor_result (OK / FAILED / RECOVERY_NEEDED).
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
2898 /* verify that all nodes are in normal recovery mode */
2899 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2901 struct verify_recmode_normal_data *rmdata;
2902 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2903 struct ctdb_client_control_state *state;
2904 enum monitor_result status;
2907 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2908 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2910 rmdata->status = MONITOR_OK;
2912 /* loop over all active nodes and send an async getrecmode call to
/* Inactive (stopped/banned/disconnected) nodes are skipped. */
2914 for (j=0; j<nodemap->num; j++) {
2915 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2918 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2920 nodemap->nodes[j].pnn);
2921 if (state == NULL) {
2922 /* we failed to send the control, treat this as
2923 an error and try again next iteration
2925 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2926 talloc_free(mem_ctx);
2927 return MONITOR_FAILED;
2930 /* set up the callback functions */
2931 state->async.fn = verify_recmode_normal_callback;
2932 state->async.private_data = rmdata;
2934 /* one more control to wait for to complete */
2939 /* now wait for up to the maximum number of seconds allowed
2940 or until all nodes we expect a response from has replied
/* Blocking wait: each callback decrements rmdata->count. */
2942 while (rmdata->count > 0) {
2943 event_loop_once(ctdb->ev);
/* Copy out before freeing — rmdata lives on mem_ctx. */
2946 status = rmdata->status;
2947 talloc_free(mem_ctx);
/*
 * Shared state for the async recmaster verification fan-out
 * (verify_recmaster / verify_recmaster_callback).
 * NOTE(review): extract elides some members (e.g. count, pnn) —
 * confirm against the full source.
 */
2952 struct verify_recmaster_data {
/* Back-pointer used by the callback to blame a disagreeing node. */
2953 struct ctdb_recoverd *rec;
/* Aggregated outcome; stays MONITOR_OK unless a callback downgrades it. */
2956 enum monitor_result status;
/*
 * Completion callback for one GET_RECMASTER control: marks the shared
 * status MONITOR_FAILED on control failure, or MONITOR_ELECTION_NEEDED
 * (and records the node as culprit) if the replying node names a
 * different recmaster than expected.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
2959 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2961 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2964 /* one more node has responded with recmaster data*/
2967 /* if we failed to get the recmaster, then return an error and let
2968 the main loop try again.
2970 if (state->state != CTDB_CONTROL_DONE) {
/* Only downgrade from OK; never overwrite an earlier worse status. */
2971 if (rmdata->status == MONITOR_OK) {
2972 rmdata->status = MONITOR_FAILED;
2977 /* if we got a response, then the recmaster will be stored in the
2980 if (state->status != rmdata->pnn) {
2981 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2982 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2983 rmdata->status = MONITOR_ELECTION_NEEDED;
/*
 * Fan out an async GET_RECMASTER to every active node, pump the event
 * loop until all replies arrive, and return the aggregated
 * monitor_result (OK / FAILED / ELECTION_NEEDED).  Mirrors
 * verify_recmode() but checks recmaster agreement with `pnn`.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
2990 /* verify that all nodes agree that we are the recmaster */
2991 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2993 struct ctdb_context *ctdb = rec->ctdb;
2994 struct verify_recmaster_data *rmdata;
2995 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2996 struct ctdb_client_control_state *state;
2997 enum monitor_result status;
3000 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3001 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3005 rmdata->status = MONITOR_OK;
3007 /* loop over all active nodes and send an async getrecmaster call to
/* Inactive (stopped/banned/disconnected) nodes are skipped. */
3009 for (j=0; j<nodemap->num; j++) {
3010 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3013 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3015 nodemap->nodes[j].pnn);
3016 if (state == NULL) {
3017 /* we failed to send the control, treat this as
3018 an error and try again next iteration
3020 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3021 talloc_free(mem_ctx);
3022 return MONITOR_FAILED;
3025 /* set up the callback functions */
3026 state->async.fn = verify_recmaster_callback;
3027 state->async.private_data = rmdata;
3029 /* one more control to wait for to complete */
3034 /* now wait for up to the maximum number of seconds allowed
3035 or until all nodes we expect a response from has replied
/* Blocking wait: each callback decrements rmdata->count. */
3037 while (rmdata->count > 0) {
3038 event_loop_once(ctdb->ev);
/* Copy out before freeing — rmdata lives on mem_ctx. */
3041 status = rmdata->status;
3042 talloc_free(mem_ctx);
/*
 * Compare the freshly-read local interface list with the cached copy
 * in rec->ifaces.  Returns true on first call, on count mismatch, on
 * any name or link-state change, or when the list cannot be read
 * (fail-safe: assume changed).  Always re-caches the new list.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
3046 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3047 struct ctdb_recoverd *rec)
3049 struct ctdb_control_get_ifaces *ifaces = NULL;
3050 TALLOC_CTX *mem_ctx;
3053 mem_ctx = talloc_new(NULL);
3055 /* Read the interfaces from the local node */
3056 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3057 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3058 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3059 /* We could return an error. However, this will be
3060 * rare so we'll decide that the interfaces have
3061 * actually changed, just in case.
3063 talloc_free(mem_ctx);
3068 /* We haven't been here before so things have changed */
3069 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3071 } else if (rec->ifaces->num != ifaces->num) {
3072 /* Number of interfaces has changed */
3073 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3074 rec->ifaces->num, ifaces->num));
3077 /* See if interface names or link states have changed */
/* Positional comparison: assumes both lists enumerate interfaces in
 * the same slot order. */
3079 for (i = 0; i < rec->ifaces->num; i++) {
3080 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3081 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3083 ("Interface in slot %d changed: %s => %s\n",
3084 i, iface->name, ifaces->ifaces[i].name));
3088 if (iface->link_state != ifaces->ifaces[i].link_state) {
3090 ("Interface %s changed state: %d => %d\n",
3091 iface->name, iface->link_state,
3092 ifaces->ifaces[i].link_state));
/* Replace the cache; steal the new list onto rec's talloc tree. */
3099 talloc_free(rec->ifaces);
3100 rec->ifaces = talloc_steal(rec, ifaces);
3102 talloc_free(mem_ctx);
/*
 * Verify this node's public IP assignments are consistent: we hold the
 * IPs assigned to us, we don't hold IPs assigned elsewhere (release any
 * strays), and unassigned-but-servable IPs trigger a takeover run
 * request to the recmaster.  Uptime snapshots taken before and after
 * the interface check are compared so the check is skipped if a
 * recovery started/finished (or is in progress) meanwhile.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
3106 /* called to check that the local allocation of public ip addresses is ok.
3108 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3110 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3111 struct ctdb_uptime *uptime1 = NULL;
3112 struct ctdb_uptime *uptime2 = NULL;
3114 bool need_takeover_run = false;
/* First uptime snapshot (before the interface check). */
3116 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3117 CTDB_CURRENT_NODE, &uptime1);
3119 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3120 talloc_free(mem_ctx);
3124 if (interfaces_have_changed(ctdb, rec)) {
3125 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3126 "local node %u - force takeover run\n",
3128 need_takeover_run = true;
/* Second uptime snapshot (after the interface check). */
3131 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3132 CTDB_CURRENT_NODE, &uptime2);
3134 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3135 talloc_free(mem_ctx);
3139 /* skip the check if the startrecovery time has changed */
3140 if (timeval_compare(&uptime1->last_recovery_started,
3141 &uptime2->last_recovery_started) != 0) {
3142 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3143 talloc_free(mem_ctx);
3147 /* skip the check if the endrecovery time has changed */
3148 if (timeval_compare(&uptime1->last_recovery_finished,
3149 &uptime2->last_recovery_finished) != 0) {
3150 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3151 talloc_free(mem_ctx);
3155 /* skip the check if we have started but not finished recovery */
3156 if (timeval_compare(&uptime1->last_recovery_finished,
3157 &uptime1->last_recovery_started) != 1) {
3158 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3159 talloc_free(mem_ctx);
3164 /* verify that we have the ip addresses we should have
3165 and we dont have ones we shouldnt have.
3166 if we find an inconsistency we set recmode to
3167 active on the local node and wait for the recmaster
3168 to do a full blown recovery.
3169 also if the pnn is -1 and we are healthy and can host the ip
3170 we also request a ip reallocation.
3172 if (ctdb->tunable.disable_ip_failover == 0) {
3173 struct ctdb_all_public_ips *ips = NULL;
3175 /* read the *available* IPs from the local node */
3176 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3178 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3179 talloc_free(mem_ctx);
/* IP unassigned (pnn == -1) while we are fully healthy (flags == 0):
 * we could serve it, so request a takeover run. */
3183 for (j=0; j<ips->num; j++) {
3184 if (ips->ips[j].pnn == -1 &&
3185 nodemap->nodes[pnn].flags == 0) {
3186 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3187 ctdb_addr_to_str(&ips->ips[j].addr)));
3188 need_takeover_run = true;
3194 /* read the *known* IPs from the local node */
3195 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3197 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3198 talloc_free(mem_ctx);
3202 for (j=0; j<ips->num; j++) {
3203 if (ips->ips[j].pnn == pnn) {
3204 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3205 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3206 ctdb_addr_to_str(&ips->ips[j].addr)));
3207 need_takeover_run = true;
/* Holding an IP assigned to another node: release it locally. */
3210 if (ctdb->do_checkpublicip &&
3211 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3213 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3214 ctdb_addr_to_str(&ips->ips[j].addr)));
3216 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3217 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
/* Ask the recmaster for a takeover run via CTDB_SRVID_TAKEOVER_RUN. */
3224 if (need_takeover_run) {
3225 struct srvid_request rd;
3228 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3232 data.dptr = (uint8_t *)&rd;
3233 data.dsize = sizeof(rd);
3235 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3237 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3240 talloc_free(mem_ctx);
/*
 * Per-node callback for the GET_NODEMAP fan-out: stores the returned
 * nodemap into remote_nodemaps[node_pnn], taking talloc ownership of
 * the reply buffer.  Out-of-range PNNs are rejected.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
3245 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3247 struct ctdb_node_map **remote_nodemaps = callback_data;
3249 if (node_pnn >= ctdb->num_nodes) {
3250 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3254 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Pull the nodemap from every active node via one async
 * CTDB_CONTROL_GET_NODEMAP broadcast; results land in
 * remote_nodemaps[] through async_getnodemap_callback().
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
3258 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3259 struct ctdb_node_map *nodemap,
3260 struct ctdb_node_map **remote_nodemaps)
3264 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3265 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3267 CONTROL_TIMEOUT(), false, tdb_null,
3268 async_getnodemap_callback,
3270 remote_nodemaps) != 0) {
3271 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/*
 * Status of the reclock-checking child process, and the parent-side
 * state tracking it (child pid/pipe, timeout timer, fd event).
 * NOTE(review): extract elides some members (e.g. the pipe fd array
 * and child pid) — confirm against the full source.
 */
3279 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
3280 struct ctdb_check_reclock_state {
3281 struct ctdb_context *ctdb;
/* Start of the check; used to report lock latency in the destructor. */
3282 struct timeval start_time;
3285 struct timed_event *te;
3286 struct fd_event *fde;
3287 enum reclock_child_status status;
/*
 * Talloc destructor for the reclock check state: report the observed
 * lock latency, close both pipe ends if still open, and SIGKILL the
 * child so it cannot outlive the state.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
3290 /* when we free the reclock state we must kill any child process.
3292 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
3294 struct ctdb_context *ctdb = state->ctdb;
3296 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
3298 if (state->fd[0] != -1) {
3299 close(state->fd[0]);
3302 if (state->fd[1] != -1) {
3303 close(state->fd[1]);
3306 ctdb_kill(ctdb, state->child, SIGKILL);
/*
 * Timeout handler for the reclock child: fires when the child has not
 * reported back (e.g. cluster filesystem I/O is hanging) and marks the
 * check RECLOCK_TIMEOUT so check_recovery_lock() can stop waiting.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
3311 called if our check_reclock child times out. this would happen if
3312 i/o to the reclock file blocks.
3314 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
3315 struct timeval t, void *private_data)
3317 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
3318 struct ctdb_check_reclock_state);
3320 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3321 state->status = RECLOCK_TIMEOUT;
/*
 * FD event handler: the reclock child wrote its one-byte verdict to
 * the pipe.  Cancels the timeout timer, reads the byte, and sets the
 * shared status to RECLOCK_OK or RECLOCK_FAILED accordingly.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
3324 /* this is called when the child process has completed checking the reclock
3325 file and has written data back to us through the pipe.
3327 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
3328 uint16_t flags, void *private_data)
3330 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
3331 struct ctdb_check_reclock_state);
3335 /* we got a response from our child process so we can abort the
/* Free the timeout timer — the child answered in time. */
3338 talloc_free(state->te);
/* Single status byte; anything but RECLOCK_OK (or a short read) is failure. */
3341 ret = read(state->fd[0], &c, 1);
3342 if (ret != 1 || c != RECLOCK_OK) {
3343 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
3344 state->status = RECLOCK_FAILED;
3349 state->status = RECLOCK_OK;
/*
 * Verify we still hold the recovery lock by forking a child that reads
 * from recovery_lock_fd (a read that blocks if the cluster filesystem
 * is unhealthy) and reports one byte back over a pipe.  The parent
 * waits — with a 15s timeout — in the event loop; on failure the lock
 * fd is closed so the lock will be re-taken.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
3353 static int check_recovery_lock(struct ctdb_context *ctdb)
3356 struct ctdb_check_reclock_state *state;
3357 pid_t parent = getpid();
3359 if (ctdb->recovery_lock_fd == -1) {
3360 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
3364 state = talloc(ctdb, struct ctdb_check_reclock_state);
3365 CTDB_NO_MEMORY(ctdb, state);
3368 state->start_time = timeval_current();
3369 state->status = RECLOCK_CHECKING;
3373 ret = pipe(state->fd);
3376 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
3380 state->child = ctdb_fork(ctdb);
3381 if (state->child == (pid_t)-1) {
3382 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
3383 close(state->fd[0]);
3385 close(state->fd[1]);
/* ---- child: read 1 byte of the lock file, report verdict, then
 * linger until the parent disappears. ---- */
3391 if (state->child == 0) {
3392 char cc = RECLOCK_OK;
3393 close(state->fd[0]);
3396 ctdb_set_process_name("ctdb_rec_reclock");
3397 debug_extra = talloc_asprintf(NULL, "recovery-lock:");
3398 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
3399 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
3400 cc = RECLOCK_FAILED;
3403 write(state->fd[1], &cc, 1);
3404 /* make sure we die when our parent dies */
3405 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
/* ---- parent: close write end, arm timeout + pipe-read event. ---- */
3410 close(state->fd[1]);
3412 set_close_on_exec(state->fd[0]);
3414 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
/* Destructor kills the child and closes fds on any exit path. */
3416 talloc_set_destructor(state, check_reclock_destructor);
3418 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
3419 ctdb_check_reclock_timeout, state);
3420 if (state->te == NULL) {
3421 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
3426 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
3428 reclock_child_handler,
3431 if (state->fde == NULL) {
3432 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
3436 tevent_fd_set_auto_close(state->fde);
/* Pump the event loop until the child replies or the timer fires. */
3438 while (state->status == RECLOCK_CHECKING) {
3439 event_loop_once(ctdb->ev);
/* On failure drop the lock fd so the lock is re-acquired later. */
3442 if (state->status == RECLOCK_FAILED) {
3443 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
3444 close(ctdb->recovery_lock_fd);
3445 ctdb->recovery_lock_fd = -1;
/*
 * Re-read the reclock file path from the main daemon and reconcile the
 * local cached state.  Handles three cases: reclock disabled (NULL),
 * first-time set, and path changed — in each, any held lock fd is
 * closed and (where applicable) verification is turned off until the
 * lock is re-taken.
 * NOTE(review): extract elides short lines (returns/braces); tokens verbatim.
 */
3454 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3456 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3457 const char *reclockfile;
3459 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3460 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3461 talloc_free(tmp_ctx);
/* Case 1: daemon reports no reclock file — tear down local state. */
3465 if (reclockfile == NULL) {
3466 if (ctdb->recovery_lock_file != NULL) {
3467 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3468 talloc_free(ctdb->recovery_lock_file);
3469 ctdb->recovery_lock_file = NULL;
3470 if (ctdb->recovery_lock_fd != -1) {
3471 close(ctdb->recovery_lock_fd);
3472 ctdb->recovery_lock_fd = -1;
3475 ctdb->tunable.verify_recovery_lock = 0;
3476 talloc_free(tmp_ctx);
/* Case 2: no cached path yet — adopt the daemon's path. */
3480 if (ctdb->recovery_lock_file == NULL) {
3481 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3482 if (ctdb->recovery_lock_fd != -1) {
3483 close(ctdb->recovery_lock_fd);
3484 ctdb->recovery_lock_fd = -1;
3486 talloc_free(tmp_ctx);
/* Unchanged path: nothing to do. */
3491 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3492 talloc_free(tmp_ctx);
/* Case 3: path changed — swap in the new path and drop the old lock. */
3496 talloc_free(ctdb->recovery_lock_file);
3497 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3498 ctdb->tunable.verify_recovery_lock = 0;
3499 if (ctdb->recovery_lock_fd != -1) {
3500 close(ctdb->recovery_lock_fd);
3501 ctdb->recovery_lock_fd = -1;
3504 talloc_free(tmp_ctx);
3508 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3509 TALLOC_CTX *mem_ctx)
3512 struct ctdb_node_map *nodemap=NULL;
3513 struct ctdb_node_map *recmaster_nodemap=NULL;
3514 struct ctdb_node_map **remote_nodemaps=NULL;
3515 struct ctdb_vnn_map *vnnmap=NULL;
3516 struct ctdb_vnn_map *remote_vnnmap=NULL;
3517 int32_t debug_level;
3522 /* verify that the main daemon is still running */
3523 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3524 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3528 /* ping the local daemon to tell it we are alive */
3529 ctdb_ctrl_recd_ping(ctdb);
3531 if (rec->election_timeout) {
3532 /* an election is in progress */
3536 /* read the debug level from the parent and update locally */
3537 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3539 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3542 LogLevel = debug_level;
3544 /* get relevant tunables */
3545 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3547 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3551 /* get the current recovery lock file from the server */
3552 if (update_recovery_lock_file(ctdb) != 0) {
3553 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3557 /* Make sure that if recovery lock verification becomes disabled when
3560 if (ctdb->tunable.verify_recovery_lock == 0) {
3561 if (ctdb->recovery_lock_fd != -1) {
3562 close(ctdb->recovery_lock_fd);
3563 ctdb->recovery_lock_fd = -1;
3567 pnn = ctdb_get_pnn(ctdb);
3569 /* get the vnnmap */
3570 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3572 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3577 /* get number of nodes */
3579 talloc_free(rec->nodemap);
3580 rec->nodemap = NULL;
3583 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3585 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3588 nodemap = rec->nodemap;
3590 /* remember our own node flags */
3591 rec->node_flags = nodemap->nodes[pnn].flags;
3593 ban_misbehaving_nodes(rec, &self_ban);
3595 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3599 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3600 also frozen and that the recmode is set to active.
3602 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3603 /* If this node has become inactive then we want to
3604 * reduce the chances of it taking over the recovery
3605 * master role when it becomes active again. This
3606 * helps to stabilise the recovery master role so that
3607 * it stays on the most stable node.
3609 rec->priority_time = timeval_current();
3611 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3613 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3615 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3616 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3618 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3620 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3623 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3625 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3631 /* If this node is stopped or banned then it is not the recovery
3632 * master, so don't do anything. This prevents stopped or banned
3633 * node from starting election and sending unnecessary controls.
3638 /* check which node is the recovery master */
3639 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3641 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3645 /* If we are not the recmaster then do some housekeeping */
3646 if (rec->recmaster != pnn) {
3647 /* Ignore any IP reallocate requests - only recmaster
3650 TALLOC_FREE(rec->reallocate_requests);
3651 /* Clear any nodes that should be force rebalanced in
3652 * the next takeover run. If the recovery master role
3653 * has moved then we don't want to process these some
3654 * time in the future.
3656 TALLOC_FREE(rec->force_rebalance_nodes);
3659 /* This is a special case. When recovery daemon is started, recmaster
3660 * is set to -1. If a node is not started in stopped state, then
3661 * start election to decide recovery master
3663 if (rec->recmaster == (uint32_t)-1) {
3664 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3665 force_election(rec, pnn, nodemap);
3669 /* update the capabilities for all nodes */
3670 ret = update_capabilities(ctdb, nodemap);
3672 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3677 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3678 * but we have, then force an election and try to become the new
3681 if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3682 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3683 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3684 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3685 " but we (node %u) have - force an election\n",
3686 rec->recmaster, pnn));
3687 force_election(rec, pnn, nodemap);
3691 /* count how many active nodes there are */
3692 rec->num_active = 0;
3693 rec->num_lmasters = 0;
3694 rec->num_connected = 0;
3695 for (i=0; i<nodemap->num; i++) {
3696 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3698 if (rec->ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER) {
3699 rec->num_lmasters++;
3702 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3703 rec->num_connected++;
3708 /* verify that the recmaster node is still active */
3709 for (j=0; j<nodemap->num; j++) {
3710 if (nodemap->nodes[j].pnn==rec->recmaster) {
3715 if (j == nodemap->num) {
3716 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3717 force_election(rec, pnn, nodemap);
3721 /* if recovery master is disconnected we must elect a new recmaster */
3722 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3723 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3724 force_election(rec, pnn, nodemap);
3728 /* get nodemap from the recovery master to check if it is inactive */
3729 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3730 mem_ctx, &recmaster_nodemap);
3732 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3733 nodemap->nodes[j].pnn));
3738 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3739 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3740 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3742 * update our nodemap to carry the recmaster's notion of
3743 * its own flags, so that we don't keep freezing the
3744 * inactive recmaster node...
3746 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3747 force_election(rec, pnn, nodemap);
3751 /* verify that we have all ip addresses we should have and we dont
3752 * have addresses we shouldnt have.
3754 if (ctdb->tunable.disable_ip_failover == 0 &&
3755 rec->takeover_runs_disable_ctx == NULL) {
3756 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3757 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3762 /* if we are not the recmaster then we do not need to check
3763 if recovery is needed
3765 if (pnn != rec->recmaster) {
3770 /* ensure our local copies of flags are right */
3771 ret = update_local_flags(rec, nodemap);
3772 if (ret == MONITOR_ELECTION_NEEDED) {
3773 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3774 force_election(rec, pnn, nodemap);
3777 if (ret != MONITOR_OK) {
3778 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3782 if (ctdb->num_nodes != nodemap->num) {
3783 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3784 ctdb_load_nodes_file(ctdb);
3788 /* verify that all active nodes agree that we are the recmaster */
3789 switch (verify_recmaster(rec, nodemap, pnn)) {
3790 case MONITOR_RECOVERY_NEEDED:
3791 /* can not happen */
3793 case MONITOR_ELECTION_NEEDED:
3794 force_election(rec, pnn, nodemap);
3798 case MONITOR_FAILED:
3803 if (rec->need_recovery) {
3804 /* a previous recovery didn't finish */
3805 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3809 /* verify that all active nodes are in normal mode
3810 and not in recovery mode
3812 switch (verify_recmode(ctdb, nodemap)) {
3813 case MONITOR_RECOVERY_NEEDED:
3814 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3816 case MONITOR_FAILED:
3818 case MONITOR_ELECTION_NEEDED:
3819 /* can not happen */
3825 if (ctdb->tunable.verify_recovery_lock != 0) {
3826 /* we should have the reclock - check its not stale */
3827 ret = check_recovery_lock(ctdb);
3829 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3830 ctdb_set_culprit(rec, ctdb->pnn);
3831 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3837 /* if there are takeovers requested, perform it and notify the waiters */
3838 if (rec->takeover_runs_disable_ctx == NULL &&
3839 rec->reallocate_requests) {
3840 process_ipreallocate_requests(ctdb, rec);
3843 /* get the nodemap for all active remote nodes
3845 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3846 if (remote_nodemaps == NULL) {
3847 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3850 for(i=0; i<nodemap->num; i++) {
3851 remote_nodemaps[i] = NULL;
3853 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3854 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3858 /* verify that all other nodes have the same nodemap as we have
3860 for (j=0; j<nodemap->num; j++) {
3861 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3865 if (remote_nodemaps[j] == NULL) {
3866 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3867 ctdb_set_culprit(rec, j);
3872 /* if the nodes disagree on how many nodes there are
3873 then this is a good reason to try recovery
3875 if (remote_nodemaps[j]->num != nodemap->num) {
3876 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3877 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3878 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3879 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3883 /* if the nodes disagree on which nodes exist and are
3884 active, then that is also a good reason to do recovery
3886 for (i=0;i<nodemap->num;i++) {
3887 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3888 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3889 nodemap->nodes[j].pnn, i,
3890 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3891 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3892 do_recovery(rec, mem_ctx, pnn, nodemap,
3900 * Update node flags obtained from each active node. This ensure we have
3901 * up-to-date information for all the nodes.
3903 for (j=0; j<nodemap->num; j++) {
3904 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3907 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3910 for (j=0; j<nodemap->num; j++) {
3911 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3915 /* verify the flags are consistent
3917 for (i=0; i<nodemap->num; i++) {
3918 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3922 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3923 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3924 nodemap->nodes[j].pnn,
3925 nodemap->nodes[i].pnn,
3926 remote_nodemaps[j]->nodes[i].flags,
3927 nodemap->nodes[i].flags));
3929 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3930 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3931 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3932 do_recovery(rec, mem_ctx, pnn, nodemap,
3936 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3937 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3938 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3939 do_recovery(rec, mem_ctx, pnn, nodemap,
3948 /* There must be the same number of lmasters in the vnn map as
3949 * there are active nodes with the lmaster capability... or
3952 if (vnnmap->size != rec->num_lmasters) {
3953 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3954 vnnmap->size, rec->num_lmasters));
3955 ctdb_set_culprit(rec, ctdb->pnn);
3956 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3960 /* verify that all active nodes in the nodemap also exist in
3963 for (j=0; j<nodemap->num; j++) {
3964 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3967 if (nodemap->nodes[j].pnn == pnn) {
3971 for (i=0; i<vnnmap->size; i++) {
3972 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3976 if (i == vnnmap->size) {
3977 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3978 nodemap->nodes[j].pnn));
3979 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3980 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3986 /* verify that all other nodes have the same vnnmap
3987 and are from the same generation
3989 for (j=0; j<nodemap->num; j++) {
3990 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3993 if (nodemap->nodes[j].pnn == pnn) {
3997 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3998 mem_ctx, &remote_vnnmap);
4000 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
4001 nodemap->nodes[j].pnn));
4005 /* verify the vnnmap generation is the same */
4006 if (vnnmap->generation != remote_vnnmap->generation) {
4007 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
4008 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
4009 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4010 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4014 /* verify the vnnmap size is the same */
4015 if (vnnmap->size != remote_vnnmap->size) {
4016 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
4017 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
4018 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4019 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4023 /* verify the vnnmap is the same */
4024 for (i=0;i<vnnmap->size;i++) {
4025 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
4026 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
4027 nodemap->nodes[j].pnn));
4028 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4029 do_recovery(rec, mem_ctx, pnn, nodemap,
4036 /* we might need to change who has what IP assigned */
4037 if (rec->need_takeover_run) {
4038 uint32_t culprit = (uint32_t)-1;
4040 rec->need_takeover_run = false;
4042 /* update the list of public ips that a node can handle for
4045 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
4047 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
4049 rec->need_takeover_run = true;
4053 /* execute the "startrecovery" event script on all nodes */
4054 ret = run_startrecovery_eventscript(rec, nodemap);
4056 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
4057 ctdb_set_culprit(rec, ctdb->pnn);
4058 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4062 /* If takeover run fails, then the offending nodes are
4063 * assigned ban culprit counts. And we re-try takeover.
4064 * If takeover run fails repeatedly, the node would get
4067 * If rec->need_takeover_run is not set to true at this
4068 * failure, monitoring is disabled cluster-wide (via
4069 * startrecovery eventscript) and will not get enabled.
4071 if (!do_takeover_run(rec, nodemap, true)) {
4075 /* execute the "recovered" event script on all nodes */
4076 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
4078 // we cant check whether the event completed successfully
4079 // since this script WILL fail if the node is in recovery mode
4080 // and if that race happens, the code here would just cause a second
4081 // cascading recovery.
4083 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
4084 ctdb_set_culprit(rec, ctdb->pnn);
4085 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4092 the main monitoring loop
4094 static void monitor_cluster(struct ctdb_context *ctdb)
4096 struct ctdb_recoverd *rec;
4098 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
4100 rec = talloc_zero(ctdb, struct ctdb_recoverd);
4101 CTDB_NO_MEMORY_FATAL(ctdb, rec);
4105 rec->takeover_run_in_progress = false;
4107 rec->priority_time = timeval_current();
4109 /* register a message port for sending memory dumps */
4110 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
4112 /* register a message port for requesting logs */
4113 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_GETLOG, getlog_handler, rec);
4115 /* register a message port for clearing logs */
4116 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_CLEARLOG, clearlog_handler, rec);
4118 /* register a message port for recovery elections */
4119 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
4121 /* when nodes are disabled/enabled */
4122 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
4124 /* when we are asked to puch out a flag change */
4125 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
4127 /* register a message port for vacuum fetch */
4128 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
4130 /* register a message port for reloadnodes */
4131 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
4133 /* register a message port for performing a takeover run */
4134 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
4136 /* register a message port for disabling the ip check for a short while */
4137 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
4139 /* register a message port for updating the recovery daemons node assignment for an ip */
4140 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
4142 /* register a message port for forcing a rebalance of a node next
4144 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
4146 /* Register a message port for disabling takeover runs */
4147 ctdb_client_set_message_handler(ctdb,
4148 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
4149 disable_takeover_runs_handler, rec);
4152 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4153 struct timeval start;
4157 DEBUG(DEBUG_CRIT,(__location__
4158 " Failed to create temp context\n"));
4162 start = timeval_current();
4163 main_loop(ctdb, rec, mem_ctx);
4164 talloc_free(mem_ctx);
4166 /* we only check for recovery once every second */
4167 elapsed = timeval_elapsed(&start);
4168 if (elapsed < ctdb->tunable.recover_interval) {
4169 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
/*
 * Fd-event handler fired when the pipe to the main ctdbd process
 * becomes readable/closed, i.e. the parent daemon has died.
 * Only the alert log line is visible here; the elided remainder
 * presumably exits the recovery daemon — TODO confirm.
 */
4176 event handler for when the main ctdbd dies
4178 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4179 uint16_t flags, void *private_data)
4181 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
/*
 * Timed event (re-armed every 30 seconds, see bottom) that verifies
 * the recovery daemon child is still alive by sending signal 0.
 * If the process is gone, schedule an immediate ctdb_restart_recd()
 * on the main event context instead of re-arming this check
 * (an early return between the two event_add_timed calls is elided).
 */
4186 called regularly to verify that the recovery daemon is still running
4188 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4189 struct timeval yt, void *p)
4191 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* ctdb_kill(..., 0) probes for existence; non-zero means the
 * recoverd process no longer exists. */
4193 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4194 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
/* timeval_zero(): restart as soon as the event loop runs. */
4196 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4197 ctdb_restart_recd, ctdb);
/* Recoverd still alive: re-arm this check in 30 seconds, parented
 * on recd_ctx so it is cancelled when the recoverd is stopped. */
4202 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4203 timeval_current_ofs(30, 0),
4204 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap exited children
 * non-blockingly with waitpid(-1, ..., WNOHANG).  waitpid errors
 * other than ECHILD (no children left) are logged; each reaped pid
 * is logged at DEBUG level.  The surrounding loop is elided.
 */
4207 static void recd_sig_child_handler(struct event_context *ev,
4208 struct signal_event *se, int signum, int count,
/* private_data would be the ctdb_context but is currently unused. */
4212 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4217 pid = waitpid(-1, &status, WNOHANG);
4219 if (errno != ECHILD) {
4220 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4225 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
/*
 * Fork the recovery daemon as a child of the main ctdb daemon.
 *
 * Parent path: record the child pid, (re)create recd_ctx and arm the
 * 30-second ctdb_check_recd watchdog, then return (return elided).
 * Child path: reseed the PRNG, clear the log ringbuffer, switch from
 * server to client mode, watch the pipe to detect parent death,
 * install a SIGCHLD handler, and run monitor_cluster() forever.
 * Returns 0 on success in the parent; error paths are elided.
 */
4231 startup the recovery daemon as a child of the main ctdb daemon
4233 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4236 struct signal_event *se;
4237 struct tevent_fd *fde;
/* The pipe's read end lets the child detect when the parent dies
 * (the write end is held open only by the parent). */
4239 if (pipe(fd) != 0) {
4243 ctdb->ctdbd_pid = getpid();
/* Fork without freeing the log ringbuffer so the child can still
 * serve CTDB_SRVID_GETLOG requests for pre-fork messages. */
4245 ctdb->recoverd_pid = ctdb_fork_no_free_ringbuffer(ctdb);
4246 if (ctdb->recoverd_pid == -1) {
/* ---- parent process ---- */
4250 if (ctdb->recoverd_pid != 0) {
/* Fresh context for the watchdog timer; freeing the old one
 * cancels any timer from a previous recoverd incarnation. */
4251 talloc_free(ctdb->recd_ctx);
4252 ctdb->recd_ctx = talloc_new(ctdb);
4253 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4256 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4257 timeval_current_ofs(30, 0),
4258 ctdb_check_recd, ctdb);
/* ---- child process (the recovery daemon) ---- */
/* Reseed so the child's random numbers differ from the parent's. */
4264 srandom(getpid() ^ time(NULL));
4266 /* Clear the log ringbuffer */
4267 ctdb_clear_log(ctdb);
/* NOTE(review): "ctdb_recovered" looks like a typo for
 * "ctdb_recoverd" — confirm against upstream before changing. */
4269 ctdb_set_process_name("ctdb_recovered");
4270 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4271 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4275 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* When the parent dies the pipe read end becomes readable (EOF)
 * and ctdb_recoverd_parent fires. */
4277 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4278 ctdb_recoverd_parent, &fd[0]);
4279 tevent_fd_set_auto_close(fde);
4281 /* set up a handler to pick up sigchld */
4282 se = event_add_signal(ctdb->ev, ctdb,
4284 recd_sig_child_handler,
4287 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* Never returns in normal operation. */
4291 monitor_cluster(ctdb);
4293 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
/*
 * Stop the recovery daemon child: no-op if it was never started
 * (pid 0), otherwise SIGTERM it and tear down the watchdog timer
 * context and ping counter.  TALLOC_FREE also NULLs the pointers.
 */
4298 shutdown the recovery daemon
4300 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
/* recoverd_pid == 0 means no recovery daemon is running. */
4302 if (ctdb->recoverd_pid == 0) {
4306 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4307 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
/* Freeing recd_ctx cancels the pending ctdb_check_recd timer. */
4309 TALLOC_FREE(ctdb->recd_ctx);
4310 TALLOC_FREE(ctdb->recd_ping_count);
/*
 * Timed-event callback (scheduled by ctdb_check_recd when the
 * recovery daemon has died): restart the recoverd by stopping any
 * remnant and starting a fresh child.
 */
4313 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4314 struct timeval t, void *private_data)
4316 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4318 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4319 ctdb_stop_recoverd(ctdb);
4320 ctdb_start_recoverd(ctdb);