4 Copyright (C) Andrew Tridgell 2007
5 Copyright (C) Ronnie Sahlberg 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/tdb/include/tdb.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/filesys.h"
25 #include "system/wait.h"
26 #include "../include/ctdb_private.h"
27 #include "lib/util/dlinklist.h"
/*
  lock all databases - mark only

  Flag the tdb locks of every database with the given priority as already
  held in this process.  This is only legal while the databases are frozen
  (the freeze child holds the real locks).  Returns 0 on success.
 */
static int ctdb_lock_all_databases_mark(struct ctdb_context *ctdb, uint32_t priority)
        struct ctdb_db_context *ctdb_db;

        /* these are internal tdb functions - prototyped locally because
           tdb does not export them in its public header */
        int tdb_transaction_write_lock_mark(struct tdb_context *tdb);
        int tdb_transaction_write_lock_unmark(struct tdb_context *tdb);

        /* valid priorities are 1..NUM_DB_PRIORITIES inclusive */
        if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
                DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));

        /* marking locks as held is only safe while frozen */
        if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
                DEBUG(DEBUG_ERR,("Attempt to mark all databases locked when not frozen\n"));

        /* The dual loop is a workaround for older versions of samba
           that do not yet support the set-db-priority/lock order
           call, so that we get basic deadlock avoidance also for
           these old versions of samba.
           This code will be removed in the future.
           The first pass tests db_name for "notify", the second for its
           absence, so the two passes partition the databases by name.
         */
        for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
                if (ctdb_db->priority != priority) {
                if (strstr(ctdb_db->db_name, "notify") != NULL) {
                if (tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb) != 0) {
                if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
                        /* undo the transaction mark taken just above */
                        tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb);
        for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
                if (ctdb_db->priority != priority) {
                if (strstr(ctdb_db->db_name, "notify") == NULL) {
                if (tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb) != 0) {
                if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
                        tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb);
/*
  lock all databases - unmark only

  Reverse of ctdb_lock_all_databases_mark(): clear the "lock held" flags on
  every database with the given priority.  Returns 0 on success.
 */
static int ctdb_lock_all_databases_unmark(struct ctdb_context *ctdb, uint32_t priority)
        struct ctdb_db_context *ctdb_db;

        /* this is an internal tdb function - prototyped locally because
           tdb does not export it in its public header */
        int tdb_transaction_write_lock_unmark(struct tdb_context *tdb);

        /* valid priorities are 1..NUM_DB_PRIORITIES inclusive.
           NOTE(review): the message below says "mark" but this is the
           unmark path - looks copy-pasted; string left untouched. */
        if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
                DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));

        if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
                DEBUG(DEBUG_ERR,("Attempt to unmark all databases locked when not frozen\n"));

        for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
                if (ctdb_db->priority != priority) {
                tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb);
                if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) {
/*
  control handler: return the current vnn map (generation plus the array of
  pnns) marshalled as a struct ctdb_vnn_map_wire in *outdata.
  The wire blob is talloc'd off outdata, so the caller owns it.
 */
ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
        CHECK_CONTROL_DATA_SIZE(0);
        struct ctdb_vnn_map_wire *map;

        /* header plus one uint32_t per vnn map slot */
        len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
        map = talloc_size(outdata, len);
        CTDB_NO_MEMORY(ctdb, map);

        map->generation = ctdb->vnn_map->generation;
        map->size = ctdb->vnn_map->size;
        memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);

        outdata->dsize = len;
        outdata->dptr = (uint8_t *)map;
/*
  control handler: replace the node's vnn map with the one supplied in
  indata (a struct ctdb_vnn_map_wire).  Only permitted while every database
  priority level is frozen, since changing the map mid-operation would
  corrupt record routing.
 */
ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
        struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;

        /* refuse unless ALL priorities are frozen */
        for(i=1; i<=NUM_DB_PRIORITIES; i++) {
                if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
                        DEBUG(DEBUG_ERR,("Attempt to set vnnmap when not frozen\n"));

        /* drop the old map and rebuild from the wire representation */
        talloc_free(ctdb->vnn_map);

        ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
        CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);

        ctdb->vnn_map->generation = map->generation;
        ctdb->vnn_map->size = map->size;
        ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size);
        CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);

        memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size);
/*
  control handler: return the map of attached databases (id plus
  persistent/readonly/sticky flags) as a struct ctdb_dbid_map in *outdata.
 */
ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
        struct ctdb_db_context *ctdb_db;
        struct ctdb_dbid_map *dbid_map;

        CHECK_CONTROL_DATA_SIZE(0);

        /* first pass: count the attached databases */
        for(ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next){

        outdata->dsize = offsetof(struct ctdb_dbid_map, dbs) + sizeof(dbid_map->dbs[0])*len;
        outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
        if (!outdata->dptr) {
                DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));

        dbid_map = (struct ctdb_dbid_map *)outdata->dptr;

        /* second pass: fill in one entry per database; flags start at 0
           because the buffer was talloc_zero'd */
        for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){
                dbid_map->dbs[i].dbid = ctdb_db->db_id;
                if (ctdb_db->persistent != 0) {
                        dbid_map->dbs[i].flags |= CTDB_DB_FLAGS_PERSISTENT;
                if (ctdb_db->readonly != 0) {
                        dbid_map->dbs[i].flags |= CTDB_DB_FLAGS_READONLY;
                if (ctdb_db->sticky != 0) {
                        dbid_map->dbs[i].flags |= CTDB_DB_FLAGS_STICKY;
/*
  control handler: return the node map (pnn, flags and parsed address of
  every configured node) as a struct ctdb_node_map in *outdata.
 */
ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
        uint32_t i, num_nodes;
        struct ctdb_node_map *node_map;

        CHECK_CONTROL_DATA_SIZE(0);

        num_nodes = ctdb->num_nodes;

        outdata->dsize = offsetof(struct ctdb_node_map, nodes) + num_nodes*sizeof(struct ctdb_node_and_flags);
        outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
        if (!outdata->dptr) {
                DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));

        node_map = (struct ctdb_node_map *)outdata->dptr;
        node_map->num = num_nodes;
        for (i=0; i<num_nodes; i++) {
                /* parse_ip() returns 0 on failure */
                if (parse_ip(ctdb->nodes[i]->address.address,
                             NULL, /* TODO: pass in the correct interface here*/
                             &node_map->nodes[i].addr) == 0)
                        DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));

                node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
                node_map->nodes[i].flags = ctdb->nodes[i]->flags;
/*
  get an old style ipv4-only nodemap

  Legacy variant of ctdb_control_getnodemap() kept for compatibility with
  older clients: addresses are returned as sockaddr_in only.
 */
ctdb_control_getnodemapv4(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
        uint32_t i, num_nodes;
        struct ctdb_node_mapv4 *node_map;

        CHECK_CONTROL_DATA_SIZE(0);

        num_nodes = ctdb->num_nodes;

        outdata->dsize = offsetof(struct ctdb_node_mapv4, nodes) + num_nodes*sizeof(struct ctdb_node_and_flagsv4);
        outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
        if (!outdata->dptr) {
                DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));

        node_map = (struct ctdb_node_mapv4 *)outdata->dptr;
        node_map->num = num_nodes;
        for (i=0; i<num_nodes; i++) {
                /* parse_ipv4() returns 0 on failure */
                if (parse_ipv4(ctdb->nodes[i]->address.address, 0, &node_map->nodes[i].sin) == 0) {
                        DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));

                node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
                node_map->nodes[i].flags = ctdb->nodes[i]->flags;
/*
  timed-event callback: re-read the nodes file and reconcile the in-memory
  node list with it.  Nodes whose address is unchanged keep their existing
  connection; new or changed nodes are added and connected.  Finally the
  recovery daemon is told to reload its copy of the nodes file too.
 */
ctdb_reload_nodes_event(struct event_context *ev, struct timed_event *te,
                        struct timeval t, void *private_data)
        struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
        struct ctdb_node **nodes;

        tmp_ctx = talloc_new(ctdb);

        /* steal the old nodes file for a while */
        talloc_steal(tmp_ctx, ctdb->nodes);
        num_nodes = ctdb->num_nodes;

        /* load the new nodes file */
        ctdb_load_nodes_file(ctdb);

        for (i=0; i<ctdb->num_nodes; i++) {
                /* keep any identical pre-existing nodes and connections */
                if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
                        /* swap the freshly loaded entry for the old one so
                           the established transport state is preserved */
                        talloc_free(ctdb->nodes[i]);
                        ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);

                /* deleted nodes get no transport set up */
                if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {

                /* any new or different nodes must be added */
                if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
                        DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
                        ctdb_fatal(ctdb, "failed to add node. shutting down\n");
                if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
                        DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
                        ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");

        /* tell the recovery daemon to reload the nodes file too */
        ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);

        talloc_free(tmp_ctx);
/*
  reload the nodes file after a short delay (so that we can send the response
  back to the caller first) - the actual work is done in
  ctdb_reload_nodes_event() one second from now.
 */
ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
        event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1,0), ctdb_reload_nodes_event, ctdb);
344 a traverse function for pulling all relevant records from pulldb
347 struct ctdb_context *ctdb;
348 struct ctdb_marshall_buffer *pulldata;
/*
  tdb traverse callback for PULL_DB: marshall each record into the growing
  params->pulldata blob.  Sets params->failed on marshalling errors and is
  fatal on out-of-memory (we cannot recover without this data).
 */
static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
        struct pulldb_data *params = (struct pulldb_data *)p;
        struct ctdb_rec_data *rec;

        /* add the record to the blob */
        rec = ctdb_marshall_record(params->pulldata, 0, key, NULL, data);
                params->failed = true;

        /* grow the blob to make room for this record.
           NOTE(review): realloc context is NULL - presumably the blob keeps
           its original talloc parent; confirm against talloc semantics. */
        params->pulldata = talloc_realloc_size(NULL, params->pulldata, rec->length + params->len);
        if (params->pulldata == NULL) {
                DEBUG(DEBUG_CRIT,(__location__ " Failed to expand pulldb_data to %u\n", rec->length + params->len));
                ctdb_fatal(params->ctdb, "failed to allocate memory for recovery. shutting down\n");
        params->pulldata->count++;
        /* append the marshalled record at the current end of the blob */
        memcpy(params->len+(uint8_t *)params->pulldata, rec, rec->length);
        params->len += rec->length;
/*
  pull a bunch of records from a ltdb, filtering by lmaster

  Control handler for CTDB_CONTROL_PULL_DB: traverse the requested database
  (which must be frozen) and return all its records marshalled into a
  struct ctdb_marshall_buffer in *outdata.
 */
int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
        struct ctdb_control_pulldb *pull;
        struct ctdb_db_context *ctdb_db;
        struct pulldb_data params;
        struct ctdb_marshall_buffer *reply;

        pull = (struct ctdb_control_pulldb *)indata.dptr;

        ctdb_db = find_ctdb_db(ctdb, pull->db_id);
                DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", pull->db_id));

        /* pulling is only consistent while the db priority is frozen */
        if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
                DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_pull_db when not frozen\n"));

        reply = talloc_zero(outdata, struct ctdb_marshall_buffer);
        CTDB_NO_MEMORY(ctdb, reply);

        reply->db_id = pull->db_id;

        /* params accumulates the marshalled blob during the traverse */
        params.pulldata = reply;
        params.len = offsetof(struct ctdb_marshall_buffer, data);
        params.failed = false;

        if (ctdb_db->unhealthy_reason) {
                /* this is just a warning, as the tdb should be empty anyway */
                DEBUG(DEBUG_WARNING,("db(%s) unhealty in ctdb_control_pull_db: %s\n",
                                     ctdb_db->db_name, ctdb_db->unhealthy_reason));

        if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));

        if (tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_pulldb, &params) == -1) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to get traverse db '%s'\n", ctdb_db->db_name));
                ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
                talloc_free(params.pulldata);

        ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);

        outdata->dptr = (uint8_t *)params.pulldata;
        outdata->dsize = params.len;
/*
  push a bunch of records into a ltdb, filtering by rsn

  Control handler for CTDB_CONTROL_PUSH_DB: store every record from the
  marshalled buffer in indata into the (frozen) local tdb.  Read-only
  delegation flags are stripped since a recovery implicitly revokes all
  delegations; if the database supports read-only delegations its tracking
  db is wiped afterwards.
 */
int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
        struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
        struct ctdb_db_context *ctdb_db;
        struct ctdb_rec_data *rec;

        /* indata must at least hold the marshall buffer header */
        if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
                DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));

        ctdb_db = find_ctdb_db(ctdb, reply->db_id);
                DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));

        if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
                DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_push_db when not frozen\n"));

        if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));

        rec = (struct ctdb_rec_data *)&reply->data[0];

        DEBUG(DEBUG_INFO,("starting push of %u records for dbid 0x%x\n",
                          reply->count, reply->db_id));

        for (i=0;i<reply->count;i++) {
                struct ctdb_ltdb_header *hdr;

                /* each marshalled record is key bytes followed by data */
                key.dptr = &rec->data[0];
                key.dsize = rec->keylen;
                data.dptr = &rec->data[key.dsize];
                data.dsize = rec->datalen;

                /* the data must begin with a ltdb header */
                if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
                        DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
                hdr = (struct ctdb_ltdb_header *)data.dptr;
                /* strip off any read only record flags. All readonly records
                   are revoked implicitely by a recovery
                 */
                hdr->flags &= ~(CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE);

                /* skip past the header - ctdb_ltdb_store takes hdr and the
                   payload separately */
                data.dptr += sizeof(*hdr);
                data.dsize -= sizeof(*hdr);

                ret = ctdb_ltdb_store(ctdb_db, key, hdr, data);
                        DEBUG(DEBUG_CRIT, (__location__ " Unable to store record\n"));

                /* advance to the next marshalled record */
                rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);

        DEBUG(DEBUG_DEBUG,("finished push of %u records for dbid 0x%x\n",
                           reply->count, reply->db_id));

        if (ctdb_db->readonly) {
                DEBUG(DEBUG_CRIT,("Clearing the tracking database for dbid 0x%x\n",
                /* a failed wipe disables read-only delegation support
                   entirely for this database */
                if (tdb_wipe_all(ctdb_db->rottdb) != 0) {
                        DEBUG(DEBUG_ERR,("Failed to wipe tracking database for 0x%x. Dropping read-only delegation support\n", ctdb_db->db_id));
                        ctdb_db->readonly = false;
                        tdb_close(ctdb_db->rottdb);
                        ctdb_db->rottdb = NULL;
                        ctdb_db->readonly = false;
                /* abort any in-flight revoke children; freeing triggers
                   their destructors */
                while (ctdb_db->revokechild_active != NULL) {
                        talloc_free(ctdb_db->revokechild_active);

        ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);

        ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
/*
  tdb traverse callback: rewrite the dmaster field in every record's ltdb
  header to the pnn passed in via p, skipping records that already have it.
 */
static int traverse_setdmaster(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
        uint32_t *dmaster = (uint32_t *)p;
        struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)data.dptr;

        /* skip if already correct */
        if (header->dmaster == *dmaster) {

        header->dmaster = *dmaster;

        /* data.dptr still points at the tdb record image, so storing it
           back persists the modified header */
        ret = tdb_store(tdb, key, data, TDB_REPLACE);
                DEBUG(DEBUG_CRIT,(__location__ " failed to write tdb data back ret:%d\n",ret));

        /* TODO: add error checking here */
/*
  control handler: set the dmaster of every record in the given (frozen)
  database to the pnn supplied in indata, via traverse_setdmaster().
 */
int32_t ctdb_control_set_dmaster(struct ctdb_context *ctdb, TDB_DATA indata)
        struct ctdb_control_set_dmaster *p = (struct ctdb_control_set_dmaster *)indata.dptr;
        struct ctdb_db_context *ctdb_db;

        ctdb_db = find_ctdb_db(ctdb, p->db_id);
                DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", p->db_id));

        /* only safe while frozen - a live db could race with clients */
        if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
                DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_set_dmaster when not frozen\n"));

        if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));

        tdb_traverse(ctdb_db->ltdb->tdb, traverse_setdmaster, &p->dmaster);

        ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
/* state carried across the async set-recmode operation (parent side) */
struct ctdb_set_recmode_state {
        struct ctdb_context *ctdb;
        struct ctdb_req_control *c;     /* deferred control to reply to */
        struct timed_event *te;         /* timeout for the child process */
        struct fd_event *fde;           /* watches the pipe from the child */
        struct timeval start_time;      /* for reclock latency accounting */
/*
  called if our set_recmode child times out. this would happen if
  ctdb_recovery_lock() would block.
 */
static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_event *te,
                                     struct timeval t, void *private_data)
        struct ctdb_set_recmode_state *state = talloc_get_type(private_data,
                                                               struct ctdb_set_recmode_state);

        /* we consider this a success, not a failure, as we failed to
           set the recovery lock which is what we wanted. This can be
           caused by the cluster filesystem being very slow to
           arbitrate locks immediately after a node failure.
         */
        DEBUG(DEBUG_ERR,(__location__ " set_recmode child process hung/timedout CFS slow to grant locks? (allowing recmode set anyway)\n"));
        state->ctdb->recovery_mode = state->recmode;
        ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
/* when we free the recmode state we must kill any child process,
   close both pipe ends, and record the reclock latency. */
static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
        double l = timeval_elapsed(&state->start_time);

        CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock", reclock.ctdbd, l);

        /* close whichever pipe fds are still open */
        if (state->fd[0] != -1) {
        if (state->fd[1] != -1) {
        ctdb_kill(state->ctdb, state->child, SIGKILL);
/* this is called when the client process has completed ctdb_recovery_lock()
   and has written data back to us through the pipe.
 */
static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
                                uint16_t flags, void *private_data)
        struct ctdb_set_recmode_state *state= talloc_get_type(private_data,
                                                              struct ctdb_set_recmode_state);

        /* we got a response from our child process so we can abort the
           timeout event */
        talloc_free(state->te);

        /* read the child's status when trying to lock the reclock file.
           child wrote 0 if everything is fine and 1 if it did manage
           to lock the file, which would be a problem since that means
           we got a request to exit from recovery but we could still lock
           the file which at this time SHOULD be locked by the recovery
           daemon on the recmaster
         */
        ret = read(state->fd[0], &c, 1);
        if (ret != 1 || c != 0) {
                ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "managed to lock reclock file from inside daemon");

        /* the sanity check passed - actually switch recovery mode */
        state->ctdb->recovery_mode = state->recmode;

        /* release any deferred attach calls from clients */
        if (state->recmode == CTDB_RECOVERY_NORMAL) {
                ctdb_process_deferred_attach(state->ctdb);

        ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
/*
  timed-event callback: fired when the node has stayed in recovery mode
  longer than the recovery_drop_all_ips tunable allows - release every
  public IP address we hold.
 */
ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te,
                        struct timeval t, void *private_data)
        struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);

        DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
        /* freeing the context also removes this (now-fired) event */
        talloc_free(ctdb->release_ips_ctx);
        ctdb->release_ips_ctx = NULL;

        ctdb_release_all_ips(ctdb);
/*
 * Set up an event to drop all public ips if we remain in recovery for too
 * long.  Any previously scheduled drop event is cancelled first, so calling
 * this repeatedly just restarts the timer.
 */
int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb)
        if (ctdb->release_ips_ctx != NULL) {
                talloc_free(ctdb->release_ips_ctx);
        ctdb->release_ips_ctx = talloc_new(ctdb);
        CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);

        /* timer is parented to release_ips_ctx so freeing that context
           cancels the pending event */
        event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb);
/*
  set the recovery mode

  Control handler for CTDB_CONTROL_SET_RECMODE.  Leaving recovery is
  verified by forking a child that checks the recovery lock file is still
  held by the recovery master; the reply is then sent asynchronously from
  set_recmode_handler()/ctdb_set_recmode_timeout().
 */
int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
                                 struct ctdb_req_control *c,
                                 TDB_DATA indata, bool *async_reply,
                                 const char **errormsg)
        uint32_t recmode = *(uint32_t *)indata.dptr;
        struct ctdb_set_recmode_state *state;
        pid_t parent = getpid();

        /* if we enter recovery but stay in recovery for too long
           we will eventually drop all our ip addresses
         */
        if (recmode == CTDB_RECOVERY_NORMAL) {
                talloc_free(ctdb->release_ips_ctx);
                ctdb->release_ips_ctx = NULL;
                if (ctdb_deferred_drop_all_ips(ctdb) != 0) {
                        DEBUG(DEBUG_ERR,("Failed to set up deferred drop all ips\n"));

        if (recmode != ctdb->recovery_mode) {
                DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n",
                                    recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE"));

        /* fast path: anything except the ACTIVE->NORMAL transition can be
           applied immediately without the reclock sanity check */
        if (recmode != CTDB_RECOVERY_NORMAL ||
            ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
                ctdb->recovery_mode = recmode;

        /* some special handling when ending recovery mode */

        /* force the databases to thaw */
        for (i=1; i<=NUM_DB_PRIORITIES; i++) {
                if (ctdb->freeze_handles[i] != NULL) {
                        ctdb_control_thaw(ctdb, i);

        state = talloc(ctdb, struct ctdb_set_recmode_state);
        CTDB_NO_MEMORY(ctdb, state);

        state->start_time = timeval_current();

        /* release any deferred attach calls from clients */
        if (recmode == CTDB_RECOVERY_NORMAL) {
                ctdb_process_deferred_attach(ctdb);

        if (ctdb->tunable.verify_recovery_lock == 0) {
                /* dont need to verify the reclock file */
                ctdb->recovery_mode = recmode;

        /* For the rest of what needs to be done, we need to do this in
           a child process since
           1, the call to ctdb_recovery_lock() can block if the cluster
           filesystem is in the process of recovery.
         */
        ret = pipe(state->fd);
                DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for set_recmode child\n"));

        state->child = ctdb_fork(ctdb);
        if (state->child == (pid_t)-1) {

        if (state->child == 0) {
                /* --- child process --- */
                debug_extra = talloc_asprintf(NULL, "set_recmode:");
                /* we should not be able to get the lock on the reclock file,
                   as it should be held by the recovery master
                 */
                if (ctdb_recovery_lock(ctdb, false)) {
                        DEBUG(DEBUG_CRIT,("ERROR: recovery lock file %s not locked when recovering!\n", ctdb->recovery_lock_file));

                /* NOTE(review): write() return values are ignored here -
                   best-effort signalling over the pipe to the parent */
                write(state->fd[1], &cc, 1);
                /* make sure we die when our parent dies */
                while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
                write(state->fd[1], &cc, 1);

        /* --- parent process --- */
        set_close_on_exec(state->fd[0]);

        talloc_set_destructor(state, set_recmode_destructor);

        DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for setrecmode\n", state->fd[0]));

        /* if the child hangs on the reclock, treat it as success after 5s */
        state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0),
                                    ctdb_set_recmode_timeout, state);

        state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
        if (state->fde == NULL) {
        tevent_fd_set_auto_close(state->fde);

        state->recmode = recmode;
        state->c = talloc_steal(state, c);
/*
  try and get the recovery lock in shared storage - should only work
  on the recovery master recovery daemon. Anywhere else is a bug

  Opens the recovery lock file and takes an fcntl F_WRLCK on it.
  If 'keep' is false the lock is released again before returning; the
  return value reports whether the lock could be taken.
 */
bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
        DEBUG(DEBUG_ERR, ("Take the recovery lock\n"));

        /* drop any previously held lock fd first */
        if (ctdb->recovery_lock_fd != -1) {
                close(ctdb->recovery_lock_fd);
                ctdb->recovery_lock_fd = -1;

        ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
        if (ctdb->recovery_lock_fd == -1) {
                DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Unable to open %s - (%s)\n",
                                 ctdb->recovery_lock_file, strerror(errno)));

        set_close_on_exec(ctdb->recovery_lock_fd);

        /* whole-file write lock */
        lock.l_type = F_WRLCK;
        lock.l_whence = SEEK_SET;

        if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
                /* somebody else (normally the recovery master) holds it */
                close(ctdb->recovery_lock_fd);
                ctdb->recovery_lock_fd = -1;
                DEBUG(DEBUG_CRIT,("ctdb_recovery_lock: Failed to get recovery lock on '%s'\n", ctdb->recovery_lock_file));

        /* !keep: release immediately - we only wanted to test the lock */
        close(ctdb->recovery_lock_fd);
        ctdb->recovery_lock_fd = -1;

        DEBUG(DEBUG_NOTICE, ("Recovery lock taken successfully\n"));

        DEBUG(DEBUG_NOTICE,("ctdb_recovery_lock: Got recovery lock on '%s'\n", ctdb->recovery_lock_file));
/*
  delete a record as part of the vacuum process
  only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
  use non-blocking locks

  return 0 if the record was successfully deleted (i.e. it does not exist
  when the function returns)
  or !0 is the record still exists in the tdb after returning.
 */
static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data *rec)
        struct ctdb_ltdb_header *hdr, *hdr2;

        /* these are really internal tdb functions - but we need them here for
           non-blocking lock of the freelist */
        int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
        int tdb_unlock(struct tdb_context *tdb, int list, int ltype);

        /* unpack key and data from the marshalled record */
        key.dsize = rec->keylen;
        key.dptr = &rec->data[0];
        data.dsize = rec->datalen;
        data.dptr = &rec->data[rec->keylen];

        /* never delete on the lmaster - it owns the authoritative copy */
        if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
                DEBUG(DEBUG_INFO,(__location__ " Called delete on record where we are lmaster\n"));

        /* the marshalled data must be exactly a ltdb header (rsn etc.) */
        if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
                DEBUG(DEBUG_ERR,(__location__ " Bad record size\n"));

        hdr = (struct ctdb_ltdb_header *)data.dptr;

        /* use a non-blocking lock */
        if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {

        data = tdb_fetch(ctdb_db->ltdb->tdb, key);
        if (data.dptr == NULL) {
                /* already gone - nothing to do */
                tdb_chainunlock(ctdb_db->ltdb->tdb, key);

        if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
                /* corrupt record: delete it outright if we can get the
                   freelist lock without blocking */
                if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
                        tdb_delete(ctdb_db->ltdb->tdb, key);
                        tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
                        DEBUG(DEBUG_CRIT,(__location__ " Deleted corrupt record\n"));
                tdb_chainunlock(ctdb_db->ltdb->tdb, key);

        hdr2 = (struct ctdb_ltdb_header *)data.dptr;

        /* the local copy is newer than the rsn we were asked to delete */
        if (hdr2->rsn > hdr->rsn) {
                tdb_chainunlock(ctdb_db->ltdb->tdb, key);
                DEBUG(DEBUG_INFO,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n",
                                  (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn));

        /* do not allow deleting record that have readonly flags set. */
        if (hdr->flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE)) {
                tdb_chainunlock(ctdb_db->ltdb->tdb, key);
                DEBUG(DEBUG_INFO,(__location__ " Skipping record with readonly flags set\n"));
        /* same test against the flags of the stored copy */
        if (hdr2->flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE)) {
                tdb_chainunlock(ctdb_db->ltdb->tdb, key);
                DEBUG(DEBUG_INFO,(__location__ " Skipping record with readonly flags set\n"));

        /* never delete while we are the dmaster */
        if (hdr2->dmaster == ctdb->pnn) {
                tdb_chainunlock(ctdb_db->ltdb->tdb, key);
                DEBUG(DEBUG_INFO,(__location__ " Attempted delete record where we are the dmaster\n"));

        /* take the tdb freelist lock (list -1) non-blocking too */
        if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
                tdb_chainunlock(ctdb_db->ltdb->tdb, key);

        if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
                tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
                tdb_chainunlock(ctdb_db->ltdb->tdb, key);
                DEBUG(DEBUG_INFO,(__location__ " Failed to delete record\n"));

        tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
        tdb_chainunlock(ctdb_db->ltdb->tdb, key);
1009 struct recovery_callback_state {
1010 struct ctdb_req_control *c;
/*
  called when the 'recovered' event script has finished

  Re-enables monitoring, bumps the recovery counter, bans ourselves if the
  script timed out, replies to the deferred control and records the time
  recovery finished.
 */
static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
        struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);

        ctdb_enable_monitoring(ctdb);
        CTDB_INCREMENT_STAT(ctdb, num_recoveries);

        DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
        /* -ETIME means the event script timed out - ban ourselves so the
           cluster can recover without us */
        if (status == -ETIME) {
                ctdb_ban_self(ctdb);

        ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);

        gettimeofday(&ctdb->last_recovery_finished, NULL);
/*
  recovery has finished

  Control handler: run the 'recovered' event script asynchronously; the
  reply is sent from ctdb_end_recovery_callback().
 */
int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
                                  struct ctdb_req_control *c,
        struct recovery_callback_state *state;

        DEBUG(DEBUG_NOTICE,("Recovery has finished\n"));

        /* flush any trans3 commits that were waiting for recovery */
        ctdb_persistent_finish_trans3_commits(ctdb);

        state = talloc(ctdb, struct recovery_callback_state);
        CTDB_NO_MEMORY(ctdb, state);

        /* keep monitoring quiet while the event script runs */
        ctdb_disable_monitoring(ctdb);

        ret = ctdb_event_script_callback(ctdb, state,
                                         ctdb_end_recovery_callback,
                                         CTDB_EVENT_RECOVERED, "%s", "");

        /* failure path: restore monitoring before bailing out */
        ctdb_enable_monitoring(ctdb);
        DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));

        /* tell the control that we will be reply asynchronously */
        state->c = talloc_steal(state, c);
        *async_reply = true;
/*
  called when the 'startrecovery' event script has finished -
  reply to the deferred control with the script's status.
 */
static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
        struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);

        DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));

        ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
/*
  run the startrecovery eventscript

  Control handler: record the recovery start time and run the
  'startrecovery' event script; the reply is sent asynchronously from
  ctdb_start_recovery_callback().
 */
int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
                                    struct ctdb_req_control *c,
        struct recovery_callback_state *state;

        DEBUG(DEBUG_NOTICE,(__location__ " startrecovery eventscript has been invoked\n"));
        gettimeofday(&ctdb->last_recovery_started, NULL);

        state = talloc(ctdb, struct recovery_callback_state);
        CTDB_NO_MEMORY(ctdb, state);

        state->c = talloc_steal(state, c);

        /* keep monitoring quiet while the event script runs */
        ctdb_disable_monitoring(ctdb);

        ret = ctdb_event_script_callback(ctdb, state,
                                         ctdb_start_recovery_callback,
                                         CTDB_EVENT_START_RECOVERY,
                DEBUG(DEBUG_ERR,(__location__ " Failed to start recovery\n"));

        /* tell the control that we will be reply asynchronously */
        *async_reply = true;
/*
  try to delete all these records as part of the vacuuming process
  and return the records we failed to delete

  indata holds a marshalled buffer of candidate records; each one is handed
  to delete_tdb_record(), and any record that could not be deleted is
  re-marshalled into *outdata so the lmaster knows not to purge it.
 */
int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
        struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
        struct ctdb_db_context *ctdb_db;
        struct ctdb_rec_data *rec;
        struct ctdb_marshall_buffer *records;

        /* indata must at least hold the marshall buffer header */
        if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
                DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));

        ctdb_db = find_ctdb_db(ctdb, reply->db_id);
                DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));

        DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
                           reply->count, reply->db_id));

        /* create a blob to send back the records we couldn't delete */
        records = (struct ctdb_marshall_buffer *)
                talloc_zero_size(outdata,
                                 offsetof(struct ctdb_marshall_buffer, data));
        if (records == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
        records->db_id = ctdb_db->db_id;

        rec = (struct ctdb_rec_data *)&reply->data[0];
        for (i=0;i<reply->count;i++) {
                /* unpack key and data from the marshalled record */
                key.dptr = &rec->data[0];
                key.dsize = rec->keylen;
                data.dptr = &rec->data[key.dsize];
                data.dsize = rec->datalen;

                if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
                        DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));

                /* If we can't delete the record we must add it to the reply
                   so the lmaster knows it may not purge this record
                 */
                if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
                        struct ctdb_ltdb_header *hdr;

                        hdr = (struct ctdb_ltdb_header *)data.dptr;
                        data.dptr += sizeof(*hdr);
                        data.dsize -= sizeof(*hdr);

                        DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));

                        /* grow the reply blob and append this record */
                        old_size = talloc_get_size(records);
                        records = talloc_realloc_size(outdata, records, old_size + rec->length);
                        if (records == NULL) {
                                DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
                        memcpy(old_size+(uint8_t *)records, rec, rec->length);

                /* advance to the next marshalled record */
                rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);

        outdata->dptr = (uint8_t *)records;
        outdata->dsize = talloc_get_size(records);
/*
  control handler: return this node's capability bitmap as a uint32_t
  talloc'd off outdata.
 */
int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
        uint32_t *capabilities = NULL;

        capabilities = talloc(outdata, uint32_t);
        CTDB_NO_MEMORY(ctdb, capabilities);
        *capabilities = ctdb->capabilities;

        outdata->dsize = sizeof(uint32_t);
        outdata->dptr = (uint8_t *)capabilities;
/*
  timed-event callback: fired when the recovery daemon has not pinged us
  within recd_ping_timeout seconds.  After recd_ping_failcount consecutive
  timeouts the recovery daemon is restarted.
 */
static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
        struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
        uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t);

        DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *count));

        /* not at the failure threshold yet - rearm the timer */
        if (*count < ctdb->tunable.recd_ping_failcount) {
                event_add_timed(ctdb->ev, ctdb->recd_ping_count,
                                timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
                                ctdb_recd_ping_timeout, ctdb);

        DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Restarting recovery daemon. (This can be caused if the cluster filesystem has hung)\n"));

        ctdb_stop_recoverd(ctdb);
        ctdb_start_recoverd(ctdb);
/* The recovery daemon will ping us at regular intervals.
   If we haven't been pinged for a while we assume the recovery
   daemon is inoperable and we restart it.
 */
int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
        /* reset the missed-ping counter; the timeout event is parented to
           it, so freeing also cancels the pending timeout */
        talloc_free(ctdb->recd_ping_count);

        ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
        CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);

        /* recd_ping_timeout == 0 disables the watchdog entirely */
        if (ctdb->tunable.recd_ping_timeout != 0) {
                event_add_timed(ctdb->ev, ctdb->recd_ping_count,
                                timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
                                ctdb_recd_ping_timeout, ctdb);
/*
  control handler: record the pnn of the new recovery master, supplied as a
  single uint32_t in indata.
 */
int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata)
        CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));

        ctdb->recovery_master = ((uint32_t *)(&indata.dptr[0]))[0];
/* holds the deferred STOP_NODE control while the 'stopped' script runs */
struct stop_node_callback_state {
        struct ctdb_req_control *c;     /* control to reply to on completion */
/*
  called when the 'stopped' event script has finished

  On failure the STOPPED flag is cleared again (the stop did not take
  effect) and a script timeout gets us banned; either way the deferred
  control is answered with the script's status.
 */
static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void *p)
        struct stop_node_callback_state *state = talloc_get_type(p, struct stop_node_callback_state);

        DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
        ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
        if (status == -ETIME) {
                ctdb_ban_self(ctdb);

        ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
/*
  control handler: stop this node.  Sets NODE_FLAGS_STOPPED and runs the
  'stopped' event script; the reply is sent asynchronously from
  ctdb_stop_node_callback().
 */
int32_t ctdb_control_stop_node(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
        struct stop_node_callback_state *state;

        DEBUG(DEBUG_INFO,(__location__ " Stopping node\n"));

        state = talloc(ctdb, struct stop_node_callback_state);
        CTDB_NO_MEMORY(ctdb, state);

        state->c = talloc_steal(state, c);

        /* keep monitoring quiet while the event script runs */
        ctdb_disable_monitoring(ctdb);

        ret = ctdb_event_script_callback(ctdb, state,
                                         ctdb_stop_node_callback,
                                         CTDB_EVENT_STOPPED, "%s", "");

        /* failure path: restore monitoring before bailing out */
        ctdb_enable_monitoring(ctdb);
        DEBUG(DEBUG_ERR,(__location__ " Failed to stop node\n"));

        /* mark ourselves stopped; cleared again by the callback on error */
        ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;

        *async_reply = true;
1342 int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
1344 DEBUG(DEBUG_INFO,(__location__ " Continue node\n"));
1345 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;