4 Copyright (C) Andrew Tridgell 2007
5 Copyright (C) Ronnie Sahlberg 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/tevent/tevent.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "lib/util/dlinklist.h"
32 lock all databases - mark only
/* Mark every tdb in the given priority group as "already locked" so that
   subsequent tdb calls in this process do not try to take real fcntl locks.
   Only legal while the priority group is frozen.
   NOTE(review): error-return statements and closing braces are elided in
   this excerpt; presumably each failed check returns non-zero. */
34 static int ctdb_lock_all_databases_mark(struct ctdb_context *ctdb, uint32_t priority)
36 struct ctdb_db_context *ctdb_db;
/* reject priorities outside the valid 1..NUM_DB_PRIORITIES range */
38 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
39 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
/* marking is only meaningful once the databases have been frozen */
43 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
44 DEBUG(DEBUG_ERR,("Attempt to mark all databases locked when not frozen\n"));
47 /* The dual loop is a workaround for older versions of samba
48 that does not yet support the set-db-priority/lock order
49 call. So that we get basic deadlock avoidance also for
50 these old versions of samba.
51 This code will be removed in the future.
/* first pass: presumably skips databases whose name contains "notify"
   (the elided branch body is likely a continue) and marks the rest */
53 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
54 if (ctdb_db->priority != priority) {
57 if (strstr(ctdb_db->db_name, "notify") != NULL) {
60 if (tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb) != 0) {
/* undo the transaction-lock mark if the allrecord mark fails */
63 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
64 tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb);
/* second pass: the complementary set - databases whose name does
   contain "notify" */
68 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
69 if (ctdb_db->priority != priority) {
72 if (strstr(ctdb_db->db_name, "notify") == NULL) {
75 if (tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb) != 0) {
78 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
79 tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb);
87 lock all databases - unmark only
/* Reverse of ctdb_lock_all_databases_mark(): clear the "already locked"
   marks on every tdb of the given priority group.  Only valid while the
   group is still frozen. */
89 static int ctdb_lock_all_databases_unmark(struct ctdb_context *ctdb, uint32_t priority)
91 struct ctdb_db_context *ctdb_db;
93 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
/* NOTE(review): the log text says "mark" although this is the unmark
   path - looks like a copy/paste slip in the message; it is a runtime
   string so it is left untouched here */
94 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
98 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
99 DEBUG(DEBUG_ERR,("Attempt to unmark all databases locked when not frozen\n"));
102 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
103 if (ctdb_db->priority != priority) {
/* transaction-lock mark is cleared first, then the allrecord mark */
106 tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb);
107 if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) {
/* control handler: return the current vnn map (generation plus pnn array)
   to the caller, marshalled as a ctdb_vnn_map_wire blob in outdata */
116 ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
118 CHECK_CONTROL_DATA_SIZE(0);
119 struct ctdb_vnn_map_wire *map;
/* wire format: fixed header followed by 'size' uint32_t pnn entries */
122 len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
123 map = talloc_size(outdata, len);
124 CTDB_NO_MEMORY(ctdb, map);
126 map->generation = ctdb->vnn_map->generation;
127 map->size = ctdb->vnn_map->size;
128 memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);
/* blob is talloc'd off outdata, so the caller owns its lifetime */
130 outdata->dsize = len;
131 outdata->dptr = (uint8_t *)map;
/* control handler: replace the local vnn map with the one supplied in
   indata.  Refused unless every database priority group is frozen. */
137 ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
139 struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;
/* all priority groups must be frozen before the map may change */
142 for(i=1; i<=NUM_DB_PRIORITIES; i++) {
143 if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
144 DEBUG(DEBUG_ERR,("Attempt to set vnnmap when not frozen\n"));
/* drop the old map and rebuild it from the wire blob */
149 talloc_free(ctdb->vnn_map);
151 ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
152 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);
154 ctdb->vnn_map->generation = map->generation;
155 ctdb->vnn_map->size = map->size;
156 ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size);
157 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);
159 memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size);
/* control handler: return the list of attached databases (db id plus
   persistent/readonly flags) as a ctdb_dbid_map blob in outdata */
165 ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
168 struct ctdb_db_context *ctdb_db;
169 struct ctdb_dbid_map *dbid_map;
171 CHECK_CONTROL_DATA_SIZE(0);
/* first walk counts the databases (the 'len' accumulation body is
   elided in this excerpt) */
174 for(ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next){
179 outdata->dsize = offsetof(struct ctdb_dbid_map, dbs) + sizeof(dbid_map->dbs[0])*len;
/* talloc_zero_size so all flags start cleared before |= below */
180 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
181 if (!outdata->dptr) {
182 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
186 dbid_map = (struct ctdb_dbid_map *)outdata->dptr;
/* second walk fills in one entry per attached database */
188 for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){
189 dbid_map->dbs[i].dbid = ctdb_db->db_id;
190 if (ctdb_db->persistent != 0) {
191 dbid_map->dbs[i].flags |= CTDB_DB_FLAGS_PERSISTENT;
193 if (ctdb_db->readonly != 0) {
194 dbid_map->dbs[i].flags |= CTDB_DB_FLAGS_READONLY;
/* control handler: return the node map - pnn, flags and the parsed socket
   address of every configured node - as a ctdb_node_map blob */
202 ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
204 uint32_t i, num_nodes;
205 struct ctdb_node_map *node_map;
207 CHECK_CONTROL_DATA_SIZE(0);
209 num_nodes = ctdb->num_nodes;
211 outdata->dsize = offsetof(struct ctdb_node_map, nodes) + num_nodes*sizeof(struct ctdb_node_and_flags);
212 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
213 if (!outdata->dptr) {
214 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
218 node_map = (struct ctdb_node_map *)outdata->dptr;
219 node_map->num = num_nodes;
220 for (i=0; i<num_nodes; i++) {
/* parse the textual node address into node_map->nodes[i].addr;
   parse_ip() returning 0 is treated as failure here */
221 if (parse_ip(ctdb->nodes[i]->address.address,
222 NULL, /* TODO: pass in the correct interface here*/
224 &node_map->nodes[i].addr) == 0)
226 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
229 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
230 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
237 get an old style ipv4-only nodemap
/* control handler: legacy variant of getnodemap that marshals addresses
   as sockaddr_in (IPv4 only) for old clients */
240 ctdb_control_getnodemapv4(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
242 uint32_t i, num_nodes;
243 struct ctdb_node_mapv4 *node_map;
245 CHECK_CONTROL_DATA_SIZE(0);
247 num_nodes = ctdb->num_nodes;
249 outdata->dsize = offsetof(struct ctdb_node_mapv4, nodes) + num_nodes*sizeof(struct ctdb_node_and_flagsv4);
250 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
251 if (!outdata->dptr) {
252 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
256 node_map = (struct ctdb_node_mapv4 *)outdata->dptr;
257 node_map->num = num_nodes;
258 for (i=0; i<num_nodes; i++) {
/* IPv4-only parse; non-IPv4 node addresses will fail here */
259 if (parse_ipv4(ctdb->nodes[i]->address.address, 0, &node_map->nodes[i].sin) == 0) {
260 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
264 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
265 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
/* timed-event callback: re-read the nodes file, keeping the existing node
   objects (and their connections) for entries whose address is unchanged,
   and adding/connecting any new or changed nodes.  Finally notifies the
   recovery daemon to reload its copy too. */
272 ctdb_reload_nodes_event(struct event_context *ev, struct timed_event *te,
273 struct timeval t, void *private_data)
276 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
278 struct ctdb_node **nodes;
280 tmp_ctx = talloc_new(ctdb);
282 /* steal the old nodes file for a while */
283 talloc_steal(tmp_ctx, ctdb->nodes);
/* remember the old node count so the comparison loop below can tell
   which indices existed before the reload */
286 num_nodes = ctdb->num_nodes;
289 /* load the new nodes file */
290 ctdb_load_nodes_file(ctdb);
292 for (i=0; i<ctdb->num_nodes; i++) {
293 /* keep any identical pre-existing nodes and connections */
294 if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
295 talloc_free(ctdb->nodes[i]);
296 ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);
/* deleted nodes are skipped - nothing to add or connect */
300 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
304 /* any new or different nodes must be added */
305 if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
306 DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
307 ctdb_fatal(ctdb, "failed to add node. shutting down\n");
309 if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
310 DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
311 ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
315 /* tell the recovery daemon to reload the nodes file too */
316 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);
/* frees the stolen old nodes array and everything hanging off it */
318 talloc_free(tmp_ctx);
323 reload the nodes file after a short delay (so that we can send the response
/* control handler: defer the actual reload by one second via a timed
   event, so the control reply goes out before the node list changes */
327 ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
329 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1,0), ctdb_reload_nodes_event, ctdb);
335 a traverse function for pulling all relevant records from pulldb
/* state shared with traverse_pulldb(): the growing marshalled record blob
   (the struct declaration line itself is elided in this excerpt) */
338 struct ctdb_context *ctdb;
339 struct ctdb_marshall_buffer *pulldata;
/* tdb traverse callback: marshall one record (key + data, no new header)
   and append it to the growing pulldata blob; allocation failure during
   recovery is fatal for the daemon */
344 static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
346 struct pulldb_data *params = (struct pulldb_data *)p;
347 struct ctdb_rec_data *rec;
349 /* add the record to the blob */
350 rec = ctdb_marshall_record(params->pulldata, 0, key, NULL, data);
352 params->failed = true;
/* grow the blob by rec->length bytes and append at the current offset */
355 params->pulldata = talloc_realloc_size(NULL, params->pulldata, rec->length + params->len);
356 if (params->pulldata == NULL) {
357 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand pulldb_data to %u\n", rec->length + params->len));
358 ctdb_fatal(params->ctdb, "failed to allocate memory for recovery. shutting down\n");
360 params->pulldata->count++;
361 memcpy(params->len+(uint8_t *)params->pulldata, rec, rec->length);
362 params->len += rec->length;
369 pull a bunch of records from a ltdb, filtering by lmaster
/* control handler: traverse the requested database while its priority
   group is frozen and return all records as one marshalled blob.
   Requires the group to be frozen; uses mark/unmark so the traverse does
   not block on locks already held by the freeze. */
371 int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
373 struct ctdb_control_pulldb *pull;
374 struct ctdb_db_context *ctdb_db;
375 struct pulldb_data params;
376 struct ctdb_marshall_buffer *reply;
378 pull = (struct ctdb_control_pulldb *)indata.dptr;
380 ctdb_db = find_ctdb_db(ctdb, pull->db_id);
382 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", pull->db_id));
386 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
387 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_pull_db when not frozen\n"));
391 reply = talloc_zero(outdata, struct ctdb_marshall_buffer);
392 CTDB_NO_MEMORY(ctdb, reply);
394 reply->db_id = pull->db_id;
/* params.len starts at the header size so records append after it */
397 params.pulldata = reply;
398 params.len = offsetof(struct ctdb_marshall_buffer, data);
399 params.failed = false;
401 if (ctdb_db->unhealthy_reason) {
402 /* this is just a warning, as the tdb should be empty anyway */
403 DEBUG(DEBUG_WARNING,("db(%s) unhealty in ctdb_control_pull_db: %s\n",
404 ctdb_db->db_name, ctdb_db->unhealthy_reason));
407 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
408 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
/* NOTE(review): "¶ms" below is a mojibake of "&params" from a bad
   character-encoding conversion - code left byte-identical here */
412 if (tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_pulldb, ¶ms) == -1) {
413 DEBUG(DEBUG_ERR,(__location__ " Failed to get traverse db '%s'\n", ctdb_db->db_name));
414 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
415 talloc_free(params.pulldata);
419 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
421 outdata->dptr = (uint8_t *)params.pulldata;
422 outdata->dsize = params.len;
428 push a bunch of records into a ltdb, filtering by rsn
/* control handler: store a marshalled blob of records into the local tdb
   during recovery.  Requires the database's priority group to be frozen;
   read-only delegation flags are stripped, and for read-only-capable
   databases the tracking db and any in-flight revokes are cleared. */
430 int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
432 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
433 struct ctdb_db_context *ctdb_db;
435 struct ctdb_rec_data *rec;
/* the blob must at least contain the marshall-buffer header */
437 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
438 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
442 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
444 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
448 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
449 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_push_db when not frozen\n"));
453 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
454 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
458 rec = (struct ctdb_rec_data *)&reply->data[0];
460 DEBUG(DEBUG_INFO,("starting push of %u records for dbid 0x%x\n",
461 reply->count, reply->db_id));
463 for (i=0;i<reply->count;i++) {
465 struct ctdb_ltdb_header *hdr;
/* each marshalled record is key bytes followed by data bytes */
467 key.dptr = &rec->data[0];
468 key.dsize = rec->keylen;
469 data.dptr = &rec->data[key.dsize];
470 data.dsize = rec->datalen;
/* data must begin with a ltdb header */
472 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
473 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
476 hdr = (struct ctdb_ltdb_header *)data.dptr;
477 /* strip off any read only record flags. All readonly records
478 are revoked implicitely by a recovery
480 hdr->flags &= ~(CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE);
/* skip past the header; store takes header and payload separately */
482 data.dptr += sizeof(*hdr);
483 data.dsize -= sizeof(*hdr);
485 ret = ctdb_ltdb_store(ctdb_db, key, hdr, data);
487 DEBUG(DEBUG_CRIT, (__location__ " Unable to store record\n"));
/* advance to the next marshalled record */
491 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
494 DEBUG(DEBUG_DEBUG,("finished push of %u records for dbid 0x%x\n",
495 reply->count, reply->db_id));
497 if (ctdb_db->readonly) {
498 DEBUG(DEBUG_CRIT,("Clearing the tracking database for dbid 0x%x\n",
/* if the tracking db cannot be wiped, drop read-only support entirely */
500 if (tdb_wipe_all(ctdb_db->rottdb) != 0) {
501 DEBUG(DEBUG_ERR,("Failed to wipe tracking database for 0x%x. Dropping read-only delegation support\n", ctdb_db->db_id));
502 ctdb_db->readonly = false;
503 tdb_close(ctdb_db->rottdb);
504 ctdb_db->rottdb = NULL;
505 ctdb_db->readonly = false;
/* abort all in-flight revoke children for this database */
507 while (ctdb_db->revokechild_active != NULL) {
508 talloc_free(ctdb_db->revokechild_active);
509 ctdb_db->revokechild_active = NULL;
513 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
517 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
/* tdb traverse callback: rewrite the dmaster field in each record's ltdb
   header to *p (the new dmaster pnn), skipping records already correct */
522 static int traverse_setdmaster(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
524 uint32_t *dmaster = (uint32_t *)p;
525 struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)data.dptr;
528 /* skip if already correct */
529 if (header->dmaster == *dmaster) {
/* patch the header in place and write the whole record back */
533 header->dmaster = *dmaster;
535 ret = tdb_store(tdb, key, data, TDB_REPLACE);
537 DEBUG(DEBUG_CRIT,(__location__ " failed to write tdb data back ret:%d\n",ret));
541 /* TODO: add error checking here */
/* control handler: set the dmaster of every record in a database to the
   pnn supplied in indata.  Requires the database's priority group to be
   frozen; uses lock mark/unmark around the traverse. */
546 int32_t ctdb_control_set_dmaster(struct ctdb_context *ctdb, TDB_DATA indata)
548 struct ctdb_control_set_dmaster *p = (struct ctdb_control_set_dmaster *)indata.dptr;
549 struct ctdb_db_context *ctdb_db;
551 ctdb_db = find_ctdb_db(ctdb, p->db_id);
553 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", p->db_id));
557 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
558 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_set_dmaster when not frozen\n"));
562 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
563 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
/* writing traverse - rewrites each record's header dmaster */
567 tdb_traverse(ctdb_db->ltdb->tdb, traverse_setdmaster, &p->dmaster);
569 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
/* per-request state for the asynchronous set_recmode control: tracks the
   pending control, the verification child and its pipe, and timing */
574 struct ctdb_set_recmode_state {
575 struct ctdb_context *ctdb;
576 struct ctdb_req_control *c;
/* timeout event and pipe-read event for the child process */
579 struct timed_event *te;
580 struct fd_event *fde;
/* when the reclock verification started, for latency accounting */
582 struct timeval start_time;
586 called if our set_recmode child times out. this would happen if
587 ctdb_recovery_lock() would block.
/* timed-event callback: the child verifying the recovery lock hung, which
   is treated as success (the lock is held elsewhere, as it should be):
   apply the requested recmode and reply to the pending control */
589 static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_event *te,
590 struct timeval t, void *private_data)
592 struct ctdb_set_recmode_state *state = talloc_get_type(private_data,
593 struct ctdb_set_recmode_state);
595 /* we consider this a success, not a failure, as we failed to
596 set the recovery lock which is what we wanted. This can be
597 caused by the cluster filesystem being very slow to
598 arbitrate locks immediately after a node failure.
600 DEBUG(DEBUG_ERR,(__location__ " set_recmode child process hung/timedout CFS slow to grant locks? (allowing recmode set anyway)\n"));
601 state->ctdb->recovery_mode = state->recmode;
602 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
607 /* when we free the recmode state we must kill any child process.
/* talloc destructor: record reclock latency, close both pipe ends (fd
   close calls elided in this excerpt) and SIGKILL the verification child */
609 static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
611 double l = timeval_elapsed(&state->start_time);
613 CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock", reclock.ctdbd, l);
615 if (state->fd[0] != -1) {
618 if (state->fd[1] != -1) {
621 kill(state->child, SIGKILL);
625 /* this is called when the client process has completed ctdb_recovery_lock()
626 and has written data back to us through the pipe.
/* fd-event callback: read the child's one-byte verdict from the pipe.
   0 means the child could NOT take the reclock (good - someone else holds
   it), anything else is an error and the control is failed. */
628 static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
629 uint16_t flags, void *private_data)
631 struct ctdb_set_recmode_state *state= talloc_get_type(private_data,
632 struct ctdb_set_recmode_state);
636 /* we got a response from our child process so we can abort the
/* cancel the hung-child timeout now that the child responded */
639 talloc_free(state->te);
643 /* read the childs status when trying to lock the reclock file.
644 child wrote 0 if everything is fine and 1 if it did manage
645 to lock the file, which would be a problem since that means
646 we got a request to exit from recovery but we could still lock
647 the file which at this time SHOULD be locked by the recovery
648 daemon on the recmaster
650 ret = read(state->fd[0], &c, 1);
651 if (ret != 1 || c != 0) {
652 ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "managed to lock reclock file from inside daemon");
/* verdict ok: apply the requested recovery mode */
657 state->ctdb->recovery_mode = state->recmode;
659 /* release any deferred attach calls from clients */
660 if (state->recmode == CTDB_RECOVERY_NORMAL) {
661 ctdb_process_deferred_attach(state->ctdb);
664 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
/* timed-event callback: fired when the node has stayed in recovery mode
   longer than the configured limit - release all public IP addresses */
670 ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te,
671 struct timeval t, void *private_data)
673 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
675 DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
/* one-shot: drop the context so the event is not rearmed */
676 talloc_free(ctdb->release_ips_ctx);
677 ctdb->release_ips_ctx = NULL;
679 ctdb_release_all_ips(ctdb);
683 * Set up an event to drop all public ips if we remain in recovery for too
/* Arm (or re-arm) the drop-all-IPs timer; fires after
   tunable.recovery_drop_all_ips seconds unless cancelled first.
   Returns 0 on success (return elided in this excerpt). */
686 int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb)
688 if (ctdb->release_ips_ctx != NULL) {
689 talloc_free(ctdb->release_ips_ctx);
/* fresh context owns the timer, so freeing it cancels the event */
691 ctdb->release_ips_ctx = talloc_new(ctdb);
692 CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
694 event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb);
699 set the recovery mode
/* control handler: switch between CTDB_RECOVERY_NORMAL and
   CTDB_RECOVERY_ACTIVE.  Leaving recovery optionally verifies (in a forked
   child) that this daemon can NOT take the recovery lock - the recovery
   master should hold it.  The reply is sent asynchronously from the
   child's pipe handler or the timeout handler. */
701 int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
702 struct ctdb_req_control *c,
703 TDB_DATA indata, bool *async_reply,
704 const char **errormsg)
706 uint32_t recmode = *(uint32_t *)indata.dptr;
708 struct ctdb_set_recmode_state *state;
/* remembered so the child can exit when the daemon dies */
709 pid_t parent = getpid();
711 /* if we enter recovery but stay in recovery for too long
712 we will eventually drop all our ip addresses
714 if (recmode == CTDB_RECOVERY_NORMAL) {
715 talloc_free(ctdb->release_ips_ctx);
716 ctdb->release_ips_ctx = NULL;
718 if (ctdb_deferred_drop_all_ips(ctdb) != 0) {
719 DEBUG(DEBUG_ERR,("Failed to set up deferred drop all ips\n"));
723 if (recmode != ctdb->recovery_mode) {
724 DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n",
725 recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE"));
/* anything other than ACTIVE->NORMAL can be applied immediately
   with no reclock verification */
728 if (recmode != CTDB_RECOVERY_NORMAL ||
729 ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
730 ctdb->recovery_mode = recmode;
734 /* some special handling when ending recovery mode */
736 /* force the databases to thaw */
737 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
738 if (ctdb->freeze_handles[i] != NULL) {
739 ctdb_control_thaw(ctdb, i);
743 state = talloc(ctdb, struct ctdb_set_recmode_state);
744 CTDB_NO_MEMORY(ctdb, state);
746 state->start_time = timeval_current();
750 /* release any deferred attach calls from clients */
751 if (recmode == CTDB_RECOVERY_NORMAL) {
752 ctdb_process_deferred_attach(ctdb);
755 if (ctdb->tunable.verify_recovery_lock == 0) {
756 /* dont need to verify the reclock file */
757 ctdb->recovery_mode = recmode;
761 /* For the rest of what needs to be done, we need to do this in
762 a child process since
763 1, the call to ctdb_recovery_lock() can block if the cluster
764 filesystem is in the process of recovery.
766 ret = pipe(state->fd);
769 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for set_recmode child\n"));
773 state->child = fork();
774 if (state->child == (pid_t)-1) {
/* ---- child process: try to take the reclock and report back ---- */
781 if (state->child == 0) {
785 debug_extra = talloc_asprintf(NULL, "set_recmode:");
786 /* we should not be able to get the lock on the reclock file,
787 as it should be held by the recovery master
789 if (ctdb_recovery_lock(ctdb, false)) {
790 DEBUG(DEBUG_CRIT,("ERROR: recovery lock file %s not locked when recovering!\n", ctdb->recovery_lock_file));
/* NOTE(review): write() return value is ignored here and below */
794 write(state->fd[1], &cc, 1);
795 /* make sure we die when our parent dies */
796 while (kill(parent, 0) == 0 || errno != ESRCH) {
798 write(state->fd[1], &cc, 1);
/* ---- parent process continues here ---- */
803 set_close_on_exec(state->fd[0]);
807 talloc_set_destructor(state, set_recmode_destructor);
809 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for setrecmode\n", state->fd[0]));
/* if the child does not answer within 5s, assume the lock is held
   elsewhere and allow the recmode change (see timeout handler) */
811 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0),
812 ctdb_set_recmode_timeout, state);
814 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
819 if (state->fde == NULL) {
823 tevent_fd_set_auto_close(state->fde);
/* reply will come from set_recmode_handler/timeout, not from here */
826 state->recmode = recmode;
827 state->c = talloc_steal(state, c);
836 try and get the recovery lock in shared storage - should only work
837 on the recovery master recovery daemon. Anywhere else is a bug
/* Open the shared recovery lock file and attempt a non-waiting F_SETLK
   write lock.  On failure the fd is closed and false is presumably
   returned; when 'keep' is false the fd is closed again after a
   successful lock (the conditional around the close at 876 is elided). */
839 bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
844 DEBUG(DEBUG_ERR, ("Take the recovery lock\n"));
/* drop any previously held lock fd before retrying */
846 if (ctdb->recovery_lock_fd != -1) {
847 close(ctdb->recovery_lock_fd);
848 ctdb->recovery_lock_fd = -1;
851 ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
852 if (ctdb->recovery_lock_fd == -1) {
853 DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Unable to open %s - (%s)\n",
854 ctdb->recovery_lock_file, strerror(errno)));
858 set_close_on_exec(ctdb->recovery_lock_fd);
/* whole-file exclusive byte-range lock (l_start/l_len init elided) */
860 lock.l_type = F_WRLCK;
861 lock.l_whence = SEEK_SET;
/* F_SETLK (not F_SETLKW): fail immediately rather than block */
866 if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
867 close(ctdb->recovery_lock_fd);
868 ctdb->recovery_lock_fd = -1;
870 DEBUG(DEBUG_CRIT,("ctdb_recovery_lock: Failed to get recovery lock on '%s'\n", ctdb->recovery_lock_file));
876 close(ctdb->recovery_lock_fd);
877 ctdb->recovery_lock_fd = -1;
881 DEBUG(DEBUG_NOTICE, ("Recovery lock taken successfully\n"));
884 DEBUG(DEBUG_NOTICE,("ctdb_recovery_lock: Got recovery lock on '%s'\n", ctdb->recovery_lock_file));
890 delete a record as part of the vacuum process
891 only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
892 use non-blocking locks
894 return 0 if the record was successfully deleted (i.e. it does not exist
895 when the function returns)
896 or !0 is the record still exists in the tdb after returning.
898 static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data *rec)
901 struct ctdb_ltdb_header *hdr, *hdr2;
903 /* these are really internal tdb functions - but we need them here for
904 non-blocking lock of the freelist */
905 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
906 int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
/* unpack key and data (the caller's view of the record header) */
909 key.dsize = rec->keylen;
910 key.dptr = &rec->data[0];
911 data.dsize = rec->datalen;
912 data.dptr = &rec->data[rec->keylen];
/* never delete a record we are lmaster for */
914 if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
915 DEBUG(DEBUG_INFO,(__location__ " Called delete on record where we are lmaster\n"));
/* caller must pass exactly one ltdb header as the data payload */
919 if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
920 DEBUG(DEBUG_ERR,(__location__ " Bad record size\n"));
924 hdr = (struct ctdb_ltdb_header *)data.dptr;
926 /* use a non-blocking lock */
927 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
/* re-fetch under the chainlock; tdb_fetch mallocs data.dptr
   (NOTE(review): the free of this buffer is elided in this excerpt) */
931 data = tdb_fetch(ctdb_db->ltdb->tdb, key);
932 if (data.dptr == NULL) {
933 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
/* record shorter than a header is corrupt - delete it outright */
937 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
938 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
939 tdb_delete(ctdb_db->ltdb->tdb, key);
940 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
941 DEBUG(DEBUG_CRIT,(__location__ " Deleted corrupt record\n"));
943 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
948 hdr2 = (struct ctdb_ltdb_header *)data.dptr;
/* skip if the stored record is newer than the rsn we were given */
950 if (hdr2->rsn > hdr->rsn) {
951 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
952 DEBUG(DEBUG_INFO,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n",
953 (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn));
/* never delete a record we are currently dmaster of */
958 if (hdr2->dmaster == ctdb->pnn) {
959 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
960 DEBUG(DEBUG_INFO,(__location__ " Attempted delete record where we are the dmaster\n"));
/* freelist lock (list -1) must also be taken non-blocking */
965 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
966 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
971 if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
972 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
973 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
974 DEBUG(DEBUG_INFO,(__location__ " Failed to delete record\n"));
/* success path: release locks in reverse order of acquisition */
979 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
980 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
/* holds the pending control so the event-script callback can reply */
987 struct recovery_callback_state {
988 struct ctdb_req_control *c;
993 called when the 'recovered' event script has finished
/* event-script completion callback: re-enable monitoring, bump the
   recovery counter, ban ourselves on script timeout, reply to the pending
   control and record the recovery end time */
995 static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
997 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
999 ctdb_enable_monitoring(ctdb);
1000 CTDB_INCREMENT_STAT(ctdb, num_recoveries);
1003 DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
/* -ETIME means the event script timed out: ban this node */
1004 if (status == -ETIME) {
1005 ctdb_ban_self(ctdb);
1009 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1012 gettimeofday(&ctdb->last_recovery_finished, NULL);
1016 recovery has finished
/* control handler: run the 'recovered' event script; the reply is sent
   asynchronously from ctdb_end_recovery_callback() */
1018 int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
1019 struct ctdb_req_control *c,
1023 struct recovery_callback_state *state;
1025 DEBUG(DEBUG_NOTICE,("Recovery has finished\n"));
/* flush any trans3 commits that were queued up during recovery */
1027 ctdb_persistent_finish_trans3_commits(ctdb);
1029 state = talloc(ctdb, struct recovery_callback_state);
1030 CTDB_NO_MEMORY(ctdb, state);
/* monitoring stays off while the event script runs */
1034 ctdb_disable_monitoring(ctdb);
1036 ret = ctdb_event_script_callback(ctdb, state,
1037 ctdb_end_recovery_callback,
1040 CTDB_EVENT_RECOVERED, "%s", "");
1043 ctdb_enable_monitoring(ctdb);
1045 DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
1050 /* tell the control that we will be reply asynchronously */
1051 state->c = talloc_steal(state, c);
1052 *async_reply = true;
1057 called when the 'startrecovery' event script has finished
/* event-script completion callback: log a failure and reply to the
   pending start_recovery control with the script status */
1059 static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
1061 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
1064 DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
1067 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1072 run the startrecovery eventscript
/* control handler: record the recovery start time, disable monitoring and
   run the 'startrecovery' event script; reply is sent asynchronously from
   ctdb_start_recovery_callback() */
1074 int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
1075 struct ctdb_req_control *c,
1079 struct recovery_callback_state *state;
1081 DEBUG(DEBUG_NOTICE,(__location__ " startrecovery eventscript has been invoked\n"));
1082 gettimeofday(&ctdb->last_recovery_started, NULL);
1084 state = talloc(ctdb, struct recovery_callback_state);
1085 CTDB_NO_MEMORY(ctdb, state);
1087 state->c = talloc_steal(state, c);
1089 ctdb_disable_monitoring(ctdb);
1091 ret = ctdb_event_script_callback(ctdb, state,
1092 ctdb_start_recovery_callback,
1094 CTDB_EVENT_START_RECOVERY,
1098 DEBUG(DEBUG_ERR,(__location__ " Failed to start recovery\n"));
1103 /* tell the control that we will be reply asynchronously */
1104 *async_reply = true;
1109 try to delete all these records as part of the vacuuming process
1110 and return the records we failed to delete
/* control handler: attempt delete_tdb_record() on every record in the
   marshalled input blob; records that could not be deleted are marshalled
   back into outdata so the lmaster knows not to purge them */
1112 int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1114 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
1115 struct ctdb_db_context *ctdb_db;
1117 struct ctdb_rec_data *rec;
1118 struct ctdb_marshall_buffer *records;
1120 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
1121 DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
1125 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
1127 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
1132 DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
1133 reply->count, reply->db_id));
1136 /* create a blob to send back the records we couldnt delete */
1137 records = (struct ctdb_marshall_buffer *)
1138 talloc_zero_size(outdata,
1139 offsetof(struct ctdb_marshall_buffer, data));
1140 if (records == NULL) {
1141 DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
1144 records->db_id = ctdb_db->db_id;
1147 rec = (struct ctdb_rec_data *)&reply->data[0];
1148 for (i=0;i<reply->count;i++) {
/* unpack key/data from the marshalled record */
1151 key.dptr = &rec->data[0];
1152 key.dsize = rec->keylen;
1153 data.dptr = &rec->data[key.dsize];
1154 data.dsize = rec->datalen;
1156 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1157 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
1161 /* If we cant delete the record we must add it to the reply
1162 so the lmaster knows it may not purge this record
1164 if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
1166 struct ctdb_ltdb_header *hdr;
1168 hdr = (struct ctdb_ltdb_header *)data.dptr;
1169 data.dptr += sizeof(*hdr);
1170 data.dsize -= sizeof(*hdr);
1172 DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));
/* append the raw marshalled record to the failure blob */
1174 old_size = talloc_get_size(records);
1175 records = talloc_realloc_size(outdata, records, old_size + rec->length);
1176 if (records == NULL) {
1177 DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
1181 memcpy(old_size+(uint8_t *)records, rec, rec->length);
/* advance to the next marshalled input record */
1184 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
1188 outdata->dptr = (uint8_t *)records;
1189 outdata->dsize = talloc_get_size(records);
/* control handler: return this node's capability bitmask as a uint32_t */
1197 int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
1199 uint32_t *capabilities = NULL;
1201 capabilities = talloc(outdata, uint32_t);
1202 CTDB_NO_MEMORY(ctdb, capabilities);
1203 *capabilities = ctdb->capabilities;
1205 outdata->dsize = sizeof(uint32_t);
1206 outdata->dptr = (uint8_t *)capabilities;
/* timed-event callback: no ping from the recovery daemon within the
   timeout.  Re-arm the timer until recd_ping_failcount misses have
   accumulated, then restart the recovery daemon. */
1211 static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1213 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
1214 uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t);
1216 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *count));
/* not yet at the failure threshold: schedule another timeout
   (the count increment is elided in this excerpt) */
1218 if (*count < ctdb->tunable.recd_ping_failcount) {
1220 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1221 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1222 ctdb_recd_ping_timeout, ctdb);
1226 DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Restarting recovery daemon. (This can be caused if the cluster filesystem has hung)\n"));
1228 ctdb_stop_recoverd(ctdb);
1229 ctdb_start_recoverd(ctdb);
1232 /* The recovery daemon will ping us at regular intervals.
1233 If we havent been pinged for a while we assume the recovery
1234 daemon is inoperable and we shut down.
/* control handler: a ping arrived - reset the miss counter to zero and
   re-arm the ping-timeout watchdog (if enabled via the tunable) */
1236 int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
1238 talloc_free(ctdb->recd_ping_count);
/* fresh zeroed counter also owns the timer event below */
1240 ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
1241 CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);
1243 if (ctdb->tunable.recd_ping_timeout != 0) {
1244 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1245 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1246 ctdb_recd_ping_timeout, ctdb);
/* control handler: record which pnn is the current recovery master */
1254 int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata)
1256 CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
1258 ctdb->recovery_master = ((uint32_t *)(&indata.dptr[0]))[0];
/* holds the pending stop-node control so its callback can reply */
1263 struct stop_node_callback_state {
1264 struct ctdb_req_control *c;
1268 called when the 'stopped' event script has finished
/* event-script completion callback: on failure, roll back the STOPPED
   flag (set optimistically by ctdb_control_stop_node) and ban ourselves
   if the script timed out; always reply with the script status */
1270 static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void *p)
1272 struct stop_node_callback_state *state = talloc_get_type(p, struct stop_node_callback_state);
1275 DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
1276 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
1277 if (status == -ETIME) {
1278 ctdb_ban_self(ctdb);
1282 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
/* control handler: stop this node - run the 'stopped' event script and,
   on success of scheduling it, set the STOPPED flag.  Reply is sent
   asynchronously from ctdb_stop_node_callback(). */
1286 int32_t ctdb_control_stop_node(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
1289 struct stop_node_callback_state *state;
1291 DEBUG(DEBUG_INFO,(__location__ " Stopping node\n"));
1293 state = talloc(ctdb, struct stop_node_callback_state);
1294 CTDB_NO_MEMORY(ctdb, state);
1296 state->c = talloc_steal(state, c);
1298 ctdb_disable_monitoring(ctdb);
1300 ret = ctdb_event_script_callback(ctdb, state,
1301 ctdb_stop_node_callback,
1303 CTDB_EVENT_STOPPED, "%s", "");
/* on scheduling failure re-enable monitoring before bailing out */
1306 ctdb_enable_monitoring(ctdb);
1308 DEBUG(DEBUG_ERR,(__location__ " Failed to stop node\n"));
/* flag is set now and rolled back by the callback if the script fails */
1313 ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;
1315 *async_reply = true;
1320 int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
1322 DEBUG(DEBUG_INFO,(__location__ " Continue node\n"));
1323 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;