/*
   Copyright (C) Andrew Tridgell  2007
   Copyright (C) Ronnie Sahlberg  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "lib/tevent/tevent.h"
#include "lib/tdb/include/tdb.h"
#include "system/time.h"
#include "system/network.h"
#include "system/filesys.h"
#include "system/wait.h"
#include "../include/ctdb_private.h"
#include "lib/util/dlinklist.h"
32 lock all databases - mark only
34 static int ctdb_lock_all_databases_mark(struct ctdb_context *ctdb, uint32_t priority)
36 struct ctdb_db_context *ctdb_db;
38 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
39 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
43 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
44 DEBUG(DEBUG_ERR,("Attempt to mark all databases locked when not frozen\n"));
47 /* The dual loop is a woraround for older versions of samba
48 that does not yet support the set-db-priority/lock order
49 call. So that we get basic deadlock avoiidance also for
50 these old versions of samba.
51 This code will be removed in the future.
53 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
54 if (ctdb_db->priority != priority) {
57 if (strstr(ctdb_db->db_name, "notify") != NULL) {
60 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
64 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
65 if (ctdb_db->priority != priority) {
68 if (strstr(ctdb_db->db_name, "notify") == NULL) {
71 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
79 lock all databases - unmark only
81 static int ctdb_lock_all_databases_unmark(struct ctdb_context *ctdb, uint32_t priority)
83 struct ctdb_db_context *ctdb_db;
85 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
86 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
90 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
91 DEBUG(DEBUG_ERR,("Attempt to unmark all databases locked when not frozen\n"));
94 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
95 if (ctdb_db->priority != priority) {
98 if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) {
107 ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
109 CHECK_CONTROL_DATA_SIZE(0);
110 struct ctdb_vnn_map_wire *map;
113 len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
114 map = talloc_size(outdata, len);
115 CTDB_NO_MEMORY(ctdb, map);
117 map->generation = ctdb->vnn_map->generation;
118 map->size = ctdb->vnn_map->size;
119 memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);
121 outdata->dsize = len;
122 outdata->dptr = (uint8_t *)map;
128 ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
130 struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;
133 for(i=1; i<=NUM_DB_PRIORITIES; i++) {
134 if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
135 DEBUG(DEBUG_ERR,("Attempt to set vnnmap when not frozen\n"));
140 talloc_free(ctdb->vnn_map);
142 ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
143 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);
145 ctdb->vnn_map->generation = map->generation;
146 ctdb->vnn_map->size = map->size;
147 ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size);
148 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);
150 memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size);
156 ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
159 struct ctdb_db_context *ctdb_db;
160 struct ctdb_dbid_map *dbid_map;
162 CHECK_CONTROL_DATA_SIZE(0);
165 for(ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next){
170 outdata->dsize = offsetof(struct ctdb_dbid_map, dbs) + sizeof(dbid_map->dbs[0])*len;
171 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
172 if (!outdata->dptr) {
173 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
177 dbid_map = (struct ctdb_dbid_map *)outdata->dptr;
179 for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){
180 dbid_map->dbs[i].dbid = ctdb_db->db_id;
181 dbid_map->dbs[i].persistent = ctdb_db->persistent;
188 ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
190 uint32_t i, num_nodes;
191 struct ctdb_node_map *node_map;
193 CHECK_CONTROL_DATA_SIZE(0);
195 num_nodes = ctdb->num_nodes;
197 outdata->dsize = offsetof(struct ctdb_node_map, nodes) + num_nodes*sizeof(struct ctdb_node_and_flags);
198 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
199 if (!outdata->dptr) {
200 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
204 node_map = (struct ctdb_node_map *)outdata->dptr;
205 node_map->num = num_nodes;
206 for (i=0; i<num_nodes; i++) {
207 if (parse_ip(ctdb->nodes[i]->address.address,
208 NULL, /* TODO: pass in the correct interface here*/
210 &node_map->nodes[i].addr) == 0)
212 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
215 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
216 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
223 get an old style ipv4-only nodemap
226 ctdb_control_getnodemapv4(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
228 uint32_t i, num_nodes;
229 struct ctdb_node_mapv4 *node_map;
231 CHECK_CONTROL_DATA_SIZE(0);
233 num_nodes = ctdb->num_nodes;
235 outdata->dsize = offsetof(struct ctdb_node_mapv4, nodes) + num_nodes*sizeof(struct ctdb_node_and_flagsv4);
236 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
237 if (!outdata->dptr) {
238 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
242 node_map = (struct ctdb_node_mapv4 *)outdata->dptr;
243 node_map->num = num_nodes;
244 for (i=0; i<num_nodes; i++) {
245 if (parse_ipv4(ctdb->nodes[i]->address.address, 0, &node_map->nodes[i].sin) == 0) {
246 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
250 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
251 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
258 ctdb_reload_nodes_event(struct event_context *ev, struct timed_event *te,
259 struct timeval t, void *private_data)
262 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
264 struct ctdb_node **nodes;
266 tmp_ctx = talloc_new(ctdb);
268 /* steal the old nodes file for a while */
269 talloc_steal(tmp_ctx, ctdb->nodes);
272 num_nodes = ctdb->num_nodes;
275 /* load the new nodes file */
276 ctdb_load_nodes_file(ctdb);
278 for (i=0; i<ctdb->num_nodes; i++) {
279 /* keep any identical pre-existing nodes and connections */
280 if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
281 talloc_free(ctdb->nodes[i]);
282 ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);
286 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
290 /* any new or different nodes must be added */
291 if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
292 DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
293 ctdb_fatal(ctdb, "failed to add node. shutting down\n");
295 if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
296 DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
297 ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
301 /* tell the recovery daemon to reaload the nodes file too */
302 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);
304 talloc_free(tmp_ctx);
309 reload the nodes file after a short delay (so that we can send the response
313 ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
315 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1,0), ctdb_reload_nodes_event, ctdb);
/*
  state handed to traverse_pulldb(), the traverse function for pulling
  all relevant records from pulldb
 */
struct pulldb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *pulldata;	/* growing reply blob */
	uint32_t len;				/* bytes of pulldata in use */
	bool failed;				/* set when a record could not be added */
};
330 static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
332 struct pulldb_data *params = (struct pulldb_data *)p;
333 struct ctdb_rec_data *rec;
335 /* add the record to the blob */
336 rec = ctdb_marshall_record(params->pulldata, 0, key, NULL, data);
338 params->failed = true;
341 params->pulldata = talloc_realloc_size(NULL, params->pulldata, rec->length + params->len);
342 if (params->pulldata == NULL) {
343 DEBUG(DEBUG_ERR,(__location__ " Failed to expand pulldb_data to %u (%u records)\n",
344 rec->length + params->len, params->pulldata->count));
345 params->failed = true;
348 params->pulldata->count++;
349 memcpy(params->len+(uint8_t *)params->pulldata, rec, rec->length);
350 params->len += rec->length;
357 pul a bunch of records from a ltdb, filtering by lmaster
359 int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
361 struct ctdb_control_pulldb *pull;
362 struct ctdb_db_context *ctdb_db;
363 struct pulldb_data params;
364 struct ctdb_marshall_buffer *reply;
366 pull = (struct ctdb_control_pulldb *)indata.dptr;
368 ctdb_db = find_ctdb_db(ctdb, pull->db_id);
370 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", pull->db_id));
374 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
375 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_pull_db when not frozen\n"));
379 reply = talloc_zero(outdata, struct ctdb_marshall_buffer);
380 CTDB_NO_MEMORY(ctdb, reply);
382 reply->db_id = pull->db_id;
385 params.pulldata = reply;
386 params.len = offsetof(struct ctdb_marshall_buffer, data);
387 params.failed = false;
389 if (ctdb_db->unhealthy_reason) {
390 /* this is just a warning, as the tdb should be empty anyway */
391 DEBUG(DEBUG_WARNING,("db(%s) unhealty in ctdb_control_pull_db: %s\n",
392 ctdb_db->db_name, ctdb_db->unhealthy_reason));
395 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
396 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
400 if (tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_pulldb, ¶ms) == -1) {
401 DEBUG(DEBUG_ERR,(__location__ " Failed to get traverse db '%s'\n", ctdb_db->db_name));
402 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
403 talloc_free(params.pulldata);
407 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
409 outdata->dptr = (uint8_t *)params.pulldata;
410 outdata->dsize = params.len;
416 push a bunch of records into a ltdb, filtering by rsn
418 int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
420 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
421 struct ctdb_db_context *ctdb_db;
423 struct ctdb_rec_data *rec;
425 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
426 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
430 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
432 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
436 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
437 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_push_db when not frozen\n"));
441 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
442 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
446 rec = (struct ctdb_rec_data *)&reply->data[0];
448 DEBUG(DEBUG_INFO,("starting push of %u records for dbid 0x%x\n",
449 reply->count, reply->db_id));
451 for (i=0;i<reply->count;i++) {
453 struct ctdb_ltdb_header *hdr;
455 key.dptr = &rec->data[0];
456 key.dsize = rec->keylen;
457 data.dptr = &rec->data[key.dsize];
458 data.dsize = rec->datalen;
460 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
461 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
464 hdr = (struct ctdb_ltdb_header *)data.dptr;
465 data.dptr += sizeof(*hdr);
466 data.dsize -= sizeof(*hdr);
468 ret = ctdb_ltdb_store(ctdb_db, key, hdr, data);
470 DEBUG(DEBUG_CRIT, (__location__ " Unable to store record\n"));
474 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
477 DEBUG(DEBUG_DEBUG,("finished push of %u records for dbid 0x%x\n",
478 reply->count, reply->db_id));
480 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
484 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
489 static int traverse_setdmaster(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
491 uint32_t *dmaster = (uint32_t *)p;
492 struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)data.dptr;
495 /* skip if already correct */
496 if (header->dmaster == *dmaster) {
500 header->dmaster = *dmaster;
502 ret = tdb_store(tdb, key, data, TDB_REPLACE);
504 DEBUG(DEBUG_CRIT,(__location__ " failed to write tdb data back ret:%d\n",ret));
508 /* TODO: add error checking here */
513 int32_t ctdb_control_set_dmaster(struct ctdb_context *ctdb, TDB_DATA indata)
515 struct ctdb_control_set_dmaster *p = (struct ctdb_control_set_dmaster *)indata.dptr;
516 struct ctdb_db_context *ctdb_db;
518 ctdb_db = find_ctdb_db(ctdb, p->db_id);
520 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", p->db_id));
524 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
525 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_set_dmaster when not frozen\n"));
529 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
530 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
534 tdb_traverse(ctdb_db->ltdb->tdb, traverse_setdmaster, &p->dmaster);
536 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
/*
  state of an asynchronous set_recmode control that is waiting for a
  child process to probe the recovery lock
 */
struct ctdb_set_recmode_state {
	struct ctdb_context *ctdb;
	struct ctdb_req_control *c;	/* request to reply to */
	uint32_t recmode;		/* recovery mode to switch to */
	int fd[2];			/* pipe between child and daemon */
	struct timed_event *te;		/* timeout for the child */
	struct fd_event *fde;		/* read event on fd[0] */
	pid_t child;
	struct timeval start_time;	/* for reclock latency accounting */
};
553 called if our set_recmode child times out. this would happen if
554 ctdb_recovery_lock() would block.
556 static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_event *te,
557 struct timeval t, void *private_data)
559 struct ctdb_set_recmode_state *state = talloc_get_type(private_data,
560 struct ctdb_set_recmode_state);
562 /* we consider this a success, not a failure, as we failed to
563 set the recovery lock which is what we wanted. This can be
564 caused by the cluster filesystem being very slow to
565 arbitrate locks immediately after a node failure.
567 DEBUG(DEBUG_ERR,(__location__ " set_recmode child process hung/timedout CFS slow to grant locks? (allowing recmode set anyway)\n"));
568 state->ctdb->recovery_mode = state->recmode;
569 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
574 /* when we free the recmode state we must kill any child process.
576 static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
578 double l = timeval_elapsed(&state->start_time);
580 ctdb_reclock_latency(state->ctdb, "daemon reclock", &state->ctdb->statistics.reclock.ctdbd, l);
582 if (state->fd[0] != -1) {
585 if (state->fd[1] != -1) {
588 kill(state->child, SIGKILL);
592 /* this is called when the client process has completed ctdb_recovery_lock()
593 and has written data back to us through the pipe.
595 static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
596 uint16_t flags, void *private_data)
598 struct ctdb_set_recmode_state *state= talloc_get_type(private_data,
599 struct ctdb_set_recmode_state);
603 /* we got a response from our child process so we can abort the
606 talloc_free(state->te);
610 /* read the childs status when trying to lock the reclock file.
611 child wrote 0 if everything is fine and 1 if it did manage
612 to lock the file, which would be a problem since that means
613 we got a request to exit from recovery but we could still lock
614 the file which at this time SHOULD be locked by the recovery
615 daemon on the recmaster
617 ret = read(state->fd[0], &c, 1);
618 if (ret != 1 || c != 0) {
619 ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "managed to lock reclock file from inside daemon");
624 state->ctdb->recovery_mode = state->recmode;
626 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
632 ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te,
633 struct timeval t, void *private_data)
635 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
637 DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
638 talloc_free(ctdb->release_ips_ctx);
639 ctdb->release_ips_ctx = NULL;
641 ctdb_release_all_ips(ctdb);
645 set the recovery mode
647 int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
648 struct ctdb_req_control *c,
649 TDB_DATA indata, bool *async_reply,
650 const char **errormsg)
652 uint32_t recmode = *(uint32_t *)indata.dptr;
654 struct ctdb_set_recmode_state *state;
655 pid_t parent = getpid();
657 /* if we enter recovery but stay in recovery for too long
658 we will eventually drop all our ip addresses
660 if (recmode == CTDB_RECOVERY_NORMAL) {
661 talloc_free(ctdb->release_ips_ctx);
662 ctdb->release_ips_ctx = NULL;
664 talloc_free(ctdb->release_ips_ctx);
665 ctdb->release_ips_ctx = talloc_new(ctdb);
666 CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
668 event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb);
671 if (recmode != ctdb->recovery_mode) {
672 DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n",
673 recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE"));
676 if (recmode != CTDB_RECOVERY_NORMAL ||
677 ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
678 ctdb->recovery_mode = recmode;
682 /* some special handling when ending recovery mode */
684 /* force the databases to thaw */
685 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
686 if (ctdb->freeze_handles[i] != NULL) {
687 ctdb_control_thaw(ctdb, i);
691 state = talloc(ctdb, struct ctdb_set_recmode_state);
692 CTDB_NO_MEMORY(ctdb, state);
694 state->start_time = timeval_current();
698 if (ctdb->tunable.verify_recovery_lock == 0) {
699 /* dont need to verify the reclock file */
700 ctdb->recovery_mode = recmode;
704 /* For the rest of what needs to be done, we need to do this in
705 a child process since
706 1, the call to ctdb_recovery_lock() can block if the cluster
707 filesystem is in the process of recovery.
709 ret = pipe(state->fd);
712 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for set_recmode child\n"));
716 state->child = fork();
717 if (state->child == (pid_t)-1) {
724 if (state->child == 0) {
728 debug_extra = talloc_asprintf(NULL, "set_recmode:");
729 /* we should not be able to get the lock on the reclock file,
730 as it should be held by the recovery master
732 if (ctdb_recovery_lock(ctdb, false)) {
733 DEBUG(DEBUG_CRIT,("ERROR: recovery lock file %s not locked when recovering!\n", ctdb->recovery_lock_file));
737 write(state->fd[1], &cc, 1);
738 /* make sure we die when our parent dies */
739 while (kill(parent, 0) == 0 || errno != ESRCH) {
741 write(state->fd[1], &cc, 1);
746 set_close_on_exec(state->fd[0]);
750 talloc_set_destructor(state, set_recmode_destructor);
752 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for setrecmode\n", state->fd[0]));
754 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0),
755 ctdb_set_recmode_timeout, state);
757 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
762 if (state->fde == NULL) {
766 tevent_fd_set_auto_close(state->fde);
769 state->recmode = recmode;
770 state->c = talloc_steal(state, c);
779 try and get the recovery lock in shared storage - should only work
780 on the recovery master recovery daemon. Anywhere else is a bug
782 bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
787 DEBUG(DEBUG_ERR, ("Take the recovery lock\n"));
789 if (ctdb->recovery_lock_fd != -1) {
790 close(ctdb->recovery_lock_fd);
791 ctdb->recovery_lock_fd = -1;
794 ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
795 if (ctdb->recovery_lock_fd == -1) {
796 DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Unable to open %s - (%s)\n",
797 ctdb->recovery_lock_file, strerror(errno)));
801 set_close_on_exec(ctdb->recovery_lock_fd);
803 lock.l_type = F_WRLCK;
804 lock.l_whence = SEEK_SET;
809 if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
810 close(ctdb->recovery_lock_fd);
811 ctdb->recovery_lock_fd = -1;
813 DEBUG(DEBUG_CRIT,("ctdb_recovery_lock: Failed to get recovery lock on '%s'\n", ctdb->recovery_lock_file));
819 close(ctdb->recovery_lock_fd);
820 ctdb->recovery_lock_fd = -1;
824 DEBUG(DEBUG_NOTICE, ("Recovery lock taken successfully\n"));
827 DEBUG(DEBUG_NOTICE,("ctdb_recovery_lock: Got recovery lock on '%s'\n", ctdb->recovery_lock_file));
833 delete a record as part of the vacuum process
834 only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
835 use non-blocking locks
837 return 0 if the record was successfully deleted (i.e. it does not exist
838 when the function returns)
839 or !0 is the record still exists in the tdb after returning.
841 static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data *rec)
844 struct ctdb_ltdb_header *hdr, *hdr2;
846 /* these are really internal tdb functions - but we need them here for
847 non-blocking lock of the freelist */
848 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
849 int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
852 key.dsize = rec->keylen;
853 key.dptr = &rec->data[0];
854 data.dsize = rec->datalen;
855 data.dptr = &rec->data[rec->keylen];
857 if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
858 DEBUG(DEBUG_INFO,(__location__ " Called delete on record where we are lmaster\n"));
862 if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
863 DEBUG(DEBUG_ERR,(__location__ " Bad record size\n"));
867 hdr = (struct ctdb_ltdb_header *)data.dptr;
869 /* use a non-blocking lock */
870 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
874 data = tdb_fetch(ctdb_db->ltdb->tdb, key);
875 if (data.dptr == NULL) {
876 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
880 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
881 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
882 tdb_delete(ctdb_db->ltdb->tdb, key);
883 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
884 DEBUG(DEBUG_CRIT,(__location__ " Deleted corrupt record\n"));
886 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
891 hdr2 = (struct ctdb_ltdb_header *)data.dptr;
893 if (hdr2->rsn > hdr->rsn) {
894 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
895 DEBUG(DEBUG_INFO,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n",
896 (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn));
901 if (hdr2->dmaster == ctdb->pnn) {
902 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
903 DEBUG(DEBUG_INFO,(__location__ " Attempted delete record where we are the dmaster\n"));
908 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
909 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
914 if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
915 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
916 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
917 DEBUG(DEBUG_INFO,(__location__ " Failed to delete record\n"));
922 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
923 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
/* state passed to the start/end recovery event script callbacks */
struct recovery_callback_state {
	struct ctdb_req_control *c;	/* request to reply to */
};
936 called when the 'recovered' event script has finished
938 static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
940 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
942 ctdb_enable_monitoring(ctdb);
943 ctdb->statistics.num_recoveries++;
946 DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
947 if (status == -ETIME) {
952 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
955 gettimeofday(&ctdb->last_recovery_finished, NULL);
959 recovery has finished
961 int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
962 struct ctdb_req_control *c,
966 struct recovery_callback_state *state;
968 DEBUG(DEBUG_NOTICE,("Recovery has finished\n"));
970 state = talloc(ctdb, struct recovery_callback_state);
971 CTDB_NO_MEMORY(ctdb, state);
975 ctdb_disable_monitoring(ctdb);
977 ret = ctdb_event_script_callback(ctdb, state,
978 ctdb_end_recovery_callback,
981 CTDB_EVENT_RECOVERED, "%s", "");
984 ctdb_enable_monitoring(ctdb);
986 DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
991 /* tell the control that we will be reply asynchronously */
992 state->c = talloc_steal(state, c);
998 called when the 'startrecovery' event script has finished
1000 static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
1002 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
1005 DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
1008 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1013 run the startrecovery eventscript
1015 int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
1016 struct ctdb_req_control *c,
1020 struct recovery_callback_state *state;
1022 DEBUG(DEBUG_NOTICE,(__location__ " startrecovery eventscript has been invoked\n"));
1023 gettimeofday(&ctdb->last_recovery_started, NULL);
1025 state = talloc(ctdb, struct recovery_callback_state);
1026 CTDB_NO_MEMORY(ctdb, state);
1028 state->c = talloc_steal(state, c);
1030 ctdb_disable_monitoring(ctdb);
1032 ret = ctdb_event_script_callback(ctdb, state,
1033 ctdb_start_recovery_callback,
1035 CTDB_EVENT_START_RECOVERY,
1039 DEBUG(DEBUG_ERR,(__location__ " Failed to start recovery\n"));
1044 /* tell the control that we will be reply asynchronously */
1045 *async_reply = true;
1050 try to delete all these records as part of the vacuuming process
1051 and return the records we failed to delete
1053 int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1055 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
1056 struct ctdb_db_context *ctdb_db;
1058 struct ctdb_rec_data *rec;
1059 struct ctdb_marshall_buffer *records;
1061 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
1062 DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
1066 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
1068 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
1073 DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
1074 reply->count, reply->db_id));
1077 /* create a blob to send back the records we couldnt delete */
1078 records = (struct ctdb_marshall_buffer *)
1079 talloc_zero_size(outdata,
1080 offsetof(struct ctdb_marshall_buffer, data));
1081 if (records == NULL) {
1082 DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
1085 records->db_id = ctdb_db->db_id;
1088 rec = (struct ctdb_rec_data *)&reply->data[0];
1089 for (i=0;i<reply->count;i++) {
1092 key.dptr = &rec->data[0];
1093 key.dsize = rec->keylen;
1094 data.dptr = &rec->data[key.dsize];
1095 data.dsize = rec->datalen;
1097 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1098 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
1102 /* If we cant delete the record we must add it to the reply
1103 so the lmaster knows it may not purge this record
1105 if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
1107 struct ctdb_ltdb_header *hdr;
1109 hdr = (struct ctdb_ltdb_header *)data.dptr;
1110 data.dptr += sizeof(*hdr);
1111 data.dsize -= sizeof(*hdr);
1113 DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));
1115 old_size = talloc_get_size(records);
1116 records = talloc_realloc_size(outdata, records, old_size + rec->length);
1117 if (records == NULL) {
1118 DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
1122 memcpy(old_size+(uint8_t *)records, rec, rec->length);
1125 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
1129 outdata->dptr = (uint8_t *)records;
1130 outdata->dsize = talloc_get_size(records);
1138 int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
1140 uint32_t *capabilities = NULL;
1142 capabilities = talloc(outdata, uint32_t);
1143 CTDB_NO_MEMORY(ctdb, capabilities);
1144 *capabilities = ctdb->capabilities;
1146 outdata->dsize = sizeof(uint32_t);
1147 outdata->dptr = (uint8_t *)capabilities;
1152 static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1154 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
1155 uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t);
1157 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *count));
1159 if (*count < ctdb->tunable.recd_ping_failcount) {
1161 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1162 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1163 ctdb_recd_ping_timeout, ctdb);
1167 DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Shutting down ctdb daemon. (This can be caused if the cluster filesystem has hung)\n"));
1169 ctdb_stop_recoverd(ctdb);
1170 ctdb_stop_keepalive(ctdb);
1171 ctdb_stop_monitoring(ctdb);
1172 ctdb_release_all_ips(ctdb);
1173 if (ctdb->methods != NULL) {
1174 ctdb->methods->shutdown(ctdb);
1176 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
1177 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Daemon has been shut down.\n"));
1181 /* The recovery daemon will ping us at regular intervals.
1182 If we havent been pinged for a while we assume the recovery
1183 daemon is inoperable and we shut down.
1185 int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
1187 talloc_free(ctdb->recd_ping_count);
1189 ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
1190 CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);
1192 if (ctdb->tunable.recd_ping_timeout != 0) {
1193 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1194 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1195 ctdb_recd_ping_timeout, ctdb);
1203 int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata)
1205 CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
1207 ctdb->recovery_master = ((uint32_t *)(&indata.dptr[0]))[0];
/* state passed to the 'stopped' event script callback */
struct stop_node_callback_state {
	struct ctdb_req_control *c;	/* request to reply to */
};
1217 called when the 'stopped' event script has finished
1219 static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void *p)
1221 struct stop_node_callback_state *state = talloc_get_type(p, struct stop_node_callback_state);
1224 DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
1225 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
1226 if (status == -ETIME) {
1227 ctdb_ban_self(ctdb);
1231 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1235 int32_t ctdb_control_stop_node(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
1238 struct stop_node_callback_state *state;
1240 DEBUG(DEBUG_INFO,(__location__ " Stopping node\n"));
1242 state = talloc(ctdb, struct stop_node_callback_state);
1243 CTDB_NO_MEMORY(ctdb, state);
1245 state->c = talloc_steal(state, c);
1247 ctdb_disable_monitoring(ctdb);
1249 ret = ctdb_event_script_callback(ctdb, state,
1250 ctdb_stop_node_callback,
1252 CTDB_EVENT_STOPPED, "%s", "");
1255 ctdb_enable_monitoring(ctdb);
1257 DEBUG(DEBUG_ERR,(__location__ " Failed to stop node\n"));
1262 ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;
1264 *async_reply = true;
1269 int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
1271 DEBUG(DEBUG_INFO,(__location__ " Continue node\n"));
1272 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;