4 Copyright (C) Andrew Tridgell 2007
5 Copyright (C) Ronnie Sahlberg 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "lib/util/dlinklist.h"
32 lock all databases - mark only
34 static int ctdb_lock_all_databases_mark(struct ctdb_context *ctdb, uint32_t priority)
36 struct ctdb_db_context *ctdb_db;
38 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
39 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
43 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
44 DEBUG(DEBUG_ERR,("Attempt to mark all databases locked when not frozen\n"));
47 /* The dual loop is a woraround for older versions of samba
48 that does not yet support the set-db-priority/lock order
49 call. So that we get basic deadlock avoiidance also for
50 these old versions of samba.
51 This code will be removed in the future.
53 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
54 if (ctdb_db->priority != priority) {
57 if (strstr(ctdb_db->db_name, "notify") != NULL) {
60 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
64 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
65 if (ctdb_db->priority != priority) {
68 if (strstr(ctdb_db->db_name, "notify") == NULL) {
71 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
79 lock all databases - unmark only
81 static int ctdb_lock_all_databases_unmark(struct ctdb_context *ctdb, uint32_t priority)
83 struct ctdb_db_context *ctdb_db;
85 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
86 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
90 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
91 DEBUG(DEBUG_ERR,("Attempt to unmark all databases locked when not frozen\n"));
94 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
95 if (ctdb_db->priority != priority) {
98 if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) {
107 ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
109 CHECK_CONTROL_DATA_SIZE(0);
110 struct ctdb_vnn_map_wire *map;
113 len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
114 map = talloc_size(outdata, len);
115 CTDB_NO_MEMORY(ctdb, map);
117 map->generation = ctdb->vnn_map->generation;
118 map->size = ctdb->vnn_map->size;
119 memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);
121 outdata->dsize = len;
122 outdata->dptr = (uint8_t *)map;
128 ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
130 struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;
133 for(i=1; i<=NUM_DB_PRIORITIES; i++) {
134 if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
135 DEBUG(DEBUG_ERR,("Attempt to set vnnmap when not frozen\n"));
140 talloc_free(ctdb->vnn_map);
142 ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
143 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);
145 ctdb->vnn_map->generation = map->generation;
146 ctdb->vnn_map->size = map->size;
147 ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size);
148 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);
150 memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size);
156 ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
159 struct ctdb_db_context *ctdb_db;
160 struct ctdb_dbid_map *dbid_map;
162 CHECK_CONTROL_DATA_SIZE(0);
165 for(ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next){
170 outdata->dsize = offsetof(struct ctdb_dbid_map, dbs) + sizeof(dbid_map->dbs[0])*len;
171 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
172 if (!outdata->dptr) {
173 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
177 dbid_map = (struct ctdb_dbid_map *)outdata->dptr;
179 for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){
180 dbid_map->dbs[i].dbid = ctdb_db->db_id;
181 dbid_map->dbs[i].persistent = ctdb_db->persistent;
188 ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
190 uint32_t i, num_nodes;
191 struct ctdb_node_map *node_map;
193 CHECK_CONTROL_DATA_SIZE(0);
195 num_nodes = ctdb->num_nodes;
197 outdata->dsize = offsetof(struct ctdb_node_map, nodes) + num_nodes*sizeof(struct ctdb_node_and_flags);
198 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
199 if (!outdata->dptr) {
200 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
204 node_map = (struct ctdb_node_map *)outdata->dptr;
205 node_map->num = num_nodes;
206 for (i=0; i<num_nodes; i++) {
207 if (parse_ip(ctdb->nodes[i]->address.address,
208 NULL, /* TODO: pass in the correct interface here*/
210 &node_map->nodes[i].addr) == 0)
212 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
215 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
216 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
223 get an old style ipv4-only nodemap
226 ctdb_control_getnodemapv4(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
228 uint32_t i, num_nodes;
229 struct ctdb_node_mapv4 *node_map;
231 CHECK_CONTROL_DATA_SIZE(0);
233 num_nodes = ctdb->num_nodes;
235 outdata->dsize = offsetof(struct ctdb_node_mapv4, nodes) + num_nodes*sizeof(struct ctdb_node_and_flagsv4);
236 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
237 if (!outdata->dptr) {
238 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
242 node_map = (struct ctdb_node_mapv4 *)outdata->dptr;
243 node_map->num = num_nodes;
244 for (i=0; i<num_nodes; i++) {
245 if (parse_ipv4(ctdb->nodes[i]->address.address, 0, &node_map->nodes[i].sin) == 0) {
246 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
250 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
251 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
258 ctdb_reload_nodes_event(struct event_context *ev, struct timed_event *te,
259 struct timeval t, void *private_data)
262 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
264 struct ctdb_node **nodes;
266 tmp_ctx = talloc_new(ctdb);
268 /* steal the old nodes file for a while */
269 talloc_steal(tmp_ctx, ctdb->nodes);
272 num_nodes = ctdb->num_nodes;
275 /* load the new nodes file */
276 ctdb_load_nodes_file(ctdb);
278 for (i=0; i<ctdb->num_nodes; i++) {
279 /* keep any identical pre-existing nodes and connections */
280 if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
281 talloc_free(ctdb->nodes[i]);
282 ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);
286 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
290 /* any new or different nodes must be added */
291 if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
292 DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
293 ctdb_fatal(ctdb, "failed to add node. shutting down\n");
295 if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
296 DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
297 ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
301 /* tell the recovery daemon to reaload the nodes file too */
302 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);
304 talloc_free(tmp_ctx);
309 reload the nodes file after a short delay (so that we can send the response
313 ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
315 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1,0), ctdb_reload_nodes_event, ctdb);
321 a traverse function for pulling all relevent records from pulldb
324 struct ctdb_context *ctdb;
325 struct ctdb_marshall_buffer *pulldata;
330 static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
332 struct pulldb_data *params = (struct pulldb_data *)p;
333 struct ctdb_rec_data *rec;
335 /* add the record to the blob */
336 rec = ctdb_marshall_record(params->pulldata, 0, key, NULL, data);
338 params->failed = true;
341 params->pulldata = talloc_realloc_size(NULL, params->pulldata, rec->length + params->len);
342 if (params->pulldata == NULL) {
343 DEBUG(DEBUG_ERR,(__location__ " Failed to expand pulldb_data to %u (%u records)\n",
344 rec->length + params->len, params->pulldata->count));
345 params->failed = true;
348 params->pulldata->count++;
349 memcpy(params->len+(uint8_t *)params->pulldata, rec, rec->length);
350 params->len += rec->length;
357 pul a bunch of records from a ltdb, filtering by lmaster
359 int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
361 struct ctdb_control_pulldb *pull;
362 struct ctdb_db_context *ctdb_db;
363 struct pulldb_data params;
364 struct ctdb_marshall_buffer *reply;
366 pull = (struct ctdb_control_pulldb *)indata.dptr;
368 ctdb_db = find_ctdb_db(ctdb, pull->db_id);
370 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", pull->db_id));
374 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
375 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_pull_db when not frozen\n"));
379 reply = talloc_zero(outdata, struct ctdb_marshall_buffer);
380 CTDB_NO_MEMORY(ctdb, reply);
382 reply->db_id = pull->db_id;
385 params.pulldata = reply;
386 params.len = offsetof(struct ctdb_marshall_buffer, data);
387 params.failed = false;
389 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
390 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
394 if (tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_pulldb, ¶ms) == -1) {
395 DEBUG(DEBUG_ERR,(__location__ " Failed to get traverse db '%s'\n", ctdb_db->db_name));
396 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
397 talloc_free(params.pulldata);
401 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
403 outdata->dptr = (uint8_t *)params.pulldata;
404 outdata->dsize = params.len;
410 push a bunch of records into a ltdb, filtering by rsn
412 int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
414 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
415 struct ctdb_db_context *ctdb_db;
417 struct ctdb_rec_data *rec;
419 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
420 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
424 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
426 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
430 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
431 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_push_db when not frozen\n"));
435 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
436 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
440 rec = (struct ctdb_rec_data *)&reply->data[0];
442 DEBUG(DEBUG_INFO,("starting push of %u records for dbid 0x%x\n",
443 reply->count, reply->db_id));
445 for (i=0;i<reply->count;i++) {
447 struct ctdb_ltdb_header *hdr;
449 key.dptr = &rec->data[0];
450 key.dsize = rec->keylen;
451 data.dptr = &rec->data[key.dsize];
452 data.dsize = rec->datalen;
454 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
455 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
458 hdr = (struct ctdb_ltdb_header *)data.dptr;
459 data.dptr += sizeof(*hdr);
460 data.dsize -= sizeof(*hdr);
462 ret = ctdb_ltdb_store(ctdb_db, key, hdr, data);
464 DEBUG(DEBUG_CRIT, (__location__ " Unable to store record\n"));
468 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
471 DEBUG(DEBUG_DEBUG,("finished push of %u records for dbid 0x%x\n",
472 reply->count, reply->db_id));
474 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
478 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
483 static int traverse_setdmaster(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
485 uint32_t *dmaster = (uint32_t *)p;
486 struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)data.dptr;
489 /* skip if already correct */
490 if (header->dmaster == *dmaster) {
494 header->dmaster = *dmaster;
496 ret = tdb_store(tdb, key, data, TDB_REPLACE);
498 DEBUG(DEBUG_CRIT,(__location__ " failed to write tdb data back ret:%d\n",ret));
502 /* TODO: add error checking here */
507 int32_t ctdb_control_set_dmaster(struct ctdb_context *ctdb, TDB_DATA indata)
509 struct ctdb_control_set_dmaster *p = (struct ctdb_control_set_dmaster *)indata.dptr;
510 struct ctdb_db_context *ctdb_db;
512 ctdb_db = find_ctdb_db(ctdb, p->db_id);
514 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", p->db_id));
518 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
519 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_set_dmaster when not frozen\n"));
523 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
524 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
528 tdb_traverse(ctdb_db->ltdb->tdb, traverse_setdmaster, &p->dmaster);
530 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
/*
  state kept while a child process checks the recovery lock on behalf
  of ctdb_control_set_recmode()
 */
struct ctdb_set_recmode_state {
	struct ctdb_context *ctdb;
	struct ctdb_req_control *c;	/* control to reply to */
	uint32_t recmode;		/* mode to set once verified */
	int fd[2];			/* pipe between daemon and child */
	struct timed_event *te;		/* timeout for the child */
	struct fd_event *fde;		/* read event on fd[0] */
	pid_t child;
	struct timeval start_time;	/* used to report reclock latency */
};
547 called if our set_recmode child times out. this would happen if
548 ctdb_recovery_lock() would block.
550 static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_event *te,
551 struct timeval t, void *private_data)
553 struct ctdb_set_recmode_state *state = talloc_get_type(private_data,
554 struct ctdb_set_recmode_state);
556 /* we consider this a success, not a failure, as we failed to
557 set the recovery lock which is what we wanted. This can be
558 caused by the cluster filesystem being very slow to
559 arbitrate locks immediately after a node failure.
561 DEBUG(DEBUG_ERR,(__location__ " set_recmode child process hung/timedout CFS slow to grant locks? (allowing recmode set anyway)\n"));
562 state->ctdb->recovery_mode = state->recmode;
563 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
568 /* when we free the recmode state we must kill any child process.
570 static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
572 double l = timeval_elapsed(&state->start_time);
574 ctdb_reclock_latency(state->ctdb, "daemon reclock", &state->ctdb->statistics.reclock.ctdbd, l);
576 if (state->fd[0] != -1) {
579 if (state->fd[1] != -1) {
582 kill(state->child, SIGKILL);
586 /* this is called when the client process has completed ctdb_recovery_lock()
587 and has written data back to us through the pipe.
589 static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
590 uint16_t flags, void *private_data)
592 struct ctdb_set_recmode_state *state= talloc_get_type(private_data,
593 struct ctdb_set_recmode_state);
597 /* we got a response from our child process so we can abort the
600 talloc_free(state->te);
604 /* read the childs status when trying to lock the reclock file.
605 child wrote 0 if everything is fine and 1 if it did manage
606 to lock the file, which would be a problem since that means
607 we got a request to exit from recovery but we could still lock
608 the file which at this time SHOULD be locked by the recovery
609 daemon on the recmaster
611 ret = read(state->fd[0], &c, 1);
612 if (ret != 1 || c != 0) {
613 ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "managed to lock reclock file from inside daemon");
618 state->ctdb->recovery_mode = state->recmode;
620 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
626 ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te,
627 struct timeval t, void *private_data)
629 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
631 DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
632 talloc_free(ctdb->release_ips_ctx);
633 ctdb->release_ips_ctx = NULL;
635 ctdb_release_all_ips(ctdb);
639 set the recovery mode
641 int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
642 struct ctdb_req_control *c,
643 TDB_DATA indata, bool *async_reply,
644 const char **errormsg)
646 uint32_t recmode = *(uint32_t *)indata.dptr;
648 struct ctdb_set_recmode_state *state;
649 pid_t parent = getpid();
651 /* if we enter recovery but stay in recovery for too long
652 we will eventually drop all our ip addresses
654 if (recmode == CTDB_RECOVERY_NORMAL) {
655 talloc_free(ctdb->release_ips_ctx);
656 ctdb->release_ips_ctx = NULL;
658 talloc_free(ctdb->release_ips_ctx);
659 ctdb->release_ips_ctx = talloc_new(ctdb);
660 CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
662 event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb);
665 if (recmode != ctdb->recovery_mode) {
666 DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n",
667 recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE"));
670 if (recmode != CTDB_RECOVERY_NORMAL ||
671 ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
672 ctdb->recovery_mode = recmode;
676 /* some special handling when ending recovery mode */
678 /* force the databases to thaw */
679 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
680 if (ctdb->freeze_handles[i] != NULL) {
681 ctdb_control_thaw(ctdb, i);
685 state = talloc(ctdb, struct ctdb_set_recmode_state);
686 CTDB_NO_MEMORY(ctdb, state);
688 state->start_time = timeval_current();
692 if (ctdb->tunable.verify_recovery_lock == 0) {
693 /* dont need to verify the reclock file */
694 ctdb->recovery_mode = recmode;
698 /* For the rest of what needs to be done, we need to do this in
699 a child process since
700 1, the call to ctdb_recovery_lock() can block if the cluster
701 filesystem is in the process of recovery.
703 ret = pipe(state->fd);
706 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for set_recmode child\n"));
710 state->child = fork();
711 if (state->child == (pid_t)-1) {
718 if (state->child == 0) {
722 /* we should not be able to get the lock on the reclock file,
723 as it should be held by the recovery master
725 if (ctdb_recovery_lock(ctdb, false)) {
726 DEBUG(DEBUG_CRIT,("ERROR: recovery lock file %s not locked when recovering!\n", ctdb->recovery_lock_file));
730 write(state->fd[1], &cc, 1);
731 /* make sure we die when our parent dies */
732 while (kill(parent, 0) == 0 || errno != ESRCH) {
734 write(state->fd[1], &cc, 1);
739 set_close_on_exec(state->fd[0]);
743 talloc_set_destructor(state, set_recmode_destructor);
745 DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d for setrecmode\n", state->fd[0]));
747 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0),
748 ctdb_set_recmode_timeout, state);
750 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
751 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
755 if (state->fde == NULL) {
761 state->recmode = recmode;
762 state->c = talloc_steal(state, c);
771 try and get the recovery lock in shared storage - should only work
772 on the recovery master recovery daemon. Anywhere else is a bug
774 bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
779 DEBUG(DEBUG_ERR, ("Take the recovery lock\n"));
781 if (ctdb->recovery_lock_fd != -1) {
782 close(ctdb->recovery_lock_fd);
783 ctdb->recovery_lock_fd = -1;
786 ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
787 if (ctdb->recovery_lock_fd == -1) {
788 DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Unable to open %s - (%s)\n",
789 ctdb->recovery_lock_file, strerror(errno)));
793 set_close_on_exec(ctdb->recovery_lock_fd);
795 lock.l_type = F_WRLCK;
796 lock.l_whence = SEEK_SET;
801 if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
802 close(ctdb->recovery_lock_fd);
803 ctdb->recovery_lock_fd = -1;
805 DEBUG(DEBUG_CRIT,("ctdb_recovery_lock: Failed to get recovery lock on '%s'\n", ctdb->recovery_lock_file));
811 close(ctdb->recovery_lock_fd);
812 ctdb->recovery_lock_fd = -1;
816 DEBUG(DEBUG_ERR, ("Recovery lock taken successfully\n"));
819 DEBUG(DEBUG_NOTICE,("ctdb_recovery_lock: Got recovery lock on '%s'\n", ctdb->recovery_lock_file));
825 delete a record as part of the vacuum process
826 only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
827 use non-blocking locks
829 return 0 if the record was successfully deleted (i.e. it does not exist
830 when the function returns)
831 or !0 is the record still exists in the tdb after returning.
833 static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data *rec)
836 struct ctdb_ltdb_header *hdr, *hdr2;
838 /* these are really internal tdb functions - but we need them here for
839 non-blocking lock of the freelist */
840 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
841 int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
844 key.dsize = rec->keylen;
845 key.dptr = &rec->data[0];
846 data.dsize = rec->datalen;
847 data.dptr = &rec->data[rec->keylen];
849 if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
850 DEBUG(DEBUG_INFO,(__location__ " Called delete on record where we are lmaster\n"));
854 if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
855 DEBUG(DEBUG_ERR,(__location__ " Bad record size\n"));
859 hdr = (struct ctdb_ltdb_header *)data.dptr;
861 /* use a non-blocking lock */
862 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
866 data = tdb_fetch(ctdb_db->ltdb->tdb, key);
867 if (data.dptr == NULL) {
868 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
872 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
873 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
874 tdb_delete(ctdb_db->ltdb->tdb, key);
875 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
876 DEBUG(DEBUG_CRIT,(__location__ " Deleted corrupt record\n"));
878 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
883 hdr2 = (struct ctdb_ltdb_header *)data.dptr;
885 if (hdr2->rsn > hdr->rsn) {
886 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
887 DEBUG(DEBUG_INFO,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n",
888 (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn));
893 if (hdr2->dmaster == ctdb->pnn) {
894 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
895 DEBUG(DEBUG_INFO,(__location__ " Attempted delete record where we are the dmaster\n"));
900 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
901 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
906 if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
907 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
908 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
909 DEBUG(DEBUG_INFO,(__location__ " Failed to delete record\n"));
914 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
915 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
/*
  state handed to the start/end recovery event script callbacks
 */
struct recovery_callback_state {
	/* the control to reply to once the event script has finished */
	struct ctdb_req_control *c;
};
928 called when the 'recovered' event script has finished
930 static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
932 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
934 ctdb_enable_monitoring(ctdb);
937 DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
938 if (status == -ETIME) {
943 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
946 gettimeofday(&ctdb->last_recovery_finished, NULL);
950 recovery has finished
952 int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
953 struct ctdb_req_control *c,
957 struct recovery_callback_state *state;
959 DEBUG(DEBUG_NOTICE,("Recovery has finished\n"));
961 state = talloc(ctdb, struct recovery_callback_state);
962 CTDB_NO_MEMORY(ctdb, state);
964 state->c = talloc_steal(state, c);
966 ctdb_disable_monitoring(ctdb);
968 ret = ctdb_event_script_callback(ctdb, state,
969 ctdb_end_recovery_callback,
972 CTDB_EVENT_RECOVERED, "%s", "");
975 ctdb_enable_monitoring(ctdb);
977 DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
982 /* tell the control that we will be reply asynchronously */
988 called when the 'startrecovery' event script has finished
990 static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
992 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
995 DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
998 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1003 run the startrecovery eventscript
1005 int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
1006 struct ctdb_req_control *c,
1010 struct recovery_callback_state *state;
1012 DEBUG(DEBUG_NOTICE,(__location__ " startrecovery eventscript has been invoked\n"));
1013 gettimeofday(&ctdb->last_recovery_started, NULL);
1015 state = talloc(ctdb, struct recovery_callback_state);
1016 CTDB_NO_MEMORY(ctdb, state);
1018 state->c = talloc_steal(state, c);
1020 ctdb_disable_monitoring(ctdb);
1022 ret = ctdb_event_script_callback(ctdb, state,
1023 ctdb_start_recovery_callback,
1025 CTDB_EVENT_START_RECOVERY,
1029 DEBUG(DEBUG_ERR,(__location__ " Failed to start recovery\n"));
1034 /* tell the control that we will be reply asynchronously */
1035 *async_reply = true;
1040 try to delete all these records as part of the vacuuming process
1041 and return the records we failed to delete
1043 int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1045 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
1046 struct ctdb_db_context *ctdb_db;
1048 struct ctdb_rec_data *rec;
1049 struct ctdb_marshall_buffer *records;
1051 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
1052 DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
1056 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
1058 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
1063 DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
1064 reply->count, reply->db_id));
1067 /* create a blob to send back the records we couldnt delete */
1068 records = (struct ctdb_marshall_buffer *)
1069 talloc_zero_size(outdata,
1070 offsetof(struct ctdb_marshall_buffer, data));
1071 if (records == NULL) {
1072 DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
1075 records->db_id = ctdb_db->db_id;
1078 rec = (struct ctdb_rec_data *)&reply->data[0];
1079 for (i=0;i<reply->count;i++) {
1082 key.dptr = &rec->data[0];
1083 key.dsize = rec->keylen;
1084 data.dptr = &rec->data[key.dsize];
1085 data.dsize = rec->datalen;
1087 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1088 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
1092 /* If we cant delete the record we must add it to the reply
1093 so the lmaster knows it may not purge this record
1095 if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
1097 struct ctdb_ltdb_header *hdr;
1099 hdr = (struct ctdb_ltdb_header *)data.dptr;
1100 data.dptr += sizeof(*hdr);
1101 data.dsize -= sizeof(*hdr);
1103 DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));
1105 old_size = talloc_get_size(records);
1106 records = talloc_realloc_size(outdata, records, old_size + rec->length);
1107 if (records == NULL) {
1108 DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
1112 memcpy(old_size+(uint8_t *)records, rec, rec->length);
1115 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
1119 outdata->dptr = (uint8_t *)records;
1120 outdata->dsize = talloc_get_size(records);
1128 int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
1130 uint32_t *capabilities = NULL;
1132 capabilities = talloc(outdata, uint32_t);
1133 CTDB_NO_MEMORY(ctdb, capabilities);
1134 *capabilities = ctdb->capabilities;
1136 outdata->dsize = sizeof(uint32_t);
1137 outdata->dptr = (uint8_t *)capabilities;
1142 static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1144 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
1145 uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t);
1147 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *count));
1149 if (*count < ctdb->tunable.recd_ping_failcount) {
1151 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1152 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1153 ctdb_recd_ping_timeout, ctdb);
1157 DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Shutting down ctdb daemon. (This can be caused if the cluster filesystem has hung)\n"));
1159 ctdb_stop_recoverd(ctdb);
1160 ctdb_stop_keepalive(ctdb);
1161 ctdb_stop_monitoring(ctdb);
1162 ctdb_release_all_ips(ctdb);
1163 if (ctdb->methods != NULL) {
1164 ctdb->methods->shutdown(ctdb);
1166 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
1167 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Daemon has been shut down.\n"));
1171 /* The recovery daemon will ping us at regular intervals.
1172 If we havent been pinged for a while we assume the recovery
1173 daemon is inoperable and we shut down.
1175 int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
1177 talloc_free(ctdb->recd_ping_count);
1179 ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
1180 CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);
1182 if (ctdb->tunable.recd_ping_timeout != 0) {
1183 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1184 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1185 ctdb_recd_ping_timeout, ctdb);
1193 int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata)
1195 CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
1197 ctdb->recovery_master = ((uint32_t *)(&indata.dptr[0]))[0];
/*
  state handed to the 'stopped' event script callback
 */
struct stop_node_callback_state {
	/* the control to reply to once the event script has finished */
	struct ctdb_req_control *c;
};
1207 called when the 'stopped' event script has finished
1209 static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void *p)
1211 struct stop_node_callback_state *state = talloc_get_type(p, struct stop_node_callback_state);
1214 DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
1215 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
1216 if (status == -ETIME) {
1217 ctdb_ban_self(ctdb);
1221 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1225 int32_t ctdb_control_stop_node(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
1228 struct stop_node_callback_state *state;
1230 DEBUG(DEBUG_INFO,(__location__ " Stopping node\n"));
1232 state = talloc(ctdb, struct stop_node_callback_state);
1233 CTDB_NO_MEMORY(ctdb, state);
1235 state->c = talloc_steal(state, c);
1237 ctdb_disable_monitoring(ctdb);
1239 ret = ctdb_event_script_callback(ctdb, state,
1240 ctdb_stop_node_callback,
1242 CTDB_EVENT_STOPPED, "%s", "");
1245 ctdb_enable_monitoring(ctdb);
1247 DEBUG(DEBUG_ERR,(__location__ " Failed to stop node\n"));
1252 ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;
1254 *async_reply = true;
1259 int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
1261 DEBUG(DEBUG_INFO,(__location__ " Continue node\n"));
1262 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;