4 Copyright (C) Andrew Tridgell 2006
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/network.h"
22 #include "system/filesys.h"
23 #include "system/wait.h"
24 #include "system/time.h"
27 /* Allow use of deprecated function tevent_loop_allow_nesting() */
28 #define TEVENT_DEPRECATED
32 #include "lib/tdb_wrap/tdb_wrap.h"
33 #include "lib/util/dlinklist.h"
34 #include "lib/util/debug.h"
35 #include "lib/util/time.h"
36 #include "lib/util/blocking.h"
37 #include "lib/util/become_daemon.h"
40 #include "ctdb_private.h"
41 #include "ctdb_client.h"
43 #include "common/rb_tree.h"
44 #include "common/reqid.h"
45 #include "common/system.h"
46 #include "common/common.h"
47 #include "common/logging.h"
48 #include "common/pidfile.h"
49 #include "common/sock_io.h"
/* One list entry per connected local client, linking a client structure
 * to its daemon context.  NOTE(review): this span looks truncated — the
 * closing brace (and presumably a pid member used elsewhere) is not
 * visible here.
 */
51 struct ctdb_client_pid_list {
52 struct ctdb_client_pid_list *next, *prev;
53 struct ctdb_context *ctdb;
55 struct ctdb_client *client;
/* Path of the daemon PID file (NULL means "no pidfile"). */
58 const char *ctdbd_pidfile = NULL;
/* Context owning the created PID file; freed by ctdb_remove_pidfile(). */
59 static struct pidfile_context *ctdbd_pidfile_ctx = NULL;
/* Forward declaration: dispatcher for packets arriving from local clients. */
61 static void daemon_incoming_packet(void *, struct ctdb_req_header *);
/* PID of the main daemon, used to guard code against running in forked
 * children.  NOTE(review): identifiers beginning with a double underscore
 * are reserved for the implementation.
 */
63 static pid_t __ctdbd_pid;
65 static void print_exit_message(void)
67 if (getpid() == __ctdbd_pid) {
68 DEBUG(DEBUG_NOTICE,("CTDB daemon shutting down\n"));
70 /* Wait a second to allow pending log messages to be flushed */
/* Snapshot taken at each CPU-usage check: last reported utilisation
 * percentage, wall-clock time and accumulated rusage CPU time.
 * NOTE(review): the closing brace of this struct is not visible in this
 * span — truncated source.
 */
77 struct cpu_check_threshold_data {
78 unsigned short percent;
79 struct timeval timeofday;
80 struct timeval ru_time;
/*
 * Periodic timer callback: compute the daemon's CPU utilisation over the
 * last interval from getrusage() deltas and warn when it crosses (or
 * drops back below) a threshold taken from CTDB_TEST_CPU_USAGE_THRESHOLD.
 * NOTE(review): many interior lines of this function are missing from
 * this span (declarations, early returns, closing braces) — the code
 * below is kept byte-identical rather than reconstructed.
 */
83 static void ctdb_cpu_check_threshold(struct tevent_context *ev,
84 struct tevent_timer *te,
88 struct ctdb_context *ctdb = talloc_get_type_abort(
89 private_data, struct ctdb_context);
90 uint32_t interval = 60;
92 static unsigned short threshold = 0;
93 static struct cpu_check_threshold_data prev = {
95 .timeofday = { .tv_sec = 0 },
96 .ru_time = { .tv_sec = 0 },
100 struct cpu_check_threshold_data curr = {
103 int64_t ru_time_diff, timeofday_diff;
108 * Cache the threshold so that we don't waste time checking
109 * the environment variable every time
111 if (threshold == 0) {
116 t = getenv("CTDB_TEST_CPU_USAGE_THRESHOLD");
121 if (th <= 0 || th > 100) {
122 DBG_WARNING("Failed to parse env var: %s\n", t);
129 ret = getrusage(RUSAGE_SELF, &usage);
131 DBG_WARNING("rusage() failed: %d\n", ret);
135 /* Sum the system and user CPU usage */
136 curr.ru_time = timeval_sum(&usage.ru_utime, &usage.ru_stime);
140 first = timeval_is_zero(&prev.timeofday);
142 /* No previous values recorded so no calculation to do */
146 timeofday_diff = usec_time_diff(&curr.timeofday, &prev.timeofday);
147 if (timeofday_diff <= 0) {
149 * Time went backwards or didn't progress so no (sane)
150 * calculation can be done
155 ru_time_diff = usec_time_diff(&curr.ru_time, &prev.ru_time);
157 curr.percent = ru_time_diff * 100 / timeofday_diff;
159 if (curr.percent >= threshold) {
160 /* Log only if the utilisation changes */
161 if (curr.percent != prev.percent) {
162 D_WARNING("WARNING: CPU utilisation %hu%% >= "
163 "threshold (%hu%%)\n",
168 /* Log if the utilisation falls below the threshold */
169 if (prev.percent >= threshold) {
170 D_WARNING("WARNING: CPU utilisation %hu%% < "
171 "threshold (%hu%%)\n",
/* Re-arm the timer so the check repeats every `interval` seconds. */
181 tevent_add_timer(ctdb->ev, ctdb,
182 timeval_current_ofs(interval, 0),
183 ctdb_cpu_check_threshold,
/* Arm the first CPU-usage check timer (only compiled when getrusage()
 * is available).  NOTE(review): the timeval and final argument lines of
 * the tevent_add_timer() call are missing from this span.
 */
187 static void ctdb_start_cpu_check_threshold(struct ctdb_context *ctdb)
189 tevent_add_timer(ctdb->ev, ctdb,
191 ctdb_cpu_check_threshold,
194 #endif /* HAVE_GETRUSAGE */
196 static void ctdb_time_tick(struct tevent_context *ev, struct tevent_timer *te,
197 struct timeval t, void *private_data)
199 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
201 if (getpid() != ctdb->ctdbd_pid) {
205 tevent_add_timer(ctdb->ev, ctdb,
206 timeval_current_ofs(1, 0),
207 ctdb_time_tick, ctdb);
210 /* Used to trigger a dummy event once per second, to make
211 * detection of hangs more reliable.
213 static void ctdb_start_time_tickd(struct ctdb_context *ctdb)
215 tevent_add_timer(ctdb->ev, ctdb,
216 timeval_current_ofs(1, 0),
217 ctdb_time_tick, ctdb);
/*
 * Start all of the daemon's recurring background activities.  Called
 * once at startup (from the setup-event callback).
 */
static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
{
	/* start monitoring for connected/disconnected nodes */
	ctdb_start_keepalive(ctdb);

	/* start periodic update of tcp tickle lists */
	ctdb_start_tcp_tickle_update(ctdb);

	/* start listening for recovery daemon pings */
	ctdb_control_recd_ping(ctdb);

	/* start listening to timer ticks */
	ctdb_start_time_tickd(ctdb);

#ifdef HAVE_GETRUSAGE
	ctdb_start_cpu_check_threshold(ctdb);
#endif /* HAVE_GETRUSAGE */
}
239 static void ignore_signal(int signum)
241 struct sigaction act;
243 memset(&act, 0, sizeof(act));
245 act.sa_handler = SIG_IGN;
246 sigemptyset(&act.sa_mask);
247 sigaddset(&act.sa_mask, signum);
248 sigaction(signum, &act, NULL);
253 send a packet to a client
255 static int daemon_queue_send(struct ctdb_client *client, struct ctdb_req_header *hdr)
257 CTDB_INCREMENT_STAT(client->ctdb, client_packets_sent);
258 if (hdr->operation == CTDB_REQ_MESSAGE) {
259 if (ctdb_queue_length(client->queue) > client->ctdb->tunable.max_queue_depth_drop_msg) {
260 DEBUG(DEBUG_ERR,("CTDB_REQ_MESSAGE queue full - killing client connection.\n"));
265 return ctdb_queue_send(client->queue, (uint8_t *)hdr, hdr->length);
/*
 * srvid message handler used in daemon mode: wrap the payload in a
 * CTDB_REQ_MESSAGE packet and queue it to the local client that
 * registered for this srvid.  NOTE(review): lines are missing from this
 * span (e.g. the srvid assignment and the final talloc_free) — kept
 * byte-identical rather than reconstructed.
 */
269 message handler for when we are in daemon mode. This redirects the message
272 static void daemon_message_handler(uint64_t srvid, TDB_DATA data,
275 struct ctdb_client *client = talloc_get_type(private_data, struct ctdb_client);
276 struct ctdb_req_message_old *r;
279 /* construct a message to send to the client containing the data */
280 len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
281 r = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_MESSAGE,
282 len, struct ctdb_req_message_old);
283 CTDB_NO_MEMORY_VOID(client->ctdb, r);
285 talloc_set_name_const(r, "req_message packet");
288 r->datalen = data.dsize;
289 memcpy(&r->data[0], data.dptr, data.dsize);
291 daemon_queue_send(client, &r->hdr);
/*
 * Handle a client request to register a message handler for a srvid.
 * Looks up the client by client_id and registers daemon_message_handler
 * as the callback.  NOTE(review): return statements and braces are
 * missing from this span — kept byte-identical.
 */
297 this is called when the ctdb daemon received a ctdb request to
298 set the srvid from the client
300 int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
302 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
304 if (client == NULL) {
305 DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_register_message_handler\n"));
308 res = srvid_register(ctdb->srv, client, srvid, daemon_message_handler,
311 DEBUG(DEBUG_ERR,(__location__ " Failed to register handler %llu in daemon\n",
312 (unsigned long long)srvid));
314 DEBUG(DEBUG_INFO,(__location__ " Registered message handler for srvid=%llu\n",
315 (unsigned long long)srvid));
322 this is called when the ctdb daemon received a ctdb request to
323 remove a srvid from the client
325 int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
327 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
328 if (client == NULL) {
329 DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_deregister_message_handler\n"));
332 return srvid_deregister(ctdb->srv, srvid, client);
/*
 * Tunnel message handler: re-wrap an incoming tunnel packet in a new
 * CTDB_REQ_TUNNEL packet and queue it to the local client registered for
 * this tunnel_id.  NOTE(review): braces and some lines (length variable,
 * allocation-failure return, final talloc_free) are missing from this
 * span — kept byte-identical.
 */
335 void daemon_tunnel_handler(uint64_t tunnel_id, TDB_DATA data,
338 struct ctdb_client *client =
339 talloc_get_type_abort(private_data, struct ctdb_client);
340 struct ctdb_req_tunnel_old *c, *pkt;
343 pkt = (struct ctdb_req_tunnel_old *)data.dptr;
345 len = offsetof(struct ctdb_req_tunnel_old, data) + pkt->datalen;
346 c = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_TUNNEL,
347 len, struct ctdb_req_tunnel_old);
349 DEBUG(DEBUG_ERR, ("Memory error in daemon_tunnel_handler\n"));
353 talloc_set_name_const(c, "req_tunnel packet");
/* Copy tunnel id, flags and payload into the outgoing packet. */
355 c->tunnel_id = tunnel_id;
356 c->flags = pkt->flags;
357 c->datalen = pkt->datalen;
358 memcpy(c->data, pkt->data, pkt->datalen);
360 daemon_queue_send(client, &c->hdr);
/*
 * talloc destructor for a ctdb_client: unhook takeover state, release
 * the client id and, if the client died with persistent updates or a
 * transaction commit in flight, force a recovery so the cluster does not
 * end up with half-applied state.  NOTE(review): several lines (braces,
 * null checks, return) are missing from this span — kept byte-identical.
 */
366 destroy a ctdb_client
368 static int ctdb_client_destructor(struct ctdb_client *client)
370 struct ctdb_db_context *ctdb_db;
372 ctdb_takeover_client_destructor_hook(client);
373 reqid_remove(client->ctdb->idr, client->client_id);
374 client->ctdb->num_clients--;
376 if (client->num_persistent_updates != 0) {
377 DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates));
378 client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
380 ctdb_db = find_ctdb_db(client->ctdb, client->db_id);
382 DEBUG(DEBUG_ERR, (__location__ " client exit while transaction "
383 "commit active. Forcing recovery.\n"));
384 client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
387 * trans3 transaction state:
389 * The destructor sets the pointer to NULL.
391 talloc_free(ctdb_db->persistent_state);
/*
 * Route a CTDB_REQ_MESSAGE from a local client: resolve
 * CTDB_CURRENT_NODE to our pnn, deliver locally if we are the
 * destination, otherwise forward to the remote node.  NOTE(review):
 * variable declarations, returns and braces are missing from this span —
 * kept byte-identical.
 */
399 this is called when the ctdb daemon received a ctdb request message
400 from a local client over the unix domain socket
402 static void daemon_request_message_from_client(struct ctdb_client *client,
403 struct ctdb_req_message_old *c)
408 if (c->hdr.destnode == CTDB_CURRENT_NODE) {
409 c->hdr.destnode = ctdb_get_pnn(client->ctdb);
412 /* maybe the message is for another client on this node */
413 if (ctdb_get_pnn(client->ctdb)==c->hdr.destnode) {
414 ctdb_request_message(client->ctdb, (struct ctdb_req_header *)c);
418 /* its for a remote node */
419 data.dptr = &c->data[0];
420 data.dsize = c->datalen;
421 res = ctdb_daemon_send_message(client->ctdb, c->hdr.destnode,
424 DEBUG(DEBUG_ERR,(__location__ " Failed to send message to remote node %u\n",
/* Per-call bookkeeping for a client CTDB_REQ_CALL while the call is in
 * flight: owning client, the call itself, and start time for latency
 * statistics.  readonly_fetch/client_callid record a readonly FETCH that
 * was remapped to FETCH_WITH_HEADER so the reply can be converted back.
 * NOTE(review): the reqid member and closing brace are not visible in
 * this span — truncated source.
 */
430 struct daemon_call_state {
431 struct ctdb_client *client;
433 struct ctdb_call *call;
434 struct timeval start_time;
436 /* readonly request ? */
437 uint32_t readonly_fetch;
438 uint32_t client_callid;
/*
 * Completion callback for a client-originated call: receive the call
 * result, build a CTDB_REPLY_CALL packet (stripping the extra ltdb
 * header again when the client's readonly FETCH was remapped to
 * FETCH_WITH_HEADER), queue it to the client and update latency/pending
 * statistics.  NOTE(review): many lines (declarations, returns, braces)
 * are missing from this span — kept byte-identical.
 */
442 complete a call from a client
444 static void daemon_call_from_client_callback(struct ctdb_call_state *state)
446 struct daemon_call_state *dstate = talloc_get_type(state->async.private_data,
447 struct daemon_call_state);
448 struct ctdb_reply_call_old *r;
451 struct ctdb_client *client = dstate->client;
452 struct ctdb_db_context *ctdb_db = state->ctdb_db;
/* Take ownership so dstate and the call live as long as the client. */
454 talloc_steal(client, dstate);
455 talloc_steal(dstate, dstate->call);
457 res = ctdb_daemon_call_recv(state, dstate->call);
459 DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n"));
460 CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
462 CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 1", call_latency, dstate->start_time);
466 length = offsetof(struct ctdb_reply_call_old, data) + dstate->call->reply_data.dsize;
467 /* If the client asked for readonly FETCH, we remapped this to
468 FETCH_WITH_HEADER when calling the daemon. So we must
469 strip the extra header off the reply data before passing
470 it back to the client.
472 if (dstate->readonly_fetch
473 && dstate->client_callid == CTDB_FETCH_FUNC) {
474 length -= sizeof(struct ctdb_ltdb_header);
477 r = ctdbd_allocate_pkt(client->ctdb, dstate, CTDB_REPLY_CALL,
478 length, struct ctdb_reply_call_old);
480 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n"));
481 CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
482 CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 2", call_latency, dstate->start_time);
485 r->hdr.reqid = dstate->reqid;
486 r->status = dstate->call->status;
488 if (dstate->readonly_fetch
489 && dstate->client_callid == CTDB_FETCH_FUNC) {
490 /* client only asked for a FETCH so we must strip off
491 the extra ctdb_ltdb header
493 r->datalen = dstate->call->reply_data.dsize - sizeof(struct ctdb_ltdb_header);
494 memcpy(&r->data[0], dstate->call->reply_data.dptr + sizeof(struct ctdb_ltdb_header), r->datalen);
496 r->datalen = dstate->call->reply_data.dsize;
497 memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen);
500 res = daemon_queue_send(client, &r->hdr);
502 /* client is dead - return immediately */
506 DEBUG(DEBUG_ERR, (__location__ " Failed to queue packet from daemon to client\n"));
508 CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 3", call_latency, dstate->start_time);
509 CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
/* Wrapper passed through deferred/requeued packet paths so the client
 * can be re-looked-up by id (it may have disconnected meanwhile).
 * NOTE(review): the client_id member and closing brace are not visible
 * in this span — truncated source.
 */
513 struct ctdb_daemon_packet_wrap {
514 struct ctdb_context *ctdb;
/*
 * Wrapper around daemon_incoming_packet() used for requeued packets:
 * re-resolve the client from its id and drop the packet if the client
 * has disconnected in the meantime.  NOTE(review): braces, early
 * returns and the talloc_free of the wrapper are missing from this
 * span — kept byte-identical.
 */
519 a wrapper to catch disconnected clients
521 static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr)
523 struct ctdb_client *client;
524 struct ctdb_daemon_packet_wrap *w = talloc_get_type(p,
525 struct ctdb_daemon_packet_wrap);
527 DEBUG(DEBUG_CRIT,(__location__ " Bad packet type '%s'\n", talloc_get_name(p)));
531 client = reqid_find(w->ctdb->idr, w->client_id, struct ctdb_client);
532 if (client == NULL) {
533 DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
541 daemon_incoming_packet(client, hdr);
/* One deferred duplicate fetch request, kept on the queue below until
 * the in-flight fetch for the same key completes. */
544 struct ctdb_deferred_fetch_call {
545 struct ctdb_deferred_fetch_call *next, *prev;
546 struct ctdb_req_call_old *c;
547 struct ctdb_daemon_packet_wrap *w;
/* Per-key queue of deferred fetch calls (stored in the db's rb-tree). */
550 struct ctdb_deferred_fetch_queue {
551 struct ctdb_deferred_fetch_call *deferred_calls;
/* Pairing of a deferred call with its (re-resolved) client, used when
 * requeueing the call onto the event loop. */
554 struct ctdb_deferred_requeue {
555 struct ctdb_deferred_fetch_call *dfc;
556 struct ctdb_client *client;
559 /* called from a timer event and starts reprocessing the deferred call.*/
560 static void reprocess_deferred_call(struct tevent_context *ev,
561 struct tevent_timer *te,
562 struct timeval t, void *private_data)
564 struct ctdb_deferred_requeue *dfr = (struct ctdb_deferred_requeue *)private_data;
565 struct ctdb_client *client = dfr->client;
567 talloc_steal(client, dfr->dfc->c);
568 daemon_incoming_packet(client, (struct ctdb_req_header *)dfr->dfc->c);
/*
 * Destructor for a deferred-fetch queue: when the initial fetch-lock
 * finishes (or times out), requeue every deferred call onto the event
 * loop, preserving the original request order.  NOTE(review): braces,
 * continue statements and the trailing return are missing from this
 * span — kept byte-identical.
 */
572 /* the referral context is destroyed either after a timeout or when the initial
573 fetch-lock has finished.
574 at this stage, immediately start reprocessing the queued up deferred
575 calls so they get reprocessed immediately (and since we are dmaster at
576 this stage, trigger the waiting smbd processes to pick up and aquire the
579 static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq)
582 /* need to reprocess the packets from the queue explicitely instead of
583 just using a normal destructor since we want, need, to
584 call the clients in the same oder as the requests queued up
586 while (dfq->deferred_calls != NULL) {
587 struct ctdb_client *client;
588 struct ctdb_deferred_fetch_call *dfc = dfq->deferred_calls;
589 struct ctdb_deferred_requeue *dfr;
591 DLIST_REMOVE(dfq->deferred_calls, dfc);
/* Client may have gone away while the call was deferred. */
593 client = reqid_find(dfc->w->ctdb->idr, dfc->w->client_id, struct ctdb_client);
594 if (client == NULL) {
595 DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
600 /* process it by pushing it back onto the eventloop */
601 dfr = talloc(client, struct ctdb_deferred_requeue);
603 DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n"));
607 dfr->dfc = talloc_steal(dfr, dfc);
608 dfr->client = client;
/* Zero timeout: fire on the next event-loop iteration. */
610 tevent_add_timer(dfc->w->ctdb->ev, client, timeval_zero(),
611 reprocess_deferred_call, dfr);
617 /* insert the new deferral context into the rb tree.
618 there should never be a pre-existing context here, but check for it
619 warn and destroy the previous context if there is already a deferral context
622 static void *insert_dfq_callback(void *parm, void *data)
625 DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", data, parm));
631 /* if the original fetch-lock did not complete within a reasonable time,
632 free the context and context for all deferred requests to cause them to be
633 re-inserted into the event system.
635 static void dfq_timeout(struct tevent_context *ev, struct tevent_timer *te,
636 struct timeval t, void *private_data)
638 talloc_free(private_data);
/*
 * Register a key as having a remote fetch in flight: create a deferral
 * queue, insert it into the db's rb-tree keyed on the record key, and
 * arm a 30s safety timeout that tears the deferral down.  NOTE(review):
 * declarations, returns and braces are missing from this span — kept
 * byte-identical.
 */
641 /* This function is used in the local daemon to register a KEY in a database
643 While the remote fetch is in-flight, any futher attempts to re-fetch the
644 same record will be deferred until the fetch completes.
646 static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
649 struct ctdb_deferred_fetch_queue *dfq;
651 k = ctdb_key_to_idkey(call, call->key);
653 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
657 dfq = talloc(call, struct ctdb_deferred_fetch_queue);
659 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch queue structure\n"));
663 dfq->deferred_calls = NULL;
665 trbt_insertarray32_callback(ctdb_db->deferred_fetch, k[0], &k[0], insert_dfq_callback, dfq);
/* Destructor requeues all deferred calls when the fetch completes. */
667 talloc_set_destructor(dfq, deferred_fetch_queue_destructor);
669 /* if the fetch havent completed in 30 seconds, just tear it all down
670 and let it try again as the events are reissued */
671 tevent_add_timer(ctdb_db->ctdb->ev, dfq, timeval_current_ofs(30, 0),
/*
 * If a fetch for this key is already in flight, append this call to the
 * key's deferral queue (taking ownership of the packet) so it is
 * replayed when the first fetch completes.  Presumably returns 0 when
 * deferred and non-zero when no queue exists — the return statements are
 * not visible in this span; TODO confirm against upstream.
 */
678 /* check if this is a duplicate request to a fetch already in-flight
679 if it is, make this call deferred to be reprocessed later when
680 the in-flight fetch completes.
682 static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call_old *c)
685 struct ctdb_deferred_fetch_queue *dfq;
686 struct ctdb_deferred_fetch_call *dfc;
688 k = ctdb_key_to_idkey(c, key);
690 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
694 dfq = trbt_lookuparray32(ctdb_db->deferred_fetch, k[0], &k[0]);
703 dfc = talloc(dfq, struct ctdb_deferred_fetch_call);
705 DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n"));
709 dfc->w = talloc(dfc, struct ctdb_daemon_packet_wrap);
710 if (dfc->w == NULL) {
711 DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n"));
/* Take ownership of the request packet and remember how to find the
 * client again later. */
716 dfc->c = talloc_steal(dfc, c);
717 dfc->w->ctdb = ctdb_db->ctdb;
718 dfc->w->client_id = client->client_id;
720 DLIST_ADD_END(dfq->deferred_calls, dfc);
/*
 * Handle a CTDB_REQ_CALL from a local client: look up the database,
 * lock-and-fetch the record (possibly requeueing the packet), collapse
 * duplicate fetches, handle readonly-delegation revocation, then either
 * execute the call locally (we are dmaster) or send it to the remote
 * dmaster.  The completion path is daemon_call_from_client_callback().
 * NOTE(review): this function is missing many lines in this span
 * (declarations, returns, braces) — kept byte-identical rather than
 * reconstructed.
 */
727 this is called when the ctdb daemon received a ctdb request call
728 from a local client over the unix domain socket
730 static void daemon_request_call_from_client(struct ctdb_client *client,
731 struct ctdb_req_call_old *c)
733 struct ctdb_call_state *state;
734 struct ctdb_db_context *ctdb_db;
735 struct daemon_call_state *dstate;
736 struct ctdb_call *call;
737 struct ctdb_ltdb_header header;
740 struct ctdb_context *ctdb = client->ctdb;
741 struct ctdb_daemon_packet_wrap *w;
743 CTDB_INCREMENT_STAT(ctdb, total_calls);
744 CTDB_INCREMENT_STAT(ctdb, pending_calls);
746 ctdb_db = find_ctdb_db(client->ctdb, c->db_id);
748 DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. db_id==0x%08x",
750 CTDB_DECREMENT_STAT(ctdb, pending_calls);
754 if (ctdb_db->unhealthy_reason) {
756 * this is just a warning, as the tdb should be empty anyway,
757 * and only persistent databases can be unhealthy, which doesn't
758 * use this code patch
760 DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n",
761 ctdb_db->db_name, ctdb_db->unhealthy_reason));
765 key.dsize = c->keylen;
767 w = talloc(ctdb, struct ctdb_daemon_packet_wrap);
768 CTDB_NO_MEMORY_VOID(ctdb, w);
771 w->client_id = client->client_id;
/* Lock the record; on contention the packet is requeued through the
 * wrapper and we simply return here. */
773 ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header,
774 (struct ctdb_req_header *)c, &data,
775 daemon_incoming_packet_wrap, w, true);
777 /* will retry later */
778 CTDB_DECREMENT_STAT(ctdb, pending_calls);
785 DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n"));
786 CTDB_DECREMENT_STAT(ctdb, pending_calls);
791 /* check if this fetch request is a duplicate for a
792 request we already have in flight. If so defer it until
793 the first request completes.
795 if (ctdb->tunable.fetch_collapse == 1) {
796 if (requeue_duplicate_fetch(ctdb_db, client, key, c) == 0) {
797 ret = ctdb_ltdb_unlock(ctdb_db, key);
799 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
801 CTDB_DECREMENT_STAT(ctdb, pending_calls);
802 talloc_free(data.dptr);
807 /* Dont do READONLY if we don't have a tracking database */
808 if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) {
809 c->flags &= ~CTDB_WANT_READONLY;
812 if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
813 header.flags &= ~CTDB_REC_RO_FLAGS;
814 CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
815 CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
816 if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
817 ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
819 /* and clear out the tracking data */
820 if (tdb_delete(ctdb_db->rottdb, key) != 0) {
821 DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
825 /* if we are revoking, we must defer all other calls until the revoke
828 if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
829 talloc_free(data.dptr);
830 ret = ctdb_ltdb_unlock(ctdb_db, key);
832 if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
833 ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
835 CTDB_DECREMENT_STAT(ctdb, pending_calls);
/* We are dmaster, a non-readonly request arrived, and delegations
 * exist: start revoking all readonly copies before proceeding. */
839 if ((header.dmaster == ctdb->pnn)
840 && (!(c->flags & CTDB_WANT_READONLY))
841 && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
842 header.flags |= CTDB_REC_RO_REVOKING_READONLY;
843 if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
844 ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
846 ret = ctdb_ltdb_unlock(ctdb_db, key);
848 if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) {
849 ctdb_fatal(ctdb, "Failed to start record revoke");
851 talloc_free(data.dptr);
853 if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
854 ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
857 CTDB_DECREMENT_STAT(ctdb, pending_calls);
861 dstate = talloc(client, struct daemon_call_state);
862 if (dstate == NULL) {
863 ret = ctdb_ltdb_unlock(ctdb_db, key);
865 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
868 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n"));
869 CTDB_DECREMENT_STAT(ctdb, pending_calls);
872 dstate->start_time = timeval_current();
873 dstate->client = client;
874 dstate->reqid = c->hdr.reqid;
875 talloc_steal(dstate, data.dptr);
877 call = dstate->call = talloc_zero(dstate, struct ctdb_call);
879 ret = ctdb_ltdb_unlock(ctdb_db, key);
881 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
884 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n"));
885 CTDB_DECREMENT_STAT(ctdb, pending_calls);
886 CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 1", call_latency, dstate->start_time);
890 dstate->readonly_fetch = 0;
891 call->call_id = c->callid;
893 call->call_data.dptr = c->data + c->keylen;
894 call->call_data.dsize = c->calldatalen;
895 call->flags = c->flags;
897 if (c->flags & CTDB_WANT_READONLY) {
898 /* client wants readonly record, so translate this into a
899 fetch with header. remember what the client asked for
900 so we can remap the reply back to the proper format for
901 the client in the reply
903 dstate->client_callid = call->call_id;
904 call->call_id = CTDB_FETCH_WITH_HEADER_FUNC;
905 dstate->readonly_fetch = 1;
908 if (header.dmaster == ctdb->pnn) {
909 state = ctdb_call_local_send(ctdb_db, call, &header, &data);
911 state = ctdb_daemon_call_send_remote(ctdb_db, call, &header);
912 if (ctdb->tunable.fetch_collapse == 1) {
913 /* This request triggered a remote fetch-lock.
914 set up a deferral for this key so any additional
915 fetch-locks are deferred until the current one
918 setup_deferred_fetch_locks(ctdb_db, call);
922 ret = ctdb_ltdb_unlock(ctdb_db, key);
924 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
928 DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n"));
929 CTDB_DECREMENT_STAT(ctdb, pending_calls);
930 CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 2", call_latency, dstate->start_time);
/* Completion is reported asynchronously via the callback below. */
933 talloc_steal(state, dstate);
934 talloc_steal(client, state);
936 state->async.fn = daemon_call_from_client_callback;
937 state->async.private_data = dstate;
941 static void daemon_request_control_from_client(struct ctdb_client *client,
942 struct ctdb_req_control_old *c);
943 static void daemon_request_tunnel_from_client(struct ctdb_client *client,
944 struct ctdb_req_tunnel_old *c);
/*
 * Central dispatcher for packets from local clients: validate magic and
 * protocol version, then route by operation to the call/message/
 * control/tunnel handlers.  The packet is parented to a temporary
 * context; handlers that need to keep it steal it, everything else is
 * freed at the end.  NOTE(review): some case labels, breaks and braces
 * are missing from this span — kept byte-identical.
 */
946 /* data contains a packet from the client */
947 static void daemon_incoming_packet(void *p, struct ctdb_req_header *hdr)
949 struct ctdb_client *client = talloc_get_type(p, struct ctdb_client);
951 struct ctdb_context *ctdb = client->ctdb;
953 /* place the packet as a child of a tmp_ctx. We then use
954 talloc_free() below to free it. If any of the calls want
955 to keep it, then they will steal it somewhere else, and the
956 talloc_free() will be a no-op */
957 tmp_ctx = talloc_new(client);
958 talloc_steal(tmp_ctx, hdr);
960 if (hdr->ctdb_magic != CTDB_MAGIC) {
961 ctdb_set_error(client->ctdb, "Non CTDB packet rejected in daemon\n");
965 if (hdr->ctdb_version != CTDB_PROTOCOL) {
966 ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
970 switch (hdr->operation) {
972 CTDB_INCREMENT_STAT(ctdb, client.req_call);
973 daemon_request_call_from_client(client, (struct ctdb_req_call_old *)hdr);
976 case CTDB_REQ_MESSAGE:
977 CTDB_INCREMENT_STAT(ctdb, client.req_message);
978 daemon_request_message_from_client(client, (struct ctdb_req_message_old *)hdr);
981 case CTDB_REQ_CONTROL:
982 CTDB_INCREMENT_STAT(ctdb, client.req_control);
983 daemon_request_control_from_client(client, (struct ctdb_req_control_old *)hdr);
986 case CTDB_REQ_TUNNEL:
987 CTDB_INCREMENT_STAT(ctdb, client.req_tunnel);
988 daemon_request_tunnel_from_client(client, (struct ctdb_req_tunnel_old *)hdr);
992 DEBUG(DEBUG_CRIT,(__location__ " daemon: unrecognized operation %u\n",
997 talloc_free(tmp_ctx);
/*
 * Queue read callback for a client socket: a zero-length read means the
 * client hung up (free it); otherwise validate length, magic and
 * protocol version and hand the packet to daemon_incoming_packet(),
 * which takes ownership of 'data'.  NOTE(review): braces, returns and
 * error-path frees are missing from this span — kept byte-identical.
 */
1001 called when the daemon gets a incoming packet
1003 static void ctdb_daemon_read_cb(uint8_t *data, size_t cnt, void *args)
1005 struct ctdb_client *client = talloc_get_type(args, struct ctdb_client);
1006 struct ctdb_req_header *hdr;
/* cnt == 0: client disconnected — destroying the client cleans up. */
1009 talloc_free(client);
1013 CTDB_INCREMENT_STAT(client->ctdb, client_packets_recv);
1015 if (cnt < sizeof(*hdr)) {
1016 ctdb_set_error(client->ctdb, "Bad packet length %u in daemon\n",
1020 hdr = (struct ctdb_req_header *)data;
1022 if (hdr->ctdb_magic != CTDB_MAGIC) {
1023 ctdb_set_error(client->ctdb, "Non CTDB packet rejected\n");
1027 if (hdr->ctdb_version != CTDB_PROTOCOL) {
1028 ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
1032 DEBUG(DEBUG_DEBUG,(__location__ " client request %u of type %u length %u from "
1033 "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
1034 hdr->srcnode, hdr->destnode));
1036 /* it is the responsibility of the incoming packet function to free 'data' */
1037 daemon_incoming_packet(client, hdr);
1045 static int ctdb_clientpid_destructor(struct ctdb_client_pid_list *client_pid)
1047 if (client_pid->ctdb->client_pids != NULL) {
1048 DLIST_REMOVE(client_pid->ctdb->client_pids, client_pid);
/*
 * Allocate a new client id for 'client', guaranteeing the result is
 * neither 0 (reserved to mean "from another daemon") nor REQID_INVALID.
 * NOTE(review): the output parameter, returns and braces are missing
 * from this span — kept byte-identical.
 */
1054 static int get_new_client_id(struct reqid_context *idr,
1055 struct ctdb_client *client,
1060 client_id = reqid_new(idr, client);
1062 * Some places in the code (e.g. ctdb_control_db_attach(),
1063 * ctdb_control_db_detach()) assign a special meaning to
1064 * client_id 0. The assumption is that if client_id is 0 then
1065 * the control has come from another daemon. Therefore, we
1066 * should never return client_id == 0.
1068 if (client_id == 0) {
1070 * Don't leak ID 0. This is safe because the ID keeps
1071 * increasing. A test will be added to ensure that
1072 * this doesn't change.
1074 reqid_remove(idr, 0);
1076 client_id = reqid_new(idr, client);
1079 if (client_id == REQID_INVALID) {
1083 if (client_id == 0) {
1084 /* Every other ID must have been used and we can't use 0 */
1085 reqid_remove(idr, 0);
/*
 * fd event callback on the listening unix socket: accept a new client
 * connection, make the fd close-on-exec and non-blocking, allocate the
 * client structure, assign a client id, record the peer pid and attach
 * the queue that feeds ctdb_daemon_read_cb().  NOTE(review): several
 * lines (len/fd declarations, error returns, braces) are missing from
 * this span — kept byte-identical.  Also note both
 * smb_set_close_on_exec() and set_close_on_exec() appear on this fd —
 * looks redundant; verify against upstream before changing.
 */
1093 static void ctdb_accept_client(struct tevent_context *ev,
1094 struct tevent_fd *fde, uint16_t flags,
1097 struct sockaddr_un addr;
1100 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
1101 struct ctdb_client *client;
1102 struct ctdb_client_pid_list *client_pid;
1106 memset(&addr, 0, sizeof(addr));
1108 fd = accept(ctdb->daemon.sd, (struct sockaddr *)&addr, &len);
1112 smb_set_close_on_exec(fd);
1114 ret = set_blocking(fd, false);
1118 " failed to set socket non-blocking (%s)\n",
1124 set_close_on_exec(fd);
1126 DEBUG(DEBUG_DEBUG,(__location__ " Created SOCKET FD:%d to connected child\n", fd));
1128 client = talloc_zero(ctdb, struct ctdb_client);
1129 if (ctdb_get_peer_pid(fd, &peer_pid) == 0) {
1130 DEBUG(DEBUG_INFO,("Connected client with pid:%u\n", (unsigned)peer_pid));
1133 client->ctdb = ctdb;
1136 ret = get_new_client_id(ctdb->idr, client, &client->client_id);
1138 DBG_ERR("Unable to get client ID (%d)\n", ret);
1140 talloc_free(client);
1144 client->pid = peer_pid;
1146 client_pid = talloc(client, struct ctdb_client_pid_list);
1147 if (client_pid == NULL) {
1148 DEBUG(DEBUG_ERR,("Failed to allocate client pid structure\n"));
1150 talloc_free(client);
1153 client_pid->ctdb = ctdb;
1154 client_pid->pid = peer_pid;
1155 client_pid->client = client;
1157 DLIST_ADD(ctdb->client_pids, client_pid);
/* All traffic from this client now flows through the queue into
 * ctdb_daemon_read_cb(). */
1159 client->queue = ctdb_queue_setup(ctdb, client, fd, CTDB_DS_ALIGNMENT,
1160 ctdb_daemon_read_cb, client,
1161 "client-%u", client->pid);
1163 talloc_set_destructor(client, ctdb_client_destructor);
1164 talloc_set_destructor(client_pid, ctdb_clientpid_destructor);
1165 ctdb->num_clients++;
/*
 * Create, bind, secure (0700, owned by effective uid/gid) and listen on
 * the daemon's unix domain socket.  On any failure the socket is closed
 * and the descriptor reset to -1.  NOTE(review): some error returns and
 * the success return are missing from this span — kept byte-identical.
 * Also note the log string at the chmod/chown failure below embeds
 * ", ctdb->daemon.name" inside the format string itself — looks like a
 * misplaced quote; flagging rather than changing, since this span is
 * truncated.
 */
1171 create a unix domain socket and bind it
1172 return a file descriptor open on the socket
1174 static int ux_socket_bind(struct ctdb_context *ctdb)
1176 struct sockaddr_un addr;
1179 ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
1180 if (ctdb->daemon.sd == -1) {
1184 memset(&addr, 0, sizeof(addr));
1185 addr.sun_family = AF_UNIX;
1186 strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
/* Remove any stale socket left by a previous daemon instance. */
1188 if (! sock_clean(ctdb->daemon.name)) {
1192 set_close_on_exec(ctdb->daemon.sd);
1194 ret = set_blocking(ctdb->daemon.sd, false);
1198 " failed to set socket non-blocking (%s)\n",
1203 if (bind(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
1204 DEBUG(DEBUG_CRIT,("Unable to bind on ctdb socket '%s'\n", ctdb->daemon.name));
1208 if (chown(ctdb->daemon.name, geteuid(), getegid()) != 0 ||
1209 chmod(ctdb->daemon.name, 0700) != 0) {
1210 DEBUG(DEBUG_CRIT,("Unable to secure ctdb socket '%s', ctdb->daemon.name\n", ctdb->daemon.name));
1215 if (listen(ctdb->daemon.sd, 100) != 0) {
1216 DEBUG(DEBUG_CRIT,("Unable to listen on ctdb socket '%s'\n", ctdb->daemon.name));
1220 DEBUG(DEBUG_NOTICE, ("Listening to ctdb socket %s\n",
1221 ctdb->daemon.name));
/* error path: close and invalidate the socket */
1225 close(ctdb->daemon.sd);
1226 ctdb->daemon.sd = -1;
/*
 * Initialise this node's flags at startup: find our own entry by pnn,
 * clear DISCONNECTED, and apply configured DISABLED/STOPPED start
 * states.  NOTE(review): the loop break and closing braces are missing
 * from this span — kept byte-identical.
 */
1230 static void initialise_node_flags (struct ctdb_context *ctdb)
1234 /* Always found: PNN correctly set just before this is called */
1235 for (i = 0; i < ctdb->num_nodes; i++) {
1236 if (ctdb->pnn == ctdb->nodes[i]->pnn) {
1241 ctdb->nodes[i]->flags &= ~NODE_FLAGS_DISCONNECTED;
1243 /* do we start out in DISABLED mode? */
1244 if (ctdb->start_as_disabled != 0) {
1245 D_ERR("This node is configured to start in DISABLED state\n");
1246 ctdb->nodes[i]->flags |= NODE_FLAGS_DISABLED;
1248 /* do we start out in STOPPED mode? */
1249 if (ctdb->start_as_stopped != 0) {
1250 D_ERR("This node is configured to start in STOPPED state\n");
1251 ctdb->nodes[i]->flags |= NODE_FLAGS_STOPPED;
/* Completion callback for the "setup" event script.  A non-zero status
 * is fatal.  On success: run the "setup" notification script, start the
 * recovery daemon, start periodic events, then block until the first
 * recovery completes. */
1255 static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
1259 ctdb_die(ctdb, "Failed to run setup event");
1261 ctdb_run_notification_script(ctdb, "setup");
1263 /* Start the recovery daemon */
1264 if (ctdb_start_recoverd(ctdb) != 0) {
1265 DEBUG(DEBUG_ALERT,("Failed to start recovery daemon\n"));
1269 ctdb_start_periodic_events(ctdb);
1271 ctdb_wait_for_first_recovery(ctdb);
/* Timestamps taken at the tevent BEFORE_WAIT / AFTER_WAIT trace points;
 * used by ctdb_tevent_trace() below to detect stalls in the event loop. */
1274 static struct timeval tevent_before_wait_ts;
1275 static struct timeval tevent_after_wait_ts;
/* Seed both timestamps with the current time so the first trace
 * callback does not compare against the epoch. */
1277 static void ctdb_tevent_trace_init(void)
1281 now = timeval_current();
1283 tevent_before_wait_ts = now;
1284 tevent_after_wait_ts = now;
/* tevent trace callback: warn when either event handling (time between
 * leaving and re-entering the wait) or the wait itself exceeds 3
 * seconds.  Only runs in the main daemon process — forked children
 * inherit the callback but bail out on the PID check. */
1287 static void ctdb_tevent_trace(enum tevent_trace_point tp,
1290 struct timeval diff;
1292 struct ctdb_context *ctdb =
1293 talloc_get_type(private_data, struct ctdb_context);
/* Ignore in child processes */
1295 if (getpid() != ctdb->ctdbd_pid) {
1299 now = timeval_current();
1302 case TEVENT_TRACE_BEFORE_WAIT:
/* Time since we last came out of the wait = time spent handling events */
1303 diff = timeval_until(&tevent_after_wait_ts, &now);
1304 if (diff.tv_sec > 3) {
1306 ("Handling event took %ld seconds!\n",
1307 (long)diff.tv_sec));
1309 tevent_before_wait_ts = now;
1312 case TEVENT_TRACE_AFTER_WAIT:
/* Time since we last entered the wait = time with no events at all */
1313 diff = timeval_until(&tevent_before_wait_ts, &now);
1314 if (diff.tv_sec > 3) {
1316 ("No event for %ld seconds!\n",
1317 (long)diff.tv_sec));
1319 tevent_after_wait_ts = now;
1323 /* Do nothing for future tevent trace points */ ;
/* atexit() handler: freeing the pidfile context removes the PID file */
1327 static void ctdb_remove_pidfile(void)
1329 TALLOC_FREE(ctdbd_pidfile_ctx);
/* Create the daemon PID file (if one was configured via ctdbd_pidfile)
 * and register its removal at process exit.  Failure to create the
 * file is fatal (error path on lines elided from this view). */
1332 static void ctdb_create_pidfile(TALLOC_CTX *mem_ctx)
1334 if (ctdbd_pidfile != NULL) {
1335 int ret = pidfile_context_create(mem_ctx, ctdbd_pidfile,
1336 &ctdbd_pidfile_ctx);
1339 ("Failed to create PID file %s\n",
1344 DEBUG(DEBUG_NOTICE, ("Created PID file %s\n", ctdbd_pidfile));
1345 atexit(ctdb_remove_pidfile);
/* Build the initial VNN map: one slot per non-deleted node, each slot
 * holding that node's index in ctdb->nodes[].  Generation starts as
 * INVALID_GENERATION until the first recovery assigns a real one. */
1349 static void ctdb_initialise_vnn_map(struct ctdb_context *ctdb)
1351 unsigned int i, j, count;
1353 /* initialize the vnn mapping table, skipping any deleted nodes */
1354 ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
1355 CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map);
/* First pass: count the nodes that are not marked DELETED */
1358 for (i = 0; i < ctdb->num_nodes; i++) {
1359 if ((ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) == 0) {
1364 ctdb->vnn_map->generation = INVALID_GENERATION;
1365 ctdb->vnn_map->size = count;
1366 ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, ctdb->vnn_map->size);
1367 CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map->map);
/* NOTE(review): this loop bounds i by vnn_map->size (the non-deleted
 * count) while indexing ctdb->nodes[i]; if deleted nodes precede live
 * ones this looks like it could skip trailing nodes — verify against
 * the full source before changing, as lines are elided here. */
1369 for(i=0, j=0; i < ctdb->vnn_map->size; i++) {
1370 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1373 ctdb->vnn_map->map[j] = i;
/* Determine this node's PNN from its node address.  Both a missing
 * address and an address not found in the nodes list are fatal
 * (error paths on lines elided from this view). */
1378 static void ctdb_set_my_pnn(struct ctdb_context *ctdb)
1380 if (ctdb->address == NULL) {
1382 "Can not determine PNN - node address is not set\n");
1385 ctdb->pnn = ctdb_ip_to_pnn(ctdb, ctdb->address);
1386 if (ctdb->pnn == CTDB_UNKNOWN_PNN) {
1388 "Can not determine PNN - unknown node address\n");
1391 D_NOTICE("PNN is %u\n", ctdb->pnn);
1395 start the protocol going as a daemon
1397 int ctdb_start_daemon(struct ctdb_context *ctdb, bool do_fork)
1400 struct tevent_fd *fde;
/* Daemonize (optionally), ignore signals handled elsewhere, record our
 * PID and write the PID file before doing anything that can fail */
1402 become_daemon(do_fork, !do_fork, false);
1404 ignore_signal(SIGPIPE);
1405 ignore_signal(SIGUSR1);
1407 ctdb->ctdbd_pid = getpid();
1408 DEBUG(DEBUG_ERR, ("Starting CTDBD (Version %s) as PID: %u\n",
1409 SAMBA_VERSION_STRING, ctdb->ctdbd_pid));
1410 ctdb_create_pidfile(ctdb);
1412 /* create a unix domain stream socket to listen to */
1413 res = ux_socket_bind(ctdb);
1415 DEBUG(DEBUG_ALERT,("Cannot continue. Exiting!\n"));
1419 /* Make sure we log something when the daemon terminates.
1420 * This must be the first exit handler to run (so the last to
1423 __ctdbd_pid = getpid();
1424 atexit(print_exit_message);
1426 if (ctdb->do_setsched) {
1427 /* try to set us up as realtime */
1428 if (!set_scheduler()) {
1431 DEBUG(DEBUG_NOTICE, ("Set real-time scheduler priority\n"));
/* Event context is parented to NULL, not ctdb, so its teardown order
 * is independent of the ctdb context */
1434 ctdb->ev = tevent_context_init(NULL);
1435 if (ctdb->ev == NULL) {
1436 DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
1439 tevent_loop_allow_nesting(ctdb->ev);
1440 ctdb_tevent_trace_init();
1441 tevent_set_trace_callback(ctdb->ev, ctdb_tevent_trace, ctdb);
1443 /* set up a handler to pick up sigchld */
1444 if (ctdb_init_sigchld(ctdb) == NULL) {
1445 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
1450 ctdb_set_child_logging(ctdb);
/* (Re)create the srvid contexts used for message and tunnel dispatch */
1453 TALLOC_FREE(ctdb->srv);
1454 if (srvid_init(ctdb, &ctdb->srv) != 0) {
1455 DEBUG(DEBUG_CRIT,("Failed to setup message srvid context\n"));
1459 TALLOC_FREE(ctdb->tunnels);
1460 if (srvid_init(ctdb, &ctdb->tunnels) != 0) {
1461 DEBUG(DEBUG_ERR, ("Failed to setup tunnels context\n"));
1465 /* initialize statistics collection */
1466 ctdb_statistics_init(ctdb);
1468 /* force initial recovery for election */
1469 ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
1471 if (ctdb_start_eventd(ctdb) != 0) {
1472 DEBUG(DEBUG_ERR, ("Failed to start event daemon\n"));
/* Run the "init" event script; failure is fatal */
1476 ctdb_set_runstate(ctdb, CTDB_RUNSTATE_INIT);
1477 ret = ctdb_event_script(ctdb, CTDB_EVENT_INIT);
1479 ctdb_die(ctdb, "Failed to run init event\n");
1481 ctdb_run_notification_script(ctdb, "init");
/* Select and initialise the inter-node transport (tcp, or ib when
 * built with infiniband support) */
1483 if (strcmp(ctdb->transport, "tcp") == 0) {
1484 ret = ctdb_tcp_init(ctdb);
1486 #ifdef USE_INFINIBAND
1487 if (strcmp(ctdb->transport, "ib") == 0) {
1488 ret = ctdb_ibw_init(ctdb);
1492 DEBUG(DEBUG_ERR,("Failed to initialise transport '%s'\n", ctdb->transport));
1496 if (ctdb->methods == NULL) {
1497 DEBUG(DEBUG_ALERT,(__location__ " Can not initialize transport. ctdb->methods is NULL\n"));
1498 ctdb_fatal(ctdb, "transport is unavailable. can not initialize.");
1501 /* Initialise the transport. This sets the node address if it
1502 * was not set via the command-line. */
1503 if (ctdb->methods->initialise(ctdb) != 0) {
1504 ctdb_fatal(ctdb, "transport failed to initialise");
1507 ctdb_set_my_pnn(ctdb);
1509 initialise_node_flags(ctdb);
1511 ret = ctdb_set_public_addresses(ctdb, true);
1513 D_ERR("Unable to setup public IP addresses\n");
1517 ctdb_initialise_vnn_map(ctdb);
1519 /* attach to existing databases */
1520 if (ctdb_attach_databases(ctdb) != 0) {
1521 ctdb_fatal(ctdb, "Failed to attach to databases\n");
1524 /* start frozen, then let the first election sort things out */
1525 if (!ctdb_blocking_freeze(ctdb)) {
1526 ctdb_fatal(ctdb, "Failed to get initial freeze\n");
1529 /* now start accepting clients, only can do this once frozen */
1530 fde = tevent_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, TEVENT_FD_READ,
1531 ctdb_accept_client, ctdb);
1533 ctdb_fatal(ctdb, "Failed to add daemon socket to event loop");
1535 tevent_fd_set_auto_close(fde);
1537 /* Start the transport */
1538 if (ctdb->methods->start(ctdb) != 0) {
1539 DEBUG(DEBUG_ALERT,("transport failed to start!\n"));
1540 ctdb_fatal(ctdb, "transport failed to start");
1543 /* Recovery daemon and timed events are started from the
1544 * callback, only after the setup event completes
1547 ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SETUP);
1548 ret = ctdb_event_script_callback(ctdb,
1550 ctdb_setup_event_callback,
1556 DEBUG(DEBUG_CRIT,("Failed to set up 'setup' event\n"));
1560 lockdown_memory(ctdb->valgrinding);
1562 /* go into a wait loop to allow other nodes to complete */
1563 tevent_loop_wait(ctdb->ev);
/* tevent_loop_wait() only returns when no events remain — abnormal */
1565 DEBUG(DEBUG_CRIT,("event_loop_wait() returned. this should not happen\n"));
1570 allocate a packet for use in daemon<->daemon communication
/* Allocate a transport packet of max(length, slength) bytes, rounded up
 * to CTDB_DS_ALIGNMENT, via the transport's allocator.  The first
 * slength bytes (the fixed header/struct portion) are zeroed; the
 * common header fields are then filled in.  Returns NULL when the
 * transport is down or allocation fails. */
1572 struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb,
1573 TALLOC_CTX *mem_ctx,
1574 enum ctdb_operation operation,
1575 size_t length, size_t slength,
1579 struct ctdb_req_header *hdr;
1581 length = MAX(length, slength);
/* Round the allocation up to the transport's alignment requirement */
1582 size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);
1584 if (ctdb->methods == NULL) {
1585 DEBUG(DEBUG_INFO,(__location__ " Unable to allocate transport packet for operation %u of length %u. Transport is DOWN.\n",
1586 operation, (unsigned)length));
1590 hdr = (struct ctdb_req_header *)ctdb->methods->allocate_pkt(mem_ctx, size);
1592 DEBUG(DEBUG_ERR,("Unable to allocate transport packet for operation %u of length %u\n",
1593 operation, (unsigned)length));
/* Name the talloc chunk after the caller-supplied type for debugging */
1596 talloc_set_name_const(hdr, type);
1597 memset(hdr, 0, slength);
1598 hdr->length = length;
1599 hdr->operation = operation;
1600 hdr->ctdb_magic = CTDB_MAGIC;
1601 hdr->ctdb_version = CTDB_PROTOCOL;
1602 hdr->generation = ctdb->vnn_map->generation;
1603 hdr->srcnode = ctdb->pnn;
/* Per-request state for a control forwarded on behalf of a local
 * client; linked into the destination node's pending_controls list so
 * it can be cancelled if that node disconnects. */
1608 struct daemon_control_state {
1609 struct daemon_control_state *next, *prev;
1610 struct ctdb_client *client;
1611 struct ctdb_req_control_old *c;
1613 struct ctdb_node *node;
1617 callback when a control reply comes in
/* Package the control's status, payload and optional error string into
 * a CTDB_REPLY_CONTROL packet and queue it back to the originating
 * local client. */
1619 static void daemon_control_callback(struct ctdb_context *ctdb,
1620 int32_t status, TDB_DATA data,
1621 const char *errormsg,
1624 struct daemon_control_state *state = talloc_get_type(private_data,
1625 struct daemon_control_state);
1626 struct ctdb_client *client = state->client;
1627 struct ctdb_reply_control_old *r;
1631 /* construct a message to send to the client containing the data */
1632 len = offsetof(struct ctdb_reply_control_old, data) + data.dsize;
/* Error text (if any) is appended after the data payload */
1634 len += strlen(errormsg);
1636 r = ctdbd_allocate_pkt(ctdb, state, CTDB_REPLY_CONTROL, len,
1637 struct ctdb_reply_control_old);
1638 CTDB_NO_MEMORY_VOID(ctdb, r);
1640 r->hdr.reqid = state->reqid;
1642 r->datalen = data.dsize;
1644 memcpy(&r->data[0], data.dptr, data.dsize);
1646 r->errorlen = strlen(errormsg);
1647 memcpy(&r->data[r->datalen], errormsg, r->errorlen);
1650 ret = daemon_queue_send(client, &r->hdr);
1657 fail all pending controls to a disconnected node
/* Drain the node's pending_controls list, answering each outstanding
 * control with an error status and "node is disconnected". */
1659 void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node)
1661 struct daemon_control_state *state;
1662 while ((state = node->pending_controls)) {
1663 DLIST_REMOVE(node->pending_controls, state);
1664 daemon_control_callback(ctdb, (uint32_t)-1, tdb_null,
1665 "node is disconnected", state);
1670 destroy a daemon_control_state
/* talloc destructor: unlink the state from its node's pending list */
1672 static int daemon_control_destructor(struct daemon_control_state *state)
1675 DLIST_REMOVE(state->node->pending_controls, state);
1681 this is called when the ctdb daemon received a ctdb request control
1682 from a local client over the unix domain socket
/* Resolve CTDB_CURRENT_NODE to our PNN, record per-request state, then
 * forward the control to the destination node.  The reply (if the
 * NOREPLY flag is not set) comes back via daemon_control_callback. */
1684 static void daemon_request_control_from_client(struct ctdb_client *client,
1685 struct ctdb_req_control_old *c)
1689 struct daemon_control_state *state;
1690 TALLOC_CTX *tmp_ctx = talloc_new(client);
1692 if (c->hdr.destnode == CTDB_CURRENT_NODE) {
1693 c->hdr.destnode = client->ctdb->pnn;
1696 state = talloc(client, struct daemon_control_state);
1697 CTDB_NO_MEMORY_VOID(client->ctdb, state);
1699 state->client = client;
1700 state->c = talloc_steal(state, c);
1701 state->reqid = c->hdr.reqid;
/* Track the request on the target node so it can be cancelled if that
 * node disconnects before replying */
1702 if (ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
1703 state->node = client->ctdb->nodes[c->hdr.destnode];
1704 DLIST_ADD(state->node->pending_controls, state);
1709 talloc_set_destructor(state, daemon_control_destructor);
/* Fire-and-forget: reparent state to tmp_ctx so it is freed on return */
1711 if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
1712 talloc_steal(tmp_ctx, state);
1715 data.dptr = &c->data[0];
1716 data.dsize = c->datalen;
1717 res = ctdb_daemon_send_control(client->ctdb, c->hdr.destnode,
1718 c->srvid, c->opcode, client->client_id,
1720 data, daemon_control_callback,
1723 DEBUG(DEBUG_ERR,(__location__ " Failed to send control to remote node %u\n",
1727 talloc_free(tmp_ctx);
/* Handle a tunnel request arriving from a local client: validate the
 * destination PNN, require that the tunnel id is registered locally,
 * then forward the payload to the destination node.  Error branches
 * drop the packet (bodies on lines elided from this view). */
1730 static void daemon_request_tunnel_from_client(struct ctdb_client *client,
1731 struct ctdb_req_tunnel_old *c)
1736 if (! ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
1737 DEBUG(DEBUG_ERR, ("Invalid destination 0x%x\n",
/* Only forward for tunnel ids this daemon knows about */
1742 ret = srvid_exists(client->ctdb->tunnels, c->tunnel_id, NULL);
1745 ("tunnel id 0x%"PRIx64" not registered, dropping pkt\n",
1751 .dsize = c->datalen,
1752 .dptr = &c->data[0],
1755 ret = ctdb_daemon_send_tunnel(client->ctdb, c->hdr.destnode,
1756 c->tunnel_id, c->flags, data);
/* BUG FIX: log message typo — was "Failed to set tunnel to remote
 * note %u" */
1758 DEBUG(DEBUG_ERR, ("Failed to send tunnel to remote node %u\n",
1764 register a call function
/* Register a call handler (fn, id) on the database identified by db_id.
 * Fails if the database is not attached (error body elided here). */
1766 int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id,
1767 ctdb_fn_t fn, int id)
1769 struct ctdb_registered_call *call;
1770 struct ctdb_db_context *ctdb_db;
1772 ctdb_db = find_ctdb_db(ctdb, db_id);
1773 if (ctdb_db == NULL) {
1777 call = talloc(ctdb_db, struct ctdb_registered_call);
1781 DLIST_ADD(ctdb_db->calls, call);
1788 this local messaging handler is ugly, but is needed to prevent
1789 recursion in ctdb_send_message() when the destination node is the
1790 same as the source node
/* Deferred-delivery record for a message whose destination is this
 * node; dispatched from a zero-timeout tevent timer. */
1792 struct ctdb_local_message {
1793 struct ctdb_context *ctdb;
/* Timer callback: deliver the stored message to local srvid handlers */
1798 static void ctdb_local_message_trigger(struct tevent_context *ev,
1799 struct tevent_timer *te,
1800 struct timeval t, void *private_data)
1802 struct ctdb_local_message *m = talloc_get_type(
1803 private_data, struct ctdb_local_message);
1805 srvid_dispatch(m->ctdb->srv, m->srvid, CTDB_SRVID_ALL, m->data);
/* Queue a message addressed to ourselves for delivery via a
 * zero-timeout timer event, copying the payload so the caller's buffer
 * need not outlive the call.  The indirection avoids recursing back
 * into ctdb_send_message(). */
1809 static int ctdb_local_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
1811 struct ctdb_local_message *m;
1812 m = talloc(ctdb, struct ctdb_local_message);
1813 CTDB_NO_MEMORY(ctdb, m);
/* Take a private copy of the payload */
1818 m->data.dptr = talloc_memdup(m, m->data.dptr, m->data.dsize);
1819 if (m->data.dptr == NULL) {
1824 /* this needs to be done as an event to prevent recursion */
1825 tevent_add_timer(ctdb->ev, m, timeval_zero(),
1826 ctdb_local_message_trigger, m);
/* Send a message (srvid + data) to node pnn.  Messages to ourselves
 * short-circuit through ctdb_local_message(); otherwise a
 * CTDB_REQ_MESSAGE packet is built and queued on the transport. */
1833 int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn,
1834 uint64_t srvid, TDB_DATA data)
1836 struct ctdb_req_message_old *r;
1839 if (ctdb->methods == NULL) {
1840 DEBUG(DEBUG_INFO,(__location__ " Failed to send message. Transport is DOWN\n"));
1844 /* see if this is a message to ourselves */
1845 if (pnn == ctdb->pnn) {
1846 return ctdb_local_message(ctdb, srvid, data);
1849 len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
1850 r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_MESSAGE, len,
1851 struct ctdb_req_message_old);
1852 CTDB_NO_MEMORY(ctdb, r);
1854 r->hdr.destnode = pnn;
1856 r->datalen = data.dsize;
1857 memcpy(&r->data[0], data.dptr, data.dsize);
1859 ctdb_queue_packet(ctdb, &r->hdr);
/* One registered "notify on client death" entry; the list hangs off
 * the owning client and fires from its talloc destructor. */
1867 struct ctdb_client_notify_list {
1868 struct ctdb_client_notify_list *next, *prev;
1869 struct ctdb_context *ctdb;
/* talloc destructor: when a client with a registered notification dies,
 * broadcast its stored message to all connected nodes. */
1875 static int ctdb_client_notify_destructor(struct ctdb_client_notify_list *nl)
1879 DEBUG(DEBUG_ERR,("Sending client notify message for srvid:%llu\n", (unsigned long long)nl->srvid));
1881 ret = ctdb_daemon_send_message(nl->ctdb, CTDB_BROADCAST_CONNECTED, (unsigned long long)nl->srvid, nl->data);
1883 DEBUG(DEBUG_ERR,("Failed to send client notify message\n"));
/* Control handler: register a message to be broadcast when this client
 * disconnects.  Validates the input blob size, requires a local client
 * (cannot be sent to a remote node), and rejects duplicate srvids.
 * Error-branch bodies are on lines elided from this view. */
1889 int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
1891 struct ctdb_notify_data_old *notify = (struct ctdb_notify_data_old *)indata.dptr;
1892 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
1893 struct ctdb_client_notify_list *nl;
1895 DEBUG(DEBUG_INFO,("Register srvid %llu for client %d\n", (unsigned long long)notify->srvid, client_id));
/* Sanity-check the blob before trusting notify->len */
1897 if (indata.dsize < offsetof(struct ctdb_notify_data_old, notify_data)) {
1898 DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
1902 if (indata.dsize != (notify->len + offsetof(struct ctdb_notify_data_old, notify_data))) {
1903 DEBUG(DEBUG_ERR,(__location__ " Wrong amount of data in control. Got %d, expected %d\n", (int)indata.dsize, (int)(notify->len + offsetof(struct ctdb_notify_data_old, notify_data))));
1908 if (client == NULL) {
1909 DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
/* Reject duplicate registrations for the same srvid */
1913 for(nl=client->notify; nl; nl=nl->next) {
1914 if (nl->srvid == notify->srvid) {
1919 DEBUG(DEBUG_ERR,(__location__ " Notification for srvid:%llu already exists for this client\n", (unsigned long long)notify->srvid));
/* Entry is parented to the client so client death triggers the
 * destructor, which sends the stored message */
1923 nl = talloc(client, struct ctdb_client_notify_list);
1924 CTDB_NO_MEMORY(ctdb, nl);
1926 nl->srvid = notify->srvid;
1927 nl->data.dsize = notify->len;
1928 nl->data.dptr = talloc_memdup(nl, notify->notify_data,
1930 CTDB_NO_MEMORY(ctdb, nl->data.dptr);
1932 DLIST_ADD(client->notify, nl);
1933 talloc_set_destructor(nl, ctdb_client_notify_destructor);
/* Control handler: remove a previously registered death notification.
 * The destructor is cleared before unlinking so deregistration does
 * NOT send the message.  Error bodies elided from this view. */
1938 int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
1940 uint64_t srvid = *(uint64_t *)indata.dptr;
1941 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
1942 struct ctdb_client_notify_list *nl;
1944 DEBUG(DEBUG_INFO,("Deregister srvid %llu for client %d\n", (unsigned long long)srvid, client_id));
1946 if (client == NULL) {
1947 DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
1951 for(nl=client->notify; nl; nl=nl->next) {
1952 if (nl->srvid == srvid) {
1957 DEBUG(DEBUG_ERR,(__location__ " No notification for srvid:%llu found for this client\n", (unsigned long long)srvid));
/* Clear the destructor first so freeing nl does not broadcast */
1961 DLIST_REMOVE(client->notify, nl);
1962 talloc_set_destructor(nl, NULL);
/* Linear scan of the client-pid list; returns the client owning the
 * given pid, or NULL (fall-through on lines elided from this view). */
1968 struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid)
1970 struct ctdb_client_pid_list *client_pid;
1972 for (client_pid = ctdb->client_pids; client_pid; client_pid=client_pid->next) {
1973 if (client_pid->pid == pid) {
1974 return client_pid->client;
1981 /* This control is used by samba when probing if a process (of a samba daemon)
1983 Samba does this when it needs/wants to check if a subrecord in one of the
1984 databases is still valid, or if it is stale and can be removed.
1985 If the node is in unhealthy or stopped state we just kill off the samba
1986 process holding this sub-record and return to the calling samba that
1987 the process does not exist.
1988 This allows us to forcefully recall subrecords registered by samba processes
1989 on banned and stopped nodes.
/* Control handler: does a client process with this pid exist?  On an
 * inactive (banned/stopped) node the client is forcibly freed instead,
 * so the caller sees the process as gone.  Returns kill(pid, 0), i.e.
 * 0 when the process exists. */
1991 int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid)
1993 struct ctdb_client *client;
1995 client = ctdb_find_client_by_pid(ctdb, pid);
1996 if (client == NULL) {
/* NODE_FLAGS_INACTIVE covers banned and stopped states */
2000 if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) {
2002 ("Killing client with pid:%d on banned/stopped node\n",
2004 talloc_free(client);
/* Probe only — signal 0 checks existence without killing */
2008 return kill(pid, 0);
/* Control handler: check whether the client with the given pid has the
 * given srvid registered.  Input blob layout: pid_t followed by a
 * uint64_t srvid. */
2011 int32_t ctdb_control_check_pid_srvid(struct ctdb_context *ctdb,
2014 struct ctdb_client_pid_list *client_pid;
2019 pid = *(pid_t *)indata.dptr;
2020 srvid = *(uint64_t *)(indata.dptr + sizeof(pid_t));
2022 for (client_pid = ctdb->client_pids;
2024 client_pid = client_pid->next) {
2025 if (client_pid->pid == pid) {
2026 ret = srvid_exists(ctdb->srv, srvid,
2027 client_pid->client);
/* Control handler: re-read the nodes file from disk and return the
 * resulting node map blob to the caller; outdata takes ownership via
 * talloc sizing. */
2037 int ctdb_control_getnodesfile(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
2039 struct ctdb_node_map_old *node_map = NULL;
2041 CHECK_CONTROL_DATA_SIZE(0);
2043 node_map = ctdb_read_nodes_file(ctdb, ctdb->nodes_file);
2044 if (node_map == NULL) {
2045 DEBUG(DEBUG_ERR, ("Failed to read nodes file\n"));
2049 outdata->dptr = (unsigned char *)node_map;
/* dsize from the talloc chunk size, so it matches the allocation */
2050 outdata->dsize = talloc_get_size(outdata->dptr);
/* Orderly daemon shutdown: guard against re-entry via the runstate,
 * then stop subsystems in dependency order (recoverd, keepalive,
 * monitoring), release public IPs, run the "shutdown" event, stop the
 * event daemon and finally shut the transport down. */
2055 void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code)
2057 if (ctdb->runstate == CTDB_RUNSTATE_SHUTDOWN) {
2058 DEBUG(DEBUG_NOTICE,("Already shutting down so will not proceed.\n"));
2062 DEBUG(DEBUG_ERR,("Shutdown sequence commencing.\n"));
2063 ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SHUTDOWN);
2064 ctdb_stop_recoverd(ctdb);
2065 ctdb_stop_keepalive(ctdb);
2066 ctdb_stop_monitoring(ctdb);
2067 ctdb_release_all_ips(ctdb);
2068 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
2069 ctdb_stop_eventd(ctdb);
/* Transport may already be gone if it failed to initialise */
2070 if (ctdb->methods != NULL && ctdb->methods->shutdown != NULL) {
2071 ctdb->methods->shutdown(ctdb);
2074 DEBUG(DEBUG_ERR,("Shutdown sequence complete, exiting.\n"));
2078 /* When forking the main daemon and the child process needs to connect
2079 * back to the daemon as a client process, this function can be used
2080 * to change the ctdb context from daemon into client mode. The child
2081 * process must be created using ctdb_fork() and not fork() -
2082 * ctdb_fork() does some necessary housekeeping.
2084 int switch_from_server_to_client(struct ctdb_context *ctdb)
2088 /* get a new event context */
2089 ctdb->ev = tevent_context_init(ctdb);
2090 if (ctdb->ev == NULL) {
2091 DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
2094 tevent_loop_allow_nesting(ctdb->ev);
2096 /* Connect to main CTDB daemon */
2097 ret = ctdb_socket_connect(ctdb);
2099 DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb client\n"));
2103 ctdb->can_send_controls = true;