4 Copyright (C) Rusty Russell 2010
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
25 #include <sys/socket.h>
27 #include "libctdb_private.h"
29 #include "local_tdb.h"
31 #include <dlinklist.h>
32 #include <ctdb_protocol.h>
34 /* Remove type-safety macros. */
35 #undef ctdb_attachdb_send
36 #undef ctdb_readrecordlock_async
/* Members of a record lock (used elsewhere in this file as lock->ctdb_db,
 * lock->held_magic, lock->hdr and lock->callback). */
/* List linkage; presumably what DLIST_ADD/DLIST_REMOVE on ctdb->locks use. */
40 struct ctdb_lock *next, *prev;
/* Database the lock belongs to; compared against the caller's db in
 * ctdb_release_lock() and ctdb_writerecord(). */
42 struct ctdb_db *ctdb_db;
45 /* This will always be set by the time user sees this. */
/* Equals lock_magic(lock) while the lock is held, 0 otherwise. */
46 unsigned long held_magic;
/* Record header handed to ctdb_local_store() by ctdb_writerecord(). */
47 struct ctdb_ltdb_header *hdr;
49 /* For convenience, we stash original callback here. */
50 ctdb_rrl_callback_t callback;
/* Members of the per-database state (used elsewhere in this file as
 * db->ctdb, db->tdb and db->callback). */
/* Connection this database was attached through. */
54 struct ctdb_connection *ctdb;
/* Local tdb handle; assigned from tdb_open_ex() in ctdb_attachdb_recv(). */
58 struct tdb_context *tdb;
/* User callback for the attach; invoked from attachdb_done() and
 * attachdb_getdbpath_done(). */
60 ctdb_callback_t callback;
64 static void remove_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
66 DLIST_REMOVE(ctdb->locks, lock);
69 /* FIXME: for thread safety, need tid info too. */
70 static bool holding_lock(struct ctdb_connection *ctdb)
72 /* For the moment, you can't ever hold more than 1 lock. */
73 return (ctdb->locks != NULL);
76 static void add_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
78 DLIST_ADD(ctdb->locks, lock);
81 static void cleanup_locks(struct ctdb_connection *ctdb, struct ctdb_db *db)
83 struct ctdb_lock *i, *next;
85 for (i = ctdb->locks; i; i = next) {
86 /* Grab next pointer, as release_lock will free i */
88 if (i->ctdb_db == db) {
89 ctdb_release_lock(db, i);
/* NOTE(review): only the signature is visible here.  Going by the name and
 * its use on ctdb_connect()'s cleanup path, this presumably closes fd
 * without disturbing errno -- confirm against the body. */
94 /* FIXME: Could be in shared util code with rest of ctdb */
95 static void close_noerr(int fd)
/* NOTE(review): only the signature is visible here.  Going by the name,
 * this presumably frees p without disturbing errno -- confirm against the
 * body. */
102 /* FIXME: Could be in shared util code with rest of ctdb */
103 static void free_noerr(void *p)
/* Put @fd into non-blocking mode by adding O_NONBLOCK to its file status
 * flags.  Best effort: failures are not reported to the caller.
 * FIXME: Could be in shared util code with rest of ctdb */
static void set_nonblocking(int fd)
{
	int v = fcntl(fd, F_GETFL, 0);

	/* F_GETFL failed (e.g. bad descriptor): never feed -1 back as flags. */
	if (v == -1) {
		return;
	}
	fcntl(fd, F_SETFL, v | O_NONBLOCK);
}
/* Mark @fd close-on-exec by adding FD_CLOEXEC to its descriptor flags.
 * Best effort: failures are not reported to the caller.
 * FIXME: Could be in shared util code with rest of ctdb */
static void set_close_on_exec(int fd)
{
	int v = fcntl(fd, F_GETFD, 0);

	/* F_GETFD failed (e.g. bad descriptor): never feed -1 back as flags. */
	if (v == -1) {
		return;
	}
	fcntl(fd, F_SETFD, v | FD_CLOEXEC);
}
/* Completion callback for the pnn request queued by ctdb_connect().
 * Receives the reply into ctdb->pnn, logs at LOG_CRIT when that fails, and
 * releases the request with ctdb_request_free(). */
126 static void set_pnn(struct ctdb_connection *ctdb,
127 struct ctdb_request *req,
130 if (!ctdb_getpnn_recv(ctdb, req, &ctdb->pnn)) {
131 DEBUG(ctdb, LOG_CRIT,
132 "ctdb_connect(async): failed to get pnn");
135 ctdb_request_free(ctdb, req);
/* Create a connection object and connect it to the ctdbd unix-domain socket
 * at addr.
 *
 * log_fn/log_priv form the logging hook: log_priv is stored in the
 * connection, and log_fn is called directly if allocating the connection
 * fails.  The socket is made non-blocking and close-on-exec before
 * connect() is called, and a request for our own pnn is queued right away
 * with set_pnn() as its callback.  The descriptor is closed with
 * close_noerr() on what appears to be the failure path. */
138 struct ctdb_connection *ctdb_connect(const char *addr,
139 ctdb_log_fn_t log_fn, void *log_priv)
141 struct ctdb_connection *ctdb;
142 struct sockaddr_un sun;
144 ctdb = malloc(sizeof(*ctdb));
146 /* With no format string, we hope it doesn't use ap! */
148 memset(&ap, 0, sizeof(ap));
150 log_fn(log_priv, LOG_ERR, "ctdb_connect: no memory", ap);
156 ctdb->message_handlers = NULL;
158 ctdb->broken = false;
160 ctdb->log_priv = log_priv;
163 memset(&sun, 0, sizeof(sun));
164 sun.sun_family = AF_UNIX;
/* NOTE(review): strncpy bounded by the full sizeof() leaves sun_path without
 * a terminating NUL when addr is that long or longer; consider bounding to
 * sizeof(sun.sun_path) - 1 so the zero from the memset above survives. */
167 strncpy(sun.sun_path, addr, sizeof(sun.sun_path));
168 ctdb->fd = socket(AF_UNIX, SOCK_STREAM, 0);
172 set_nonblocking(ctdb->fd);
173 set_close_on_exec(ctdb->fd);
175 if (connect(ctdb->fd, (struct sockaddr *)&sun, sizeof(sun)) == -1)
178 /* Immediately queue a request to get our pnn. */
179 if (!ctdb_getpnn_send(ctdb, CTDB_CURRENT_NODE, set_pnn, NULL))
185 close_noerr(ctdb->fd);
/* Tear down a connection.  Every request still on the outq and on the doneq
 * is unlinked and freed, the pending input element (ctdb->in) is released
 * and the message handlers are removed. */
192 void ctdb_disconnect(struct ctdb_connection *ctdb)
194 struct ctdb_request *i;
196 DEBUG(ctdb, LOG_DEBUG, "ctdb_disconnect");
/* Unlink before freeing: ctdb_request_free() treats a still-linked request
 * as incomplete. */
198 while ((i = ctdb->outq) != NULL) {
199 DLIST_REMOVE(ctdb->outq, i);
200 ctdb_request_free(ctdb, i);
203 while ((i = ctdb->doneq) != NULL) {
204 DLIST_REMOVE(ctdb->doneq, i);
205 ctdb_request_free(ctdb, i);
209 free_io_elem(ctdb->in);
211 remove_message_handlers(ctdb);
214 /* Just in case they try to reuse */
/* NOTE(review): only the signature is visible here; presumably returns the
 * connection's socket descriptor (ctdb->fd) for the caller's poll loop --
 * confirm against the body. */
219 int ctdb_get_fd(struct ctdb_connection *ctdb)
/* NOTE(review): only the signature is visible here; presumably returns the
 * poll event mask the caller should wait for before calling ctdb_service()
 * -- confirm against the body. */
224 int ctdb_which_events(struct ctdb_connection *ctdb)
/* Allocate a request together with an I/O element of len bytes.
 * req->hdr.hdr is pointed at that element's data, cbdata is kept in
 * req->priv_data and no extra_destructor is installed.
 * NOTE(review): cb is presumably stored as req->callback; that assignment
 * and the failure handling are not visible here. */
233 struct ctdb_request *new_ctdb_request(size_t len,
234 ctdb_callback_t cb, void *cbdata)
236 struct ctdb_request *req = malloc(sizeof(*req));
239 req->io = new_io_elem(len);
244 req->hdr.hdr = io_elem_data(req->io, NULL);
247 req->priv_data = cbdata;
249 req->extra_destructor = NULL;
/* Release a request.
 * A request that still looks linked on a queue (next or prev set) is
 * reported at LOG_ALERT and passed to ctdb_cancel() (presumably returning at
 * that point -- confirm).  Otherwise the optional extra_destructor runs
 * first, then the reply and request I/O elements are freed. */
253 void ctdb_request_free(struct ctdb_connection *ctdb, struct ctdb_request *req)
255 if (req->next || req->prev) {
256 DEBUG(ctdb, LOG_ALERT,
257 "ctdb_request_free: request not complete! ctdb_cancel? %p (id %u)",
258 req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
259 ctdb_cancel(ctdb, req);
262 if (req->extra_destructor) {
263 req->extra_destructor(ctdb, req);
266 free_io_elem(req->reply);
268 free_io_elem(req->io);
/* Validate the reply attached to a call request.
 * Checks, in order: the request was a CTDB_REQ_CALL, it was for the expected
 * callid, and the reply is at least a full ctdb_reply_call header whose
 * operation is CTDB_REPLY_CALL.  Every failed check is logged.  The return
 * statements are not visible here; callers test the result against NULL. */
272 /* Sanity-checking wrapper for reply. */
273 static struct ctdb_reply_call *unpack_reply_call(struct ctdb_connection *ctdb,
274 struct ctdb_request *req,
278 struct ctdb_reply_call *inhdr = io_elem_data(req->reply, &len);
280 /* Library user error if this isn't a reply to a call. */
281 if (req->hdr.hdr->operation != CTDB_REQ_CALL) {
283 DEBUG(ctdb, LOG_ALERT,
284 "This was not a ctdbd call request: operation %u",
285 req->hdr.hdr->operation);
289 if (req->hdr.call->callid != callid) {
291 DEBUG(ctdb, LOG_ALERT,
292 "This was not a ctdbd %u call request: %u",
293 callid, req->hdr.call->callid);
297 /* ctdbd or our error if this isn't a reply call. */
298 if (len < sizeof(*inhdr) || inhdr->hdr.operation != CTDB_REPLY_CALL) {
300 DEBUG(ctdb, LOG_CRIT,
301 "Invalid ctdbd call reply: len %zu, operation %u",
302 len, inhdr->hdr.operation);
/* Validate the reply attached to a control request.
 * Checks, in order: the reply is at least a full ctdb_reply_control header,
 * the request was a CTDB_REQ_CONTROL, its opcode matches `control`, and the
 * reply's operation is CTDB_REPLY_CONTROL.  Every failed check is logged.
 * The return statements are not visible here; callers test the result
 * against NULL. */
309 /* Sanity-checking wrapper for reply. */
310 struct ctdb_reply_control *unpack_reply_control(struct ctdb_connection *ctdb,
311 struct ctdb_request *req,
312 enum ctdb_controls control)
315 struct ctdb_reply_control *inhdr = io_elem_data(req->reply, &len);
317 /* Library user error if this isn't a reply to a call. */
318 if (len < sizeof(*inhdr)) {
320 DEBUG(ctdb, LOG_ALERT,
321 "Short ctdbd control reply: %zu bytes", len);
324 if (req->hdr.hdr->operation != CTDB_REQ_CONTROL) {
326 DEBUG(ctdb, LOG_ALERT,
327 "This was not a ctdbd control request: operation %u",
328 req->hdr.hdr->operation);
332 /* ... or if it was a different control from what we expected. */
333 if (req->hdr.control->opcode != control) {
335 DEBUG(ctdb, LOG_ALERT,
336 "This was not an opcode %u ctdbd control request: %u",
337 control, req->hdr.control->opcode);
341 /* ctdbd or our error if this isn't a reply call. */
342 if (inhdr->hdr.operation != CTDB_REPLY_CONTROL) {
344 DEBUG(ctdb, LOG_CRIT,
345 "Invalid ctdbd control reply: operation %u",
346 inhdr->hdr.operation);
/* Dispatch one completely-read packet.
 * CTDB_REQ_MESSAGE packets go to deliver_message().  Anything else is
 * matched by reqid against the doneq; the matching request is unlinked and
 * its callback invoked.  A packet that matches nothing is logged at
 * LOG_WARNING. */
353 static void handle_incoming(struct ctdb_connection *ctdb, struct io_elem *in)
355 struct ctdb_req_header *hdr;
357 struct ctdb_request *i;
359 hdr = io_elem_data(in, &len);
360 /* FIXME: use len to check packet! */
362 if (hdr->operation == CTDB_REQ_MESSAGE) {
363 deliver_message(ctdb, hdr);
367 for (i = ctdb->doneq; i; i = i->next) {
368 if (i->hdr.hdr->reqid == hdr->reqid) {
/* Unlink first, so the callback sees a request that is off every queue. */
369 DLIST_REMOVE(ctdb->doneq, i);
371 i->callback(ctdb, i, i->priv_data);
375 DEBUG(ctdb, LOG_WARNING,
376 "Unexpected ctdbd request reply: operation %u reqid %u",
377 hdr->operation, hdr->reqid);
/* Remove "harmless" errors.
 * A negative result whose errno says the call was merely interrupted or
 * would have blocked is mapped to 0; every other value is passed through
 * unchanged.  Callers must therefore still look at the raw return value to
 * tell "no progress" from a real zero-length result. */
static ssize_t real_error(ssize_t ret)
{
	/* POSIX allows EAGAIN and EWOULDBLOCK to be distinct values. */
	if (ret < 0 && (errno == EINTR || errno == EWOULDBLOCK
			|| errno == EAGAIN)) {
		return 0;
	}
	return ret;
}
/* Drive I/O for the connection according to revents.
 * Calling this while a record lock is held is reported at LOG_ALERT.
 * On POLLOUT the request at the head of outq is written; once its element
 * has been sent completely the request moves to the head of doneq to await
 * its reply.  While POLLIN is set, data is read into ctdb->in (allocated
 * with room for a request header) and each completed packet is passed to
 * handle_incoming().  Read and write failures that real_error() does not
 * filter out are logged; a read that makes no progress ends the loop. */
389 bool ctdb_service(struct ctdb_connection *ctdb, int revents)
395 if (holding_lock(ctdb)) {
396 DEBUG(ctdb, LOG_ALERT, "Do not block while holding lock!");
399 if (revents & POLLOUT) {
401 if (real_error(write_io_elem(ctdb->fd,
402 ctdb->outq->io)) < 0) {
404 "ctdb_service: error writing to ctdbd");
408 if (io_elem_finished(ctdb->outq->io)) {
409 struct ctdb_request *done = ctdb->outq;
410 DLIST_REMOVE(ctdb->outq, done);
411 /* We add at the head: any dead ones
413 DLIST_ADD(ctdb->doneq, done);
418 while (revents & POLLIN) {
422 ctdb->in = new_io_elem(sizeof(struct ctdb_req_header));
425 "ctdb_service: allocating readbuf");
431 ret = read_io_elem(ctdb->fd, ctdb->in);
432 if (real_error(ret) < 0 || ret == 0) {
433 /* They closed fd? */
437 "ctdb_service: error reading from ctdbd");
440 } else if (ret < 0) {
441 /* No progress, stop loop. */
443 } else if (io_elem_finished(ctdb->in)) {
444 handle_incoming(ctdb, ctdb->in);
/* Look for reqid among the requests on the outq and on the doneq by
 * comparing it with each request header's reqid (linear scan of both). */
452 /* This is inefficient. We could pull in idtree.c. */
453 static bool reqid_used(const struct ctdb_connection *ctdb, uint32_t reqid)
455 struct ctdb_request *i;
457 for (i = ctdb->outq; i; i = i->next) {
458 if (i->hdr.hdr->reqid == reqid) {
462 for (i = ctdb->doneq; i; i = i->next) {
463 if (i->hdr.hdr->reqid == reqid) {
/* Hand out a request id.  Loops while ctdb->next_id is still carried by a
 * queued request (see reqid_used()), then returns next_id and
 * post-increments it for the following caller. */
470 uint32_t new_reqid(struct ctdb_connection *ctdb)
472 while (reqid_used(ctdb, ctdb->next_id)) {
475 return ctdb->next_id++;
/* Build a CTDB_REQ_CONTROL request and queue it for sending.
 * The request is sized for the control header plus `extra` payload bytes,
 * its header is initialised for destnode with a fresh reqid, and opcode,
 * datalen and a copy of extra_data are filled into the packet before the
 * request is added to the outq.  callback/cbdata are handed on to
 * new_ctdb_request().
 * NOTE(review): memcpy() is reached even when extra is 0; presumably
 * extra_data is never NULL in that case -- confirm against callers. */
478 struct ctdb_request *new_ctdb_control_request(struct ctdb_connection *ctdb,
481 const void *extra_data,
483 ctdb_callback_t callback,
486 struct ctdb_request *req;
487 struct ctdb_req_control *pkt;
489 req = new_ctdb_request(offsetof(struct ctdb_req_control, data) + extra, callback, cbdata);
493 io_elem_init_req_header(req->io,
494 CTDB_REQ_CONTROL, destnode, new_reqid(ctdb));
496 pkt = req->hdr.control;
498 pkt->opcode = opcode;
502 pkt->datalen = extra;
503 memcpy(pkt->data, extra_data, extra);
504 DLIST_ADD(ctdb->outq, req);
/* Callback that ctdb_cancel() installs on a cancelled request: the reply is
 * not looked at, the request is simply freed once it completes. */
508 void ctdb_cancel_callback(struct ctdb_connection *ctdb,
509 struct ctdb_request *req,
512 ctdb_request_free(ctdb, req);
/* Cancel an outstanding request.
 * A request with neither next nor prev set is taken to have completed
 * already: that is logged at LOG_ALERT and the request is freed instead.
 * Otherwise only the callback is replaced with ctdb_cancel_callback, so the
 * request is freed when its reply eventually arrives.
 * NOTE(review): the next/prev test assumes a queued request always has at
 * least one link set; whether that holds for the sole element of a queue
 * depends on the DLIST implementation -- confirm. */
515 void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req)
517 if (!req->next && !req->prev) {
518 DEBUG(ctdb, LOG_ALERT,
519 "ctdb_cancel: request completed! ctdb_request_free? %p (id %u)",
520 req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
521 ctdb_request_free(ctdb, req);
525 DEBUG(ctdb, LOG_DEBUG, "ctdb_cancel: %p (id %u)",
526 req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
528 /* FIXME: If it's not sent, we could just free it right now. */
529 req->callback = ctdb_cancel_callback;
/* Detach from db.  Any locks still held on it are released first through
 * cleanup_locks(); the rest of the teardown is not visible here. */
532 void ctdb_detachdb(struct ctdb_connection *ctdb, struct ctdb_db *db)
534 cleanup_locks(ctdb, db);
/* Completion callback for the GETDBPATH control issued by attachdb_done().
 * The user's callback is invoked with the request found in req->extra (the
 * original attach request), not with this GETDBPATH request. */
539 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
540 struct ctdb_request *req,
543 struct ctdb_db *db = _db;
545 /* Do callback on original request. */
546 db->callback(ctdb, req->extra, db->private_data);
/* Finish an attach started with ctdb_attachdb_send().
 * Validates the GETDBPATH reply chained off req (req->extra), logging a
 * non-zero reply status, then opens the tdb at the path carried in the reply.
 * The tdb is opened with TDB_DEFAULT for persistent databases and
 * TDB_NOSYNC otherwise, always with TDB_DISALLOW_NESTING.  On success the db
 * is separated from the request by clearing req->priv_data, so that
 * destroy_req_db() will not free it along with the request. */
549 struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
550 struct ctdb_request *req)
552 struct ctdb_request *dbpath_req = req->extra;
553 struct ctdb_reply_control *reply;
554 struct ctdb_db *db = req->priv_data;
/* NOTE(review): this initial value appears to be overwritten below before
 * any visible use -- confirm whether db->tdb_flags is meant to be honoured. */
555 uint32_t tdb_flags = db->tdb_flags;
556 struct tdb_logging_context log;
558 /* Never sent the dbpath request? We've failed. */
560 /* FIXME: Save errno? */
565 reply = unpack_reply_control(ctdb, dbpath_req, CTDB_CONTROL_GETDBPATH);
569 if (reply->status != 0) {
570 DEBUG(db->ctdb, LOG_ERR,
571 "ctdb_attachdb_recv: reply status %i", reply->status);
575 tdb_flags = db->persistent ? TDB_DEFAULT : TDB_NOSYNC;
576 tdb_flags |= TDB_DISALLOW_NESTING;
/* Route tdb's own log output through the ctdb logging bridge. */
578 log.log_fn = ctdb_tdb_log_bridge;
579 log.log_private = ctdb;
580 db->tdb = tdb_open_ex((char *)reply->data, 0, tdb_flags, O_RDWR, 0,
582 if (db->tdb == NULL) {
583 DEBUG(db->ctdb, LOG_ERR,
584 "ctdb_attachdb_recv: failed to tdb_open %s",
585 (char *)reply->data);
589 /* Finally, separate the db from the request (see destroy_req_db). */
590 req->priv_data = NULL;
591 DEBUG(db->ctdb, LOG_DEBUG,
592 "ctdb_attachdb_recv: db %p, tdb %s", db, (char *)reply->data);
/* Completion callback for the DB_ATTACH control (DB_ATTACH_PERSISTENT for
 * persistent databases).
 * On a missing or failed reply the original request is handed to the user's
 * callback, which discovers the failure through ctdb_attachdb_recv().
 * Otherwise the database id is read from the reply payload and a GETDBPATH
 * control carrying that id is created, with attachdb_getdbpath_done() as
 * its callback.  Failing to create that second request is logged and also
 * reported through the user's callback. */
596 static void attachdb_done(struct ctdb_connection *ctdb,
597 struct ctdb_request *req,
600 struct ctdb_db *db = _db;
601 struct ctdb_request *req2;
602 struct ctdb_reply_control *reply;
603 enum ctdb_controls control = CTDB_CONTROL_DB_ATTACH;
605 if (db->persistent) {
606 control = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
609 reply = unpack_reply_control(ctdb, req, control);
610 if (!reply || reply->status != 0) {
613 "ctdb_attachdb_send(async): DB_ATTACH status %i",
616 /* We failed. Hand request to user and have them discover it
617 * via ctdb_attachdb_recv. */
618 db->callback(ctdb, req, db->private_data);
/* The reply payload starts with the 32-bit database id. */
621 db->id = *(uint32_t *)reply->data;
623 /* Now we do another call, to get the dbpath. */
624 req2 = new_ctdb_control_request(db->ctdb, CTDB_CONTROL_GETDBPATH,
626 &db->id, sizeof(db->id),
627 attachdb_getdbpath_done, db);
629 DEBUG(db->ctdb, LOG_ERR,
630 "ctdb_attachdb_send(async): failed to allocate");
631 db->callback(ctdb, req, db->private_data);
636 DEBUG(db->ctdb, LOG_DEBUG,
637 "ctdb_attachdb_send(async): created getdbpath request");
/* extra_destructor installed on attach requests by ctdb_attachdb_send().
 * Frees the db still held in priv_data (ctdb_attachdb_recv() clears
 * priv_data on success, which makes that free a no-op) and releases the
 * chained GETDBPATH request kept in req->extra. */
640 static void destroy_req_db(struct ctdb_connection *ctdb,
641 struct ctdb_request *req)
643 /* Incomplete db is in priv_data. */
644 free(req->priv_data);
645 /* second request is chained off this one. */
647 ctdb_request_free(ctdb, req->extra);
/* Start attaching to the database called `name`.
 * Allocates the db state and queues a DB_ATTACH control (or
 * DB_ATTACH_PERSISTENT, chosen by `persistent`) to the current node, with
 * the NUL-terminated name as payload and attachdb_done() as the internal
 * callback.  The user's callback/private_data, the persistent flag and
 * tdb_flags are recorded in the db; tdb_flags are also sent in the control's
 * srvid field.  destroy_req_db() is installed so that an unfinished db is
 * freed together with the request. */
651 struct ctdb_request *
652 ctdb_attachdb_send(struct ctdb_connection *ctdb,
653 const char *name, bool persistent, uint32_t tdb_flags,
654 ctdb_callback_t callback, void *private_data)
656 struct ctdb_request *req;
660 /* FIXME: Search if db already open. */
661 db = malloc(sizeof(*db));
667 opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
669 opcode = CTDB_CONTROL_DB_ATTACH;
672 req = new_ctdb_control_request(ctdb, opcode, CTDB_CURRENT_NODE, name,
673 strlen(name) + 1, attachdb_done, db);
676 "ctdb_attachdb_send: failed allocating DB_ATTACH");
682 db->tdb_flags = tdb_flags;
683 db->persistent = persistent;
684 db->callback = callback;
685 db->private_data = private_data;
687 req->extra_destructor = destroy_req_db;
688 /* This is set non-NULL when we succeed, see ctdb_attachdb_recv */
691 /* Flags get overloaded into srvid. */
692 req->hdr.control->srvid = tdb_flags;
693 DEBUG(db->ctdb, LOG_DEBUG,
694 "ctdb_attachdb_send: DB_ATTACH request %p", req);
/* Compute the marker stored in held_magic while a lock is held.  The value
 * is derived from the address of the lock's key buffer, mixed with a shifted
 * copy of itself and a fixed constant. */
698 static unsigned long lock_magic(struct ctdb_lock *lock)
700 /* A non-zero magic specific to this structure. */
701 return ((unsigned long)lock->key.dptr
702 ^ (((unsigned long)lock->key.dptr) << 16)
703 ^ 0xBADC0FFEEBADC0DEULL)
/* Dispose of a lock that was never handed to the user.  A lock whose
 * held_magic is already set is reported at LOG_ALERT; the release itself is
 * not visible here. */
707 /* This is only called on locks before they're held. */
708 static void free_lock(struct ctdb_lock *lock)
710 if (lock->held_magic) {
711 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
712 "free_lock invalid lock %p", lock);
/* Release a record lock obtained through ctdb_readrecordlock_async().
 * The lock must carry the held_magic that lock_magic() computes for it and
 * must belong to ctdb_db; either mismatch is logged at LOG_ALERT.  For a
 * valid lock the tdb chain is unlocked and the lock is taken off the
 * connection's lock list.  held_magic is reset to 0 so a stale handle is
 * recognised later. */
719 void ctdb_release_lock(struct ctdb_db *ctdb_db, struct ctdb_lock *lock)
721 if (lock->held_magic != lock_magic(lock)) {
722 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
723 "ctdb_release_lock invalid lock %p", lock);
724 } else if (lock->ctdb_db != ctdb_db) {
726 DEBUG(ctdb_db->ctdb, LOG_ALERT,
727 "ctdb_release_lock: wrong ctdb_db.");
729 tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
730 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
731 "ctdb_release_lock %p", lock);
732 remove_lock(lock->ctdb_db->ctdb, lock);
734 lock->held_magic = 0;
/* Try to take the record lock locally.
 * Takes the tdb chainlock for lock->key (a failure is logged at LOG_WARNING)
 * and fetches the local record into *data.  When the record's dmaster is
 * this node's pnn the chainlock is kept: held_magic is set and the lock is
 * added to the connection's lock list.  Otherwise the chain is unlocked
 * again. */
739 /* We keep the lock if local node is the dmaster. */
740 static bool try_readrecordlock(struct ctdb_lock *lock, TDB_DATA *data)
742 struct ctdb_ltdb_header *hdr;
744 if (tdb_chainlock(lock->ctdb_db->tdb, lock->key) != 0) {
745 DEBUG(lock->ctdb_db->ctdb, LOG_WARNING,
746 "ctdb_readrecordlock_async: failed to chainlock");
750 hdr = ctdb_local_fetch(lock->ctdb_db->tdb, lock->key, data);
751 if (hdr && hdr->dmaster == lock->ctdb_db->ctdb->pnn) {
752 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
753 "ctdb_readrecordlock_async: got local lock");
754 lock->held_magic = lock_magic(lock);
756 add_lock(lock->ctdb_db->ctdb, lock);
760 tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
765 /* If they shutdown before we hand them the lock, we free it here. */
766 static void destroy_lock(struct ctdb_connection *ctdb,
767 struct ctdb_request *req)
769 free_lock(req->extra);
/* Callback for the NULL_FUNC migration call queued by
 * ctdb_readrecordlock_async(); the lock travels in req->extra.
 * A missing or failed reply makes the user's callback run with a NULL lock
 * and empty data, after which the request (and with it the lock) is freed.
 * Otherwise the local lock is tried again.  On success the destructor is
 * cleared, the lock and data are handed to the user and the request is
 * freed.  On failure the same request is reset and queued once more. */
772 static void readrecordlock_retry(struct ctdb_connection *ctdb,
773 struct ctdb_request *req, void *private)
775 struct ctdb_lock *lock = req->extra;
776 struct ctdb_reply_call *reply;
779 /* OK, we've received reply to noop migration */
780 reply = unpack_reply_call(ctdb, req, CTDB_NULL_FUNC);
781 if (!reply || reply->status != 0) {
784 "ctdb_readrecordlock_async(async):"
785 " NULL_FUNC returned %i", reply->status);
787 lock->callback(lock->ctdb_db, NULL, tdb_null, private);
788 ctdb_request_free(ctdb, req); /* Also frees lock. */
792 /* Can we get lock now? */
793 if (try_readrecordlock(lock, &data)) {
794 /* Now it's their responsibility to free lock & request! */
795 req->extra_destructor = NULL;
796 lock->callback(lock->ctdb_db, lock, data, private);
797 ctdb_request_free(ctdb, req);
801 /* Retransmit the same request again (we lost race). */
802 io_elem_reset(req->io);
803 DLIST_ADD(ctdb->outq, req);
/* Request a locked read of `key` in ctdb_db; callback/cbdata receive the
 * result.
 * Refuses, with a LOG_ALERT message, when the connection already holds a
 * lock.  The lock and a private copy of the key share a single allocation.
 * Fast path: if the lock can be taken locally, callback runs immediately.
 * Slow path: a CTDB_REQ_CALL for CTDB_NULL_FUNC with
 * CTDB_IMMEDIATE_MIGRATION is queued on the outq, with
 * readrecordlock_retry() as its callback and destroy_lock() as its
 * destructor; the user's callback is stashed in the lock meanwhile. */
807 ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
808 ctdb_rrl_callback_t callback, void *cbdata)
810 struct ctdb_request *req;
811 struct ctdb_lock *lock;
814 if (holding_lock(ctdb_db->ctdb)) {
815 DEBUG(ctdb_db->ctdb, LOG_ALERT,
816 "ctdb_readrecordlock_async: already holding lock");
821 lock = malloc(sizeof(*lock) + key.dsize);
823 DEBUG(ctdb_db->ctdb, LOG_ERR,
824 "ctdb_readrecordlock_async: lock allocation failed");
/* The key bytes live directly behind the lock structure. */
827 lock->key.dptr = (void *)(lock + 1);
828 memcpy(lock->key.dptr, key.dptr, key.dsize);
829 lock->key.dsize = key.dsize;
830 lock->ctdb_db = ctdb_db;
832 lock->held_magic = 0;
835 if (try_readrecordlock(lock, &data)) {
836 callback(ctdb_db, lock, data, cbdata);
840 /* Slow path: create request. */
841 req = new_ctdb_request(offsetof(struct ctdb_req_call, data)
842 + key.dsize, readrecordlock_retry, cbdata);
844 DEBUG(ctdb_db->ctdb, LOG_ERR,
845 "ctdb_readrecordlock_async: allocation failed");
850 req->extra_destructor = destroy_lock;
851 /* We store the original callback in the lock, and use our own. */
852 lock->callback = callback;
854 io_elem_init_req_header(req->io, CTDB_REQ_CALL, CTDB_CURRENT_NODE,
855 new_reqid(ctdb_db->ctdb));
857 req->hdr.call->flags = CTDB_IMMEDIATE_MIGRATION;
858 req->hdr.call->db_id = ctdb_db->id;
859 req->hdr.call->callid = CTDB_NULL_FUNC;
860 req->hdr.call->hopcount = 0;
861 req->hdr.call->keylen = key.dsize;
862 req->hdr.call->calldatalen = 0;
863 memcpy(req->hdr.call->data, key.dptr, key.dsize);
864 DLIST_ADD(ctdb_db->ctdb->outq, req);
868 bool ctdb_writerecord(struct ctdb_db *ctdb_db,
869 struct ctdb_lock *lock, TDB_DATA data)
871 if (lock->ctdb_db != ctdb_db) {
873 DEBUG(ctdb_db->ctdb, LOG_ALERT,
874 "ctdb_writerecord: Can not write, wrong ctdb_db.");
878 if (lock->held_magic != lock_magic(lock)) {
880 DEBUG(ctdb_db->ctdb, LOG_ALERT,
881 "ctdb_writerecord: Can not write. Lock has been released.");
885 if (ctdb_db->persistent) {
887 DEBUG(ctdb_db->ctdb, LOG_ALERT,
888 "ctdb_writerecord: cannot write to persistent db");
892 switch (ctdb_local_store(ctdb_db->tdb, lock->key, lock->hdr, data)) {
894 DEBUG(ctdb_db->ctdb, LOG_DEBUG,
895 "ctdb_writerecord: optimized away noop write.");
903 DEBUG(ctdb_db->ctdb, LOG_CRIT,
904 "ctdb_writerecord: out of memory.");
907 DEBUG(ctdb_db->ctdb, LOG_ALERT,
908 "ctdb_writerecord: record changed under lock?");
910 default: /* TDB already logged. */