Add rolling statistics that are collected across 10 second intervals.
[metze/ctdb/wip.git] / libctdb / ctdb.c
1 /*
2    core of libctdb
3
4    Copyright (C) Rusty Russell 2010
5
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, see <http://www.gnu.org/licenses/>.
18 */
19 #include <ctdb.h>
20 #include <poll.h>
21 #include <errno.h>
22 #include <unistd.h>
23 #include <fcntl.h>
24 #include <stdlib.h>
25 #include <sys/socket.h>
26 #include <sys/un.h>
27 #include "libctdb_private.h"
28 #include "io_elem.h"
29 #include "local_tdb.h"
30 #include "messages.h"
31 #include <dlinklist.h>
32 #include <ctdb_protocol.h>
33
34 /* Remove type-safety macros. */
35 #undef ctdb_attachdb_send
36 #undef ctdb_readrecordlock_async
37 #undef ctdb_connect
38
39 struct ctdb_lock {
40         struct ctdb_lock *next, *prev;
41
42         struct ctdb_db *ctdb_db;
43         TDB_DATA key;
44
45         /* This will always be set by the time user sees this. */
46         unsigned long held_magic;
47         struct ctdb_ltdb_header *hdr;
48
49         /* For convenience, we stash original callback here. */
50         ctdb_rrl_callback_t callback;
51 };
52
53 struct ctdb_db {
54         struct ctdb_connection *ctdb;
55         bool persistent;
56         uint32_t tdb_flags;
57         uint32_t id;
58         struct tdb_context *tdb;
59
60         ctdb_callback_t callback;
61         void *private_data;
62 };
63
64 static void remove_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
65 {
66         DLIST_REMOVE(ctdb->locks, lock);
67 }
68
69 /* FIXME: for thread safety, need tid info too. */
70 static bool holding_lock(struct ctdb_connection *ctdb)
71 {
72         /* For the moment, you can't ever hold more than 1 lock. */
73         return (ctdb->locks != NULL);
74 }
75
76 static void add_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
77 {
78         DLIST_ADD(ctdb->locks, lock);
79 }
80
81 static void cleanup_locks(struct ctdb_connection *ctdb, struct ctdb_db *db)
82 {
83         struct ctdb_lock *i, *next;
84
85         for (i = ctdb->locks; i; i = next) {
86                 /* Grab next pointer, as release_lock will free i */
87                 next = i->next;
88                 if (i->ctdb_db == db) {
89                         ctdb_release_lock(db, i);
90                 }
91         }
92 }
93
/* Close fd while leaving errno exactly as the caller had it.
 * FIXME: Could be in shared util code with rest of ctdb */
static void close_noerr(int fd)
{
        const int saved_errno = errno;

        close(fd);
        errno = saved_errno;
}
101
/* free() p while leaving errno exactly as the caller had it.
 * FIXME: Could be in shared util code with rest of ctdb */
static void free_noerr(void *p)
{
        const int saved_errno = errno;

        free(p);
        errno = saved_errno;
}
109
/* Turn on O_NONBLOCK for fd, keeping its other file status flags.
 * Best effort: failures are not reported to the caller.
 * FIXME: Could be in shared util code with rest of ctdb */
static void set_nonblocking(int fd)
{
        int flags = fcntl(fd, F_GETFL, 0);

        /* Do not OR O_NONBLOCK into an error return (-1) and then hand
         * that garbage to F_SETFL. */
        if (flags == -1) {
                return;
        }
        fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}
117
/* Turn on FD_CLOEXEC for fd, keeping its other descriptor flags.
 * Best effort: failures are not reported to the caller.
 * FIXME: Could be in shared util code with rest of ctdb */
static void set_close_on_exec(int fd)
{
        int flags = fcntl(fd, F_GETFD, 0);

        /* Do not OR FD_CLOEXEC into an error return (-1) and then hand
         * that garbage to F_SETFD. */
        if (flags == -1) {
                return;
        }
        fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
}
125
126 static void set_pnn(struct ctdb_connection *ctdb,
127                     struct ctdb_request *req,
128                     void *unused)
129 {
130         if (!ctdb_getpnn_recv(ctdb, req, &ctdb->pnn)) {
131                 DEBUG(ctdb, LOG_CRIT,
132                       "ctdb_connect(async): failed to get pnn");
133                 ctdb->broken = true;
134         }
135         ctdb_request_free(ctdb, req);
136 }
137
138 struct ctdb_connection *ctdb_connect(const char *addr,
139                                      ctdb_log_fn_t log_fn, void *log_priv)
140 {
141         struct ctdb_connection *ctdb;
142         struct sockaddr_un sun;
143
144         ctdb = malloc(sizeof(*ctdb));
145         if (!ctdb) {
146                 /* With no format string, we hope it doesn't use ap! */
147                 va_list ap;
148                 memset(&ap, 0, sizeof(ap));
149                 errno = ENOMEM;
150                 log_fn(log_priv, LOG_ERR, "ctdb_connect: no memory", ap);
151                 goto fail;
152         }
153         ctdb->outq = NULL;
154         ctdb->doneq = NULL;
155         ctdb->in = NULL;
156         ctdb->message_handlers = NULL;
157         ctdb->next_id = 0;
158         ctdb->broken = false;
159         ctdb->log = log_fn;
160         ctdb->log_priv = log_priv;
161         ctdb->locks = NULL;
162
163         memset(&sun, 0, sizeof(sun));
164         sun.sun_family = AF_UNIX;
165         if (!addr)
166                 addr = CTDB_PATH;
167         strncpy(sun.sun_path, addr, sizeof(sun.sun_path));
168         ctdb->fd = socket(AF_UNIX, SOCK_STREAM, 0);
169         if (ctdb->fd < 0)
170                 goto free_fail;
171
172         set_nonblocking(ctdb->fd);
173         set_close_on_exec(ctdb->fd);
174
175         if (connect(ctdb->fd, (struct sockaddr *)&sun, sizeof(sun)) == -1)
176                 goto close_fail;
177
178         /* Immediately queue a request to get our pnn. */
179         if (!ctdb_getpnn_send(ctdb, CTDB_CURRENT_NODE, set_pnn, NULL))
180                 goto close_fail;
181
182         return ctdb;
183
184 close_fail:
185         close_noerr(ctdb->fd);
186 free_fail:
187         free_noerr(ctdb);
188 fail:
189         return NULL;
190 }
191
192 void ctdb_disconnect(struct ctdb_connection *ctdb)
193 {
194         struct ctdb_request *i;
195
196         DEBUG(ctdb, LOG_DEBUG, "ctdb_disconnect");
197
198         while ((i = ctdb->outq) != NULL) {
199                 DLIST_REMOVE(ctdb->outq, i);
200                 ctdb_request_free(ctdb, i);
201         }
202
203         while ((i = ctdb->doneq) != NULL) {
204                 DLIST_REMOVE(ctdb->doneq, i);
205                 ctdb_request_free(ctdb, i);
206         }
207
208         if (ctdb->in)
209                 free_io_elem(ctdb->in);
210
211         remove_message_handlers(ctdb);
212
213         close(ctdb->fd);
214         /* Just in case they try to reuse */
215         ctdb->fd = -1;
216         free(ctdb);
217 }
218
219 int ctdb_get_fd(struct ctdb_connection *ctdb)
220 {
221         return ctdb->fd;
222 }
223
224 int ctdb_which_events(struct ctdb_connection *ctdb)
225 {
226         int events = POLLIN;
227
228         if (ctdb->outq)
229                 events |= POLLOUT;
230         return events;
231 }
232
233 struct ctdb_request *new_ctdb_request(size_t len,
234                                       ctdb_callback_t cb, void *cbdata)
235 {
236         struct ctdb_request *req = malloc(sizeof(*req));
237         if (!req)
238                 return NULL;
239         req->io = new_io_elem(len);
240         if (!req->io) {
241                 free(req);
242                 return NULL;
243         }
244         req->hdr.hdr = io_elem_data(req->io, NULL);
245         req->reply = NULL;
246         req->callback = cb;
247         req->priv_data = cbdata;
248         req->extra = NULL;
249         req->extra_destructor = NULL;
250         return req;
251 }
252
253 void ctdb_request_free(struct ctdb_connection *ctdb, struct ctdb_request *req)
254 {
255         if (req->next || req->prev) {
256                 DEBUG(ctdb, LOG_ALERT,
257                       "ctdb_request_free: request not complete! ctdb_cancel? %p (id %u)",
258                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
259                 ctdb_cancel(ctdb, req);
260                 return;
261         }
262         if (req->extra_destructor) {
263                 req->extra_destructor(ctdb, req);
264         }
265         if (req->reply) {
266                 free_io_elem(req->reply);
267         }
268         free_io_elem(req->io);
269         free(req);
270 }
271
272 /* Sanity-checking wrapper for reply. */
273 static struct ctdb_reply_call *unpack_reply_call(struct ctdb_connection *ctdb,
274                                                  struct ctdb_request *req,
275                                                  uint32_t callid)
276 {
277         size_t len;
278         struct ctdb_reply_call *inhdr = io_elem_data(req->reply, &len);
279
280         /* Library user error if this isn't a reply to a call. */
281         if (req->hdr.hdr->operation != CTDB_REQ_CALL) {
282                 errno = EINVAL;
283                 DEBUG(ctdb, LOG_ALERT,
284                       "This was not a ctdbd call request: operation %u",
285                       req->hdr.hdr->operation);
286                 return NULL;
287         }
288
289         if (req->hdr.call->callid != callid) {
290                 errno = EINVAL;
291                 DEBUG(ctdb, LOG_ALERT,
292                       "This was not a ctdbd %u call request: %u",
293                       callid, req->hdr.call->callid);
294                 return NULL;
295         }
296
297         /* ctdbd or our error if this isn't a reply call. */
298         if (len < sizeof(*inhdr) || inhdr->hdr.operation != CTDB_REPLY_CALL) {
299                 errno = EIO;
300                 DEBUG(ctdb, LOG_CRIT,
301                       "Invalid ctdbd call reply: len %zu, operation %u",
302                       len, inhdr->hdr.operation);
303                 return NULL;
304         }
305
306         return inhdr;
307 }
308
309 /* Sanity-checking wrapper for reply. */
310 struct ctdb_reply_control *unpack_reply_control(struct ctdb_connection *ctdb,
311                                                 struct ctdb_request *req,
312                                                 enum ctdb_controls control)
313 {
314         size_t len;
315         struct ctdb_reply_control *inhdr = io_elem_data(req->reply, &len);
316
317         /* Library user error if this isn't a reply to a call. */
318         if (len < sizeof(*inhdr)) {
319                 errno = EINVAL;
320                 DEBUG(ctdb, LOG_ALERT,
321                       "Short ctdbd control reply: %zu bytes", len);
322                 return NULL;
323         }
324         if (req->hdr.hdr->operation != CTDB_REQ_CONTROL) {
325                 errno = EINVAL;
326                 DEBUG(ctdb, LOG_ALERT,
327                       "This was not a ctdbd control request: operation %u",
328                       req->hdr.hdr->operation);
329                 return NULL;
330         }
331
332         /* ... or if it was a different control from what we expected. */
333         if (req->hdr.control->opcode != control) {
334                 errno = EINVAL;
335                 DEBUG(ctdb, LOG_ALERT,
336                       "This was not an opcode %u ctdbd control request: %u",
337                       control, req->hdr.control->opcode);
338                 return NULL;
339         }
340
341         /* ctdbd or our error if this isn't a reply call. */
342         if (inhdr->hdr.operation != CTDB_REPLY_CONTROL) {
343                 errno = EIO;
344                 DEBUG(ctdb, LOG_CRIT,
345                       "Invalid ctdbd control reply: operation %u",
346                       inhdr->hdr.operation);
347                 return NULL;
348         }
349
350         return inhdr;
351 }
352
353 static void handle_incoming(struct ctdb_connection *ctdb, struct io_elem *in)
354 {
355         struct ctdb_req_header *hdr;
356         size_t len;
357         struct ctdb_request *i;
358
359         hdr = io_elem_data(in, &len);
360         /* FIXME: use len to check packet! */
361
362         if (hdr->operation == CTDB_REQ_MESSAGE) {
363                 deliver_message(ctdb, hdr);
364                 return;
365         }
366
367         for (i = ctdb->doneq; i; i = i->next) {
368                 if (i->hdr.hdr->reqid == hdr->reqid) {
369                         DLIST_REMOVE(ctdb->doneq, i);
370                         i->reply = in;
371                         i->callback(ctdb, i, i->priv_data);
372                         return;
373                 }
374         }
375         DEBUG(ctdb, LOG_WARNING,
376               "Unexpected ctdbd request reply: operation %u reqid %u",
377               hdr->operation, hdr->reqid);
378         free_io_elem(in);
379 }
380
/* Filter out "harmless" errors: a negative result caused by EINTR or
 * EWOULDBLOCK is reported as 0; everything else passes through. */
static ssize_t real_error(ssize_t ret)
{
        if (ret >= 0) {
                return ret;
        }
        if (errno == EINTR || errno == EWOULDBLOCK) {
                return 0;
        }
        return ret;
}
388
389 bool ctdb_service(struct ctdb_connection *ctdb, int revents)
390 {
391         if (ctdb->broken) {
392                 return false;
393         }
394
395         if (holding_lock(ctdb)) {
396                 DEBUG(ctdb, LOG_ALERT, "Do not block while holding lock!");
397         }
398
399         if (revents & POLLOUT) {
400                 while (ctdb->outq) {
401                         if (real_error(write_io_elem(ctdb->fd,
402                                                      ctdb->outq->io)) < 0) {
403                                 DEBUG(ctdb, LOG_ERR,
404                                       "ctdb_service: error writing to ctdbd");
405                                 ctdb->broken = true;
406                                 return false;
407                         }
408                         if (io_elem_finished(ctdb->outq->io)) {
409                                 struct ctdb_request *done = ctdb->outq;
410                                 DLIST_REMOVE(ctdb->outq, done);
411                                 /* We add at the head: any dead ones
412                                  * sit and end. */
413                                 DLIST_ADD(ctdb->doneq, done);
414                         }
415                 }
416         }
417
418         while (revents & POLLIN) {
419                 int ret;
420
421                 if (!ctdb->in) {
422                         ctdb->in = new_io_elem(sizeof(struct ctdb_req_header));
423                         if (!ctdb->in) {
424                                 DEBUG(ctdb, LOG_ERR,
425                                       "ctdb_service: allocating readbuf");
426                                 ctdb->broken = true;
427                                 return false;
428                         }
429                 }
430
431                 ret = read_io_elem(ctdb->fd, ctdb->in);
432                 if (real_error(ret) < 0 || ret == 0) {
433                         /* They closed fd? */
434                         if (ret == 0)
435                                 errno = EBADF;
436                         DEBUG(ctdb, LOG_ERR,
437                               "ctdb_service: error reading from ctdbd");
438                         ctdb->broken = true;
439                         return false;
440                 } else if (ret < 0) {
441                         /* No progress, stop loop. */
442                         revents = 0;
443                 } else if (io_elem_finished(ctdb->in)) {
444                         handle_incoming(ctdb, ctdb->in);
445                         ctdb->in = NULL;
446                 }
447         }
448
449         return true;
450 }
451
452 /* This is inefficient.  We could pull in idtree.c. */
453 static bool reqid_used(const struct ctdb_connection *ctdb, uint32_t reqid)
454 {
455         struct ctdb_request *i;
456
457         for (i = ctdb->outq; i; i = i->next) {
458                 if (i->hdr.hdr->reqid == reqid) {
459                         return true;
460                 }
461         }
462         for (i = ctdb->doneq; i; i = i->next) {
463                 if (i->hdr.hdr->reqid == reqid) {
464                         return true;
465                 }
466         }
467         return false;
468 }
469
470 uint32_t new_reqid(struct ctdb_connection *ctdb)
471 {
472         while (reqid_used(ctdb, ctdb->next_id)) {
473                 ctdb->next_id++;
474         }
475         return ctdb->next_id++;
476 }
477
478 struct ctdb_request *new_ctdb_control_request(struct ctdb_connection *ctdb,
479                                               uint32_t opcode,
480                                               uint32_t destnode,
481                                               const void *extra_data,
482                                               size_t extra,
483                                               ctdb_callback_t callback,
484                                               void *cbdata)
485 {
486         struct ctdb_request *req;
487         struct ctdb_req_control *pkt;
488
489         req = new_ctdb_request(offsetof(struct ctdb_req_control, data) + extra, callback, cbdata);
490         if (!req)
491                 return NULL;
492
493         io_elem_init_req_header(req->io,
494                                 CTDB_REQ_CONTROL, destnode, new_reqid(ctdb));
495
496         pkt = req->hdr.control;
497         pkt->pad = 0;
498         pkt->opcode = opcode;
499         pkt->srvid = 0;
500         pkt->client_id = 0;
501         pkt->flags = 0;
502         pkt->datalen = extra;
503         memcpy(pkt->data, extra_data, extra);
504         DLIST_ADD(ctdb->outq, req);
505         return req;
506 }
507
/* Callback installed on cancelled requests: when the reply finally
 * arrives, the request is simply freed. */
void ctdb_cancel_callback(struct ctdb_connection *ctdb,
                          struct ctdb_request *req,
                          void *unused)
{
        (void)unused;
        ctdb_request_free(ctdb, req);
}
514
515 void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req)
516 {
517         if (!req->next && !req->prev) {
518                 DEBUG(ctdb, LOG_ALERT,
519                       "ctdb_cancel: request completed! ctdb_request_free? %p (id %u)",
520                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
521                 ctdb_request_free(ctdb, req);
522                 return;
523         }
524
525         DEBUG(ctdb, LOG_DEBUG, "ctdb_cancel: %p (id %u)",
526               req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
527
528         /* FIXME: If it's not sent, we could just free it right now. */
529         req->callback = ctdb_cancel_callback;
530 }
531
532 void ctdb_detachdb(struct ctdb_connection *ctdb, struct ctdb_db *db)
533 {
534         cleanup_locks(ctdb, db);
535         tdb_close(db->tdb);
536         free(db);
537 }
538
539 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
540                                     struct ctdb_request *req,
541                                     void *_db)
542 {
543         struct ctdb_db *db = _db;
544
545         /* Do callback on original request. */
546         db->callback(ctdb, req->extra, db->private_data);
547 }
548
549 struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
550                                    struct ctdb_request *req)
551 {
552         struct ctdb_request *dbpath_req = req->extra;
553         struct ctdb_reply_control *reply;
554         struct ctdb_db *db = req->priv_data;
555         uint32_t tdb_flags = db->tdb_flags;
556         struct tdb_logging_context log;
557
558         /* Never sent the dbpath request?  We've failed. */
559         if (!dbpath_req) {
560                 /* FIXME: Save errno? */
561                 errno = EINVAL;
562                 return NULL;
563         }
564
565         reply = unpack_reply_control(ctdb, dbpath_req, CTDB_CONTROL_GETDBPATH);
566         if (!reply) {
567                 return NULL;
568         }
569         if (reply->status != 0) {
570                 DEBUG(db->ctdb, LOG_ERR,
571                       "ctdb_attachdb_recv: reply status %i", reply->status);
572                 return NULL;
573         }
574
575         tdb_flags = db->persistent ? TDB_DEFAULT : TDB_NOSYNC;
576         tdb_flags |= TDB_DISALLOW_NESTING;
577
578         log.log_fn = ctdb_tdb_log_bridge;
579         log.log_private = ctdb;
580         db->tdb = tdb_open_ex((char *)reply->data, 0, tdb_flags, O_RDWR, 0,
581                               &log, NULL);
582         if (db->tdb == NULL) {
583                 DEBUG(db->ctdb, LOG_ERR,
584                       "ctdb_attachdb_recv: failed to tdb_open %s",
585                       (char *)reply->data);
586                 return NULL;
587         }
588
589         /* Finally, separate the db from the request (see destroy_req_db). */
590         req->priv_data = NULL;
591         DEBUG(db->ctdb, LOG_DEBUG,
592               "ctdb_attachdb_recv: db %p, tdb %s", db, (char *)reply->data);
593         return db;
594 }
595
596 static void attachdb_done(struct ctdb_connection *ctdb,
597                           struct ctdb_request *req,
598                           void *_db)
599 {
600         struct ctdb_db *db = _db;
601         struct ctdb_request *req2;
602         struct ctdb_reply_control *reply;
603         enum ctdb_controls control = CTDB_CONTROL_DB_ATTACH;
604
605         if (db->persistent) {
606                 control = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
607         }
608
609         reply = unpack_reply_control(ctdb, req, control);
610         if (!reply || reply->status != 0) {
611                 if (reply) {
612                         DEBUG(ctdb, LOG_ERR,
613                               "ctdb_attachdb_send(async): DB_ATTACH status %i",
614                               reply->status);
615                 }
616                 /* We failed.  Hand request to user and have them discover it
617                  * via ctdb_attachdb_recv. */
618                 db->callback(ctdb, req, db->private_data);
619                 return;
620         }
621         db->id = *(uint32_t *)reply->data;
622
623         /* Now we do another call, to get the dbpath. */
624         req2 = new_ctdb_control_request(db->ctdb, CTDB_CONTROL_GETDBPATH,
625                                         CTDB_CURRENT_NODE,
626                                         &db->id, sizeof(db->id),
627                                         attachdb_getdbpath_done, db);
628         if (!req2) {
629                 DEBUG(db->ctdb, LOG_ERR,
630                       "ctdb_attachdb_send(async): failed to allocate");
631                 db->callback(ctdb, req, db->private_data);
632                 return;
633         }
634         req->extra = req2;
635         req2->extra = req;
636         DEBUG(db->ctdb, LOG_DEBUG,
637               "ctdb_attachdb_send(async): created getdbpath request");
638 }
639
640 static void destroy_req_db(struct ctdb_connection *ctdb,
641                            struct ctdb_request *req)
642 {
643         /* Incomplete db is in priv_data. */
644         free(req->priv_data);
645         /* second request is chained off this one. */
646         if (req->extra) {
647                 ctdb_request_free(ctdb, req->extra);
648         }
649 }
650
651 struct ctdb_request *
652 ctdb_attachdb_send(struct ctdb_connection *ctdb,
653                    const char *name, bool persistent, uint32_t tdb_flags,
654                    ctdb_callback_t callback, void *private_data)
655 {
656         struct ctdb_request *req;
657         struct ctdb_db *db;
658         uint32_t opcode;
659
660         /* FIXME: Search if db already open. */
661         db = malloc(sizeof(*db));
662         if (!db) {
663                 return NULL;
664         }
665
666         if (persistent) {
667                 opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
668         } else {
669                 opcode = CTDB_CONTROL_DB_ATTACH;
670         }
671
672         req = new_ctdb_control_request(ctdb, opcode, CTDB_CURRENT_NODE, name,
673                                        strlen(name) + 1, attachdb_done, db);
674         if (!req) {
675                 DEBUG(ctdb, LOG_ERR,
676                       "ctdb_attachdb_send: failed allocating DB_ATTACH");
677                 free(db);
678                 return NULL;
679         }
680
681         db->ctdb = ctdb;
682         db->tdb_flags = tdb_flags;
683         db->persistent = persistent;
684         db->callback = callback;
685         db->private_data = private_data;
686
687         req->extra_destructor = destroy_req_db;
688         /* This is set non-NULL when we succeed, see ctdb_attachdb_recv */
689         req->extra = NULL;
690
691         /* Flags get overloaded into srvid. */
692         req->hdr.control->srvid = tdb_flags;
693         DEBUG(db->ctdb, LOG_DEBUG,
694               "ctdb_attachdb_send: DB_ATTACH request %p", req);
695         return req;
696 }
697
698 static unsigned long lock_magic(struct ctdb_lock *lock)
699 {
700         /* A non-zero magic specific to this structure. */
701         return ((unsigned long)lock->key.dptr
702                 ^ (((unsigned long)lock->key.dptr) << 16)
703                 ^ 0xBADC0FFEEBADC0DEULL)
704                 | 1;
705 }
706
707 /* This is only called on locks before they're held. */
708 static void free_lock(struct ctdb_lock *lock)
709 {
710         if (lock->held_magic) {
711                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
712                       "free_lock invalid lock %p", lock);
713         }
714         free(lock->hdr);
715         free(lock);
716 }
717
718
719 void ctdb_release_lock(struct ctdb_db *ctdb_db, struct ctdb_lock *lock)
720 {
721         if (lock->held_magic != lock_magic(lock)) {
722                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
723                       "ctdb_release_lock invalid lock %p", lock);
724         } else if (lock->ctdb_db != ctdb_db) {
725                 errno = EBADF;
726                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
727                       "ctdb_release_lock: wrong ctdb_db.");
728         } else {
729                 tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
730                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
731                       "ctdb_release_lock %p", lock);
732                 remove_lock(lock->ctdb_db->ctdb, lock);
733         }
734         lock->held_magic = 0;
735         free_lock(lock);
736 }
737
738
739 /* We keep the lock if local node is the dmaster. */
740 static bool try_readrecordlock(struct ctdb_lock *lock, TDB_DATA *data)
741 {
742         struct ctdb_ltdb_header *hdr;
743
744         if (tdb_chainlock(lock->ctdb_db->tdb, lock->key) != 0) {
745                 DEBUG(lock->ctdb_db->ctdb, LOG_WARNING,
746                       "ctdb_readrecordlock_async: failed to chainlock");
747                 return NULL;
748         }
749
750         hdr = ctdb_local_fetch(lock->ctdb_db->tdb, lock->key, data);
751         if (hdr && hdr->dmaster == lock->ctdb_db->ctdb->pnn) {
752                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
753                       "ctdb_readrecordlock_async: got local lock");
754                 lock->held_magic = lock_magic(lock);
755                 lock->hdr = hdr;
756                 add_lock(lock->ctdb_db->ctdb, lock);
757                 return true;
758         }
759
760         tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
761         free(hdr);
762         return NULL;
763 }
764
765 /* If they shutdown before we hand them the lock, we free it here. */
766 static void destroy_lock(struct ctdb_connection *ctdb,
767                          struct ctdb_request *req)
768 {
769         free_lock(req->extra);
770 }
771
772 static void readrecordlock_retry(struct ctdb_connection *ctdb,
773                                  struct ctdb_request *req, void *private)
774 {
775         struct ctdb_lock *lock = req->extra;
776         struct ctdb_reply_call *reply;
777         TDB_DATA data;
778
779         /* OK, we've received reply to noop migration */
780         reply = unpack_reply_call(ctdb, req, CTDB_NULL_FUNC);
781         if (!reply || reply->status != 0) {
782                 if (reply) {
783                         DEBUG(ctdb, LOG_ERR,
784                               "ctdb_readrecordlock_async(async):"
785                               " NULL_FUNC returned %i", reply->status);
786                 }
787                 lock->callback(lock->ctdb_db, NULL, tdb_null, private);
788                 ctdb_request_free(ctdb, req); /* Also frees lock. */
789                 return;
790         }
791
792         /* Can we get lock now? */
793         if (try_readrecordlock(lock, &data)) {
794                 /* Now it's their responsibility to free lock & request! */
795                 req->extra_destructor = NULL;
796                 lock->callback(lock->ctdb_db, lock, data, private);
797                 ctdb_request_free(ctdb, req);
798                 return;
799         }
800
801         /* Retransmit the same request again (we lost race). */
802         io_elem_reset(req->io);
803         DLIST_ADD(ctdb->outq, req);
804 }
805
806 bool
807 ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
808                           ctdb_rrl_callback_t callback, void *cbdata)
809 {
810         struct ctdb_request *req;
811         struct ctdb_lock *lock;
812         TDB_DATA data;
813
814         if (holding_lock(ctdb_db->ctdb)) {
815                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
816                       "ctdb_readrecordlock_async: already holding lock");
817                 return false;
818         }
819
820         /* Setup lock */
821         lock = malloc(sizeof(*lock) + key.dsize);
822         if (!lock) {
823                 DEBUG(ctdb_db->ctdb, LOG_ERR,
824                       "ctdb_readrecordlock_async: lock allocation failed");
825                 return false;
826         }
827         lock->key.dptr = (void *)(lock + 1);
828         memcpy(lock->key.dptr, key.dptr, key.dsize);
829         lock->key.dsize = key.dsize;
830         lock->ctdb_db = ctdb_db;
831         lock->hdr = NULL;
832         lock->held_magic = 0;
833
834         /* Fast path. */
835         if (try_readrecordlock(lock, &data)) {
836                 callback(ctdb_db, lock, data, cbdata);
837                 return true;
838         }
839
840         /* Slow path: create request. */
841         req = new_ctdb_request(offsetof(struct ctdb_req_call, data)
842                                + key.dsize, readrecordlock_retry, cbdata);
843         if (!req) {
844                 DEBUG(ctdb_db->ctdb, LOG_ERR,
845                       "ctdb_readrecordlock_async: allocation failed");
846                 free_lock(lock);
847                 return NULL;
848         }
849         req->extra = lock;
850         req->extra_destructor = destroy_lock;
851         /* We store the original callback in the lock, and use our own. */
852         lock->callback = callback;
853
854         io_elem_init_req_header(req->io, CTDB_REQ_CALL, CTDB_CURRENT_NODE,
855                                 new_reqid(ctdb_db->ctdb));
856
857         req->hdr.call->flags = CTDB_IMMEDIATE_MIGRATION;
858         req->hdr.call->db_id = ctdb_db->id;
859         req->hdr.call->callid = CTDB_NULL_FUNC;
860         req->hdr.call->hopcount = 0;
861         req->hdr.call->keylen = key.dsize;
862         req->hdr.call->calldatalen = 0;
863         memcpy(req->hdr.call->data, key.dptr, key.dsize);
864         DLIST_ADD(ctdb_db->ctdb->outq, req);
865         return true;
866 }
867
868 bool ctdb_writerecord(struct ctdb_db *ctdb_db,
869                       struct ctdb_lock *lock, TDB_DATA data)
870 {
871         if (lock->ctdb_db != ctdb_db) {
872                 errno = EBADF;
873                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
874                       "ctdb_writerecord: Can not write, wrong ctdb_db.");
875                 return false;
876         }
877
878         if (lock->held_magic != lock_magic(lock)) {
879                 errno = EBADF;
880                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
881                       "ctdb_writerecord: Can not write. Lock has been released.");
882                 return false;
883         }
884                 
885         if (ctdb_db->persistent) {
886                 errno = EINVAL;
887                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
888                       "ctdb_writerecord: cannot write to persistent db");
889                 return false;
890         }
891
892         switch (ctdb_local_store(ctdb_db->tdb, lock->key, lock->hdr, data)) {
893         case 0:
894                 DEBUG(ctdb_db->ctdb, LOG_DEBUG,
895                       "ctdb_writerecord: optimized away noop write.");
896                 /* fall thru */
897         case 1:
898                 return true;
899
900         default:
901                 switch (errno) {
902                 case ENOMEM:
903                         DEBUG(ctdb_db->ctdb, LOG_CRIT,
904                               "ctdb_writerecord: out of memory.");
905                         break;
906                 case EINVAL:
907                         DEBUG(ctdb_db->ctdb, LOG_ALERT,
908                               "ctdb_writerecord: record changed under lock?");
909                         break;
910                 default: /* TDB already logged. */
911                         break;
912                 }
913                 return false;
914         }
915 }