ReadOnly: If record does not exist, upgrade to write-lock
[sahlberg/ctdb.git] / libctdb / ctdb.c
index 11f0549647c42fbad39aedb2ea9b40b1c278da25..099ceac0877523720885625940e8c05c9c10d4db 100644 (file)
@@ -2,6 +2,7 @@
    core of libctdb
 
    Copyright (C) Rusty Russell 2010
+   Copyright (C) Ronnie Sahlberg 2011
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -24,6 +25,7 @@
 #include <stdlib.h>
 #include <sys/socket.h>
 #include <sys/un.h>
+#include <sys/ioctl.h>
 #include "libctdb_private.h"
 #include "io_elem.h"
 #include "local_tdb.h"
@@ -34,6 +36,7 @@
 /* Remove type-safety macros. */
 #undef ctdb_attachdb_send
 #undef ctdb_readrecordlock_async
+#undef ctdb_readonlyrecordlock_async
 #undef ctdb_connect
 
 struct ctdb_lock {
@@ -42,6 +45,9 @@ struct ctdb_lock {
        struct ctdb_db *ctdb_db;
        TDB_DATA key;
 
+       /* Is this a request for a read-only lock? */
+       bool readonly;
+
        /* This will always be set by the time user sees this. */
        unsigned long held_magic;
        struct ctdb_ltdb_header *hdr;
@@ -50,6 +56,17 @@ struct ctdb_lock {
        ctdb_rrl_callback_t callback;
 };
 
+struct ctdb_db { /* State for one attached database; filled in by ctdb_attachdb_send/_recv. */
+       struct ctdb_connection *ctdb;   /* connection given to ctdb_attachdb_send */
+       bool persistent;                /* picks DB_ATTACH_PERSISTENT vs DB_ATTACH, and TDB_DEFAULT vs TDB_NOSYNC */
+       uint32_t tdb_flags;             /* caller-supplied flags; also sent as the attach control's srvid */
+       uint32_t id;                    /* placed in the db_id field of call requests */
+       struct tdb_context *tdb;        /* local tdb handle, opened in ctdb_attachdb_recv */
+
+       ctdb_callback_t callback;       /* user callback, invoked from attachdb_getdbpath_done */
+       void *private_data;             /* passed as the last argument to callback */
+};
+
 static void remove_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
 {
        DLIST_REMOVE(ctdb->locks, lock);
@@ -67,6 +84,19 @@ static void add_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
        DLIST_ADD(ctdb->locks, lock);
 }
 
+static void cleanup_locks(struct ctdb_connection *ctdb, struct ctdb_db *db)
+{ /* Release every lock on ctdb->locks whose ctdb_db is db. */
+       struct ctdb_lock *i, *next;
+
+       for (i = ctdb->locks; i; i = next) {
+               /* Grab the next pointer first, as ctdb_release_lock will free i */
+               next = i->next;
+               if (i->ctdb_db == db) { /* locks of other databases are left alone */
+                       ctdb_release_lock(db, i);
+               }
+       }
+}
+
 /* FIXME: Could be in shared util code with rest of ctdb */
 static void close_noerr(int fd)
 {
@@ -108,7 +138,7 @@ static void set_pnn(struct ctdb_connection *ctdb,
                      "ctdb_connect(async): failed to get pnn");
                ctdb->broken = true;
        }
-       ctdb_request_free(ctdb, req);
+       ctdb_request_free(req);
 }
 
 struct ctdb_connection *ctdb_connect(const char *addr,
@@ -126,9 +156,11 @@ struct ctdb_connection *ctdb_connect(const char *addr,
                log_fn(log_priv, LOG_ERR, "ctdb_connect: no memory", ap);
                goto fail;
        }
+       ctdb->pnn = -1;
        ctdb->outq = NULL;
        ctdb->doneq = NULL;
        ctdb->in = NULL;
+       ctdb->inqueue = NULL;
        ctdb->message_handlers = NULL;
        ctdb->next_id = 0;
        ctdb->broken = false;
@@ -140,7 +172,7 @@ struct ctdb_connection *ctdb_connect(const char *addr,
        sun.sun_family = AF_UNIX;
        if (!addr)
                addr = CTDB_PATH;
-       strncpy(sun.sun_path, addr, sizeof(sun.sun_path));
+       strncpy(sun.sun_path, addr, sizeof(sun.sun_path)-1);
        ctdb->fd = socket(AF_UNIX, SOCK_STREAM, 0);
        if (ctdb->fd < 0)
                goto free_fail;
@@ -165,6 +197,33 @@ fail:
        return NULL;
 }
 
+/*
+ * Tear down a connection: free every request still on the out and done
+ * queues, the partially-read incoming packet (if any), any complete
+ * incoming packets that were queued but never handled, and the message
+ * handlers; then close the socket and free ctdb itself.
+ *
+ * ctdb must not be used again after this returns.
+ */
+void ctdb_disconnect(struct ctdb_connection *ctdb)
+{
+       struct ctdb_request *i;
+       struct io_elem *io;
+
+       DEBUG(ctdb, LOG_DEBUG, "ctdb_disconnect");
+
+       while ((i = ctdb->outq) != NULL) {
+               DLIST_REMOVE(ctdb->outq, i);
+               ctdb_request_free(i);
+       }
+
+       while ((i = ctdb->doneq) != NULL) {
+               DLIST_REMOVE(ctdb->doneq, i);
+               ctdb_request_free(i);
+       }
+
+       /* Packet that was still being read. */
+       if (ctdb->in)
+               free_io_elem(ctdb->in);
+
+       /* Complete packets that ctdb_service queued but did not get to pass
+        * to handle_incoming (it can return false mid-loop); without this
+        * they would be leaked. */
+       while ((io = ctdb->inqueue) != NULL) {
+               io_elem_dequeue(ctdb, io);
+               free_io_elem(io);
+       }
+
+       remove_message_handlers(ctdb);
+
+       close(ctdb->fd);
+       /* Just in case they try to reuse */
+       ctdb->fd = -1;
+       free(ctdb);
+}
+
 int ctdb_get_fd(struct ctdb_connection *ctdb)
 {
        return ctdb->fd;
@@ -179,7 +238,7 @@ int ctdb_which_events(struct ctdb_connection *ctdb)
        return events;
 }
 
-struct ctdb_request *new_ctdb_request(size_t len,
+struct ctdb_request *new_ctdb_request(struct ctdb_connection *ctdb, size_t len,
                                      ctdb_callback_t cb, void *cbdata)
 {
        struct ctdb_request *req = malloc(sizeof(*req));
@@ -190,6 +249,7 @@ struct ctdb_request *new_ctdb_request(size_t len,
                free(req);
                return NULL;
        }
+       req->ctdb = ctdb;
        req->hdr.hdr = io_elem_data(req->io, NULL);
        req->reply = NULL;
        req->callback = cb;
@@ -199,8 +259,17 @@ struct ctdb_request *new_ctdb_request(size_t len,
        return req;
 }
 
-void ctdb_request_free(struct ctdb_connection *ctdb, struct ctdb_request *req)
+void ctdb_request_free(struct ctdb_request *req)
 {
+       struct ctdb_connection *ctdb = req->ctdb;
+
+       if (req->next || req->prev) {
+               DEBUG(ctdb, LOG_ALERT,
+                     "ctdb_request_free: request not complete! ctdb_cancel? %p (id %u)",
+                     req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
+               ctdb_cancel(ctdb, req);
+               return;
+       }
        if (req->extra_destructor) {
                req->extra_destructor(ctdb, req);
        }
@@ -212,8 +281,7 @@ void ctdb_request_free(struct ctdb_connection *ctdb, struct ctdb_request *req)
 }
 
 /* Sanity-checking wrapper for reply. */
-static struct ctdb_reply_call *unpack_reply_call(struct ctdb_connection *ctdb,
-                                                struct ctdb_request *req,
+static struct ctdb_reply_call *unpack_reply_call(struct ctdb_request *req,
                                                 uint32_t callid)
 {
        size_t len;
@@ -222,7 +290,7 @@ static struct ctdb_reply_call *unpack_reply_call(struct ctdb_connection *ctdb,
        /* Library user error if this isn't a reply to a call. */
        if (req->hdr.hdr->operation != CTDB_REQ_CALL) {
                errno = EINVAL;
-               DEBUG(ctdb, LOG_ERR,
+               DEBUG(req->ctdb, LOG_ALERT,
                      "This was not a ctdbd call request: operation %u",
                      req->hdr.hdr->operation);
                return NULL;
@@ -230,7 +298,7 @@ static struct ctdb_reply_call *unpack_reply_call(struct ctdb_connection *ctdb,
 
        if (req->hdr.call->callid != callid) {
                errno = EINVAL;
-               DEBUG(ctdb, LOG_ERR,
+               DEBUG(req->ctdb, LOG_ALERT,
                      "This was not a ctdbd %u call request: %u",
                      callid, req->hdr.call->callid);
                return NULL;
@@ -239,7 +307,7 @@ static struct ctdb_reply_call *unpack_reply_call(struct ctdb_connection *ctdb,
        /* ctdbd or our error if this isn't a reply call. */
        if (len < sizeof(*inhdr) || inhdr->hdr.operation != CTDB_REPLY_CALL) {
                errno = EIO;
-               DEBUG(ctdb, LOG_CRIT,
+               DEBUG(req->ctdb, LOG_CRIT,
                      "Invalid ctdbd call reply: len %zu, operation %u",
                      len, inhdr->hdr.operation);
                return NULL;
@@ -249,8 +317,7 @@ static struct ctdb_reply_call *unpack_reply_call(struct ctdb_connection *ctdb,
 }
 
 /* Sanity-checking wrapper for reply. */
-struct ctdb_reply_control *unpack_reply_control(struct ctdb_connection *ctdb,
-                                               struct ctdb_request *req,
+struct ctdb_reply_control *unpack_reply_control(struct ctdb_request *req,
                                                enum ctdb_controls control)
 {
        size_t len;
@@ -259,13 +326,13 @@ struct ctdb_reply_control *unpack_reply_control(struct ctdb_connection *ctdb,
        /* Library user error if this isn't a reply to a call. */
        if (len < sizeof(*inhdr)) {
                errno = EINVAL;
-               DEBUG(ctdb, LOG_CRIT,
+               DEBUG(req->ctdb, LOG_ALERT,
                      "Short ctdbd control reply: %zu bytes", len);
                return NULL;
        }
        if (req->hdr.hdr->operation != CTDB_REQ_CONTROL) {
                errno = EINVAL;
-               DEBUG(ctdb, LOG_ERR,
+               DEBUG(req->ctdb, LOG_ALERT,
                      "This was not a ctdbd control request: operation %u",
                      req->hdr.hdr->operation);
                return NULL;
@@ -274,7 +341,7 @@ struct ctdb_reply_control *unpack_reply_control(struct ctdb_connection *ctdb,
        /* ... or if it was a different control from what we expected. */
        if (req->hdr.control->opcode != control) {
                errno = EINVAL;
-               DEBUG(ctdb, LOG_ERR,
+               DEBUG(req->ctdb, LOG_ALERT,
                      "This was not an opcode %u ctdbd control request: %u",
                      control, req->hdr.control->opcode);
                return NULL;
@@ -283,7 +350,7 @@ struct ctdb_reply_control *unpack_reply_control(struct ctdb_connection *ctdb,
        /* ctdbd or our error if this isn't a reply call. */
        if (inhdr->hdr.operation != CTDB_REPLY_CONTROL) {
                errno = EIO;
-               DEBUG(ctdb, LOG_CRIT,
+               DEBUG(req->ctdb, LOG_CRIT,
                      "Invalid ctdbd control reply: operation %u",
                      inhdr->hdr.operation);
                return NULL;
@@ -335,7 +402,7 @@ bool ctdb_service(struct ctdb_connection *ctdb, int revents)
        }
 
        if (holding_lock(ctdb)) {
-               DEBUG(ctdb, LOG_WARNING, "Do not block while holding lock!");
+               DEBUG(ctdb, LOG_ALERT, "Do not block while holding lock!");
        }
 
        if (revents & POLLOUT) {
@@ -359,6 +426,19 @@ bool ctdb_service(struct ctdb_connection *ctdb, int revents)
 
        while (revents & POLLIN) {
                int ret;
+               int num_ready = 0;
+
+               if (ioctl(ctdb->fd, FIONREAD, &num_ready) != 0) {
+                       DEBUG(ctdb, LOG_ERR,
+                             "ctdb_service: ioctl(FIONREAD) %d", errno);
+                       ctdb->broken = true;
+                       return false;
+               }
+               if (num_ready == 0) {
+                       /* the descriptor has been closed or we have all our data */
+                       break;
+               }
+
 
                if (!ctdb->in) {
                        ctdb->in = new_io_elem(sizeof(struct ctdb_req_header));
@@ -381,13 +461,21 @@ bool ctdb_service(struct ctdb_connection *ctdb, int revents)
                        return false;
                } else if (ret < 0) {
                        /* No progress, stop loop. */
-                       revents = 0;
+                       break;
                } else if (io_elem_finished(ctdb->in)) {
-                       handle_incoming(ctdb, ctdb->in);
+                       io_elem_queue(ctdb, ctdb->in);
                        ctdb->in = NULL;
                }
        }
 
+
+       while (ctdb->inqueue != NULL) {
+               struct io_elem *io = ctdb->inqueue;
+
+               io_elem_dequeue(ctdb, io);
+               handle_incoming(ctdb, io);
+       }
+
        return true;
 }
 
@@ -428,7 +516,9 @@ struct ctdb_request *new_ctdb_control_request(struct ctdb_connection *ctdb,
        struct ctdb_request *req;
        struct ctdb_req_control *pkt;
 
-       req = new_ctdb_request(offsetof(struct ctdb_req_control, data) + extra, callback, cbdata);
+       req = new_ctdb_request(
+               ctdb, offsetof(struct ctdb_req_control, data) + extra,
+               callback, cbdata);
        if (!req)
                return NULL;
 
@@ -451,11 +541,19 @@ void ctdb_cancel_callback(struct ctdb_connection *ctdb,
                          struct ctdb_request *req,
                          void *unused)
 {
-       ctdb_request_free(ctdb, req);
+       ctdb_request_free(req);
 }
 
 void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req)
 {
+       if (!req->next && !req->prev) {
+               DEBUG(ctdb, LOG_ALERT,
+                     "ctdb_cancel: request completed! ctdb_request_free? %p (id %u)",
+                     req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
+               ctdb_request_free(req);
+               return;
+       }
+
        DEBUG(ctdb, LOG_DEBUG, "ctdb_cancel: %p (id %u)",
              req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
 
@@ -463,72 +561,78 @@ void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req)
        req->callback = ctdb_cancel_callback;
 }
 
-struct ctdb_db {
-       struct ctdb_connection *ctdb;
-       bool persistent;
-       uint32_t tdb_flags;
-       uint32_t id;
-       struct tdb_context *tdb;
-
-       /* The lock we are holding, if any (we can only have one!) */
-       struct ctdb_lock *lock;
-
-       ctdb_callback_t callback;
-       void *private_data;
-};
+void ctdb_detachdb(struct ctdb_connection *ctdb, struct ctdb_db *db)
+{ /* Detach from db: release its locks, close its tdb, free the handle. */
+       cleanup_locks(ctdb, db);        /* must run before the tdb is closed: releasing a lock unlocks db->tdb */
+       tdb_close(db->tdb);
+       free(db);                       /* db is invalid for the caller from here on */
+}
 
+static void destroy_req_db(struct ctdb_connection *ctdb,
+                          struct ctdb_request *req);
+static void attachdb_done(struct ctdb_connection *ctdb,
+                         struct ctdb_request *req,
+                         void *_db);
 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
                                    struct ctdb_request *req,
-                                   void *_db)
-{
-       struct ctdb_db *db = _db;
-
-       /* Do callback on original request. */
-       db->callback(ctdb, req->extra, db->private_data);
-}
+                                   void *_db);
 
-struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
-                                  struct ctdb_request *req)
+struct ctdb_request *   /* Start attaching to database "name"; returns the DB_ATTACH request, or NULL on allocation failure. */
+ctdb_attachdb_send(struct ctdb_connection *ctdb,
+                  const char *name, bool persistent, uint32_t tdb_flags,
+                  ctdb_callback_t callback, void *private_data)
 {
-       struct ctdb_request *dbpath_req = req->extra;
-       struct ctdb_reply_control *reply;
-       struct ctdb_db *db = req->priv_data;
-       uint32_t tdb_flags = db->tdb_flags;
+       struct ctdb_request *req;
+       struct ctdb_db *db;
+       uint32_t opcode;
 
-       /* Never sent the dbpath request?  We've failed. */
-       if (!dbpath_req) {
-               /* FIXME: Save errno? */
-               errno = EINVAL;
+       /* FIXME: Search if db already open. */
+       db = malloc(sizeof(*db));       /* NOTE(review): db->id and db->tdb are not assigned in this function */
+       if (!db) {
                return NULL;
        }
 
-       reply = unpack_reply_control(ctdb, dbpath_req, CTDB_CONTROL_GETDBPATH);
-       if (!reply) {
-               return NULL;
+       if (persistent) {
+               opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
+       } else {
+               opcode = CTDB_CONTROL_DB_ATTACH;
        }
-       if (reply->status != 0) {
-               DEBUG(db->ctdb, LOG_ERR,
-                     "ctdb_attachdb_recv: reply status %i", reply->status);
+
+       req = new_ctdb_control_request(ctdb, opcode, CTDB_CURRENT_NODE, name,
+                                      strlen(name) + 1, attachdb_done, db); /* payload is name including its NUL; db is the callback data */
+       if (!req) {
+               DEBUG(ctdb, LOG_ERR,
+                     "ctdb_attachdb_send: failed allocating DB_ATTACH");
+               free(db);               /* nothing else references db yet */
                return NULL;
        }
 
-       tdb_flags = db->persistent ? TDB_DEFAULT : TDB_NOSYNC;
-       tdb_flags |= TDB_DISALLOW_NESTING;
+       db->ctdb = ctdb;
+       db->tdb_flags = tdb_flags;
+       db->persistent = persistent;
+       db->callback = callback;        /* user callback is stored on the db, not on req */
+       db->private_data = private_data;
 
-       /* FIXME: Setup logging to go through our logging. */
-       db->tdb = tdb_open((char *)reply->data, 0, tdb_flags, O_RDWR, 0);
-       if (db->tdb == NULL) {
-               DEBUG(db->ctdb, LOG_ERR,
-                     "ctdb_attachdb_recv: failed to tdb_open %s",
-                     (char *)reply->data);
-               return NULL;
-       }
+       req->extra_destructor = destroy_req_db; /* frees req->priv_data and any chained request along with req */
+       /* This is set non-NULL when we succeed, see ctdb_attachdb_recv */
+       req->extra = NULL;
 
-       /* Finally, separate the db from the request (see destroy_req_db). */
-       req->priv_data = NULL;
+       /* Flags get overloaded into srvid. */
+       req->hdr.control->srvid = tdb_flags;
        DEBUG(db->ctdb, LOG_DEBUG,
-             "ctdb_attachdb_recv: db %p, tdb %s", db, (char *)reply->data);
-       return db;
+             "ctdb_attachdb_send: DB_ATTACH request %p", req);
+       return req;
+}
+
+static void destroy_req_db(struct ctdb_connection *ctdb,
+                          struct ctdb_request *req)
+{ /* extra_destructor installed by ctdb_attachdb_send on its request. */
+       /* Incomplete db is in priv_data. */
+       free(req->priv_data);           /* ctdb_attachdb_recv sets this to NULL on success, so a returned db is not freed */
+       /* second request is chained off this one. */
+       if (req->extra) {
+               ctdb_request_free(req->extra);
+       }
 }
 
 static void attachdb_done(struct ctdb_connection *ctdb,
@@ -544,7 +648,7 @@ static void attachdb_done(struct ctdb_connection *ctdb,
                control = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
        }
 
-       reply = unpack_reply_control(ctdb, req, control);
+       reply = unpack_reply_control(req, control);
        if (!reply || reply->status != 0) {
                if (reply) {
                        DEBUG(ctdb, LOG_ERR,
@@ -575,62 +679,61 @@ static void attachdb_done(struct ctdb_connection *ctdb,
              "ctdb_attachdb_send(async): created getdbpath request");
 }
 
-static void destroy_req_db(struct ctdb_connection *ctdb,
-                          struct ctdb_request *req)
+static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
+                                   struct ctdb_request *req,
+                                   void *_db) /* _db: the struct ctdb_db being attached */
 {
-       /* Incomplete db is in priv_data. */
-       free(req->priv_data);
-       /* second request is chained off this one. */
-       if (req->extra) {
-               ctdb_request_free(ctdb, req->extra);
-       }
+       struct ctdb_db *db = _db;
+
+       /* Do callback on original request. */
+       db->callback(ctdb, req->extra, db->private_data); /* the user's callback receives req->extra, not req itself */
 }
 
-struct ctdb_request *
-ctdb_attachdb_send(struct ctdb_connection *ctdb,
-                  const char *name, int persistent, uint32_t tdb_flags,
-                  ctdb_callback_t callback, void *private_data)
+struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
+                                  struct ctdb_request *req) /* Finish an attach: open the tdb named in the GETDBPATH reply; returns the db or NULL (errno set on some paths). */
 {
-       struct ctdb_request *req;
-       struct ctdb_db *db;
-       uint32_t opcode;
+       struct ctdb_request *dbpath_req = req->extra;
+       struct ctdb_reply_control *reply;
+       struct ctdb_db *db = req->priv_data; /* NOTE(review): dereferenced on the next line with no NULL check; priv_data is NULL after a successful recv */
+       uint32_t tdb_flags = db->tdb_flags; /* NOTE(review): overwritten below before any use - confirm the caller's flags are meant to be ignored here */
+       struct tdb_logging_context log;
 
-       /* FIXME: Search if db already open. */
-       db = malloc(sizeof(*db));
-       if (!db) {
+       /* Never sent the dbpath request?  We've failed. */
+       if (!dbpath_req) {
+               /* FIXME: Save errno? */
+               errno = EINVAL;
                return NULL;
        }
 
-       if (persistent) {
-               opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
-       } else {
-               opcode = CTDB_CONTROL_DB_ATTACH;
+       reply = unpack_reply_control(dbpath_req, CTDB_CONTROL_GETDBPATH);
+       if (!reply) {
+               return NULL;
        }
-
-       req = new_ctdb_control_request(ctdb, opcode, CTDB_CURRENT_NODE, name,
-                                      strlen(name) + 1, attachdb_done, db);
-       if (!req) {
+       if (reply->status != 0) {
                DEBUG(db->ctdb, LOG_ERR,
-                     "ctdb_attachdb_send: failed allocating DB_ATTACH");
-               free(db);
+                     "ctdb_attachdb_recv: reply status %i", reply->status);
                return NULL;
        }
 
-       db->ctdb = ctdb;
-       db->tdb_flags = tdb_flags;
-       db->persistent = persistent;
-       db->callback = callback;
-       db->private_data = private_data;
+       tdb_flags = db->persistent ? TDB_DEFAULT : TDB_NOSYNC; /* open flags depend only on persistence */
+       tdb_flags |= TDB_DISALLOW_NESTING;
 
-       req->extra_destructor = destroy_req_db;
-       /* This is set non-NULL when we succeed, see ctdb_attachdb_recv */
-       req->extra = NULL;
+       log.log_fn = ctdb_tdb_log_bridge; /* tdb log output goes through ctdb_tdb_log_bridge with ctdb as its private data */
+       log.log_private = ctdb;
+       db->tdb = tdb_open_ex((char *)reply->data, 0, tdb_flags, O_RDWR, 0,
+                             &log, NULL); /* reply->data is used as the tdb path */
+       if (db->tdb == NULL) {
+               DEBUG(db->ctdb, LOG_ERR,
+                     "ctdb_attachdb_recv: failed to tdb_open %s",
+                     (char *)reply->data);
+               return NULL;            /* req->priv_data still points at db, so destroy_req_db will free it */
+       }
 
-       /* Flags get overloaded into srvid. */
-       req->hdr.control->srvid = tdb_flags;
+       /* Finally, separate the db from the request (see destroy_req_db). */
+       req->priv_data = NULL;
        DEBUG(db->ctdb, LOG_DEBUG,
-             "ctdb_attachdb_send: DB_ATTACH request %p", req);
-       return req;
+             "ctdb_attachdb_recv: db %p, tdb %s", db, (char *)reply->data);
+       return db;
 }
 
 static unsigned long lock_magic(struct ctdb_lock *lock)
@@ -646,7 +749,7 @@ static unsigned long lock_magic(struct ctdb_lock *lock)
 static void free_lock(struct ctdb_lock *lock)
 {
        if (lock->held_magic) {
-               DEBUG(lock->ctdb_db->ctdb, LOG_CRIT,
+               DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
                      "free_lock invalid lock %p", lock);
        }
        free(lock->hdr);
@@ -654,11 +757,15 @@ static void free_lock(struct ctdb_lock *lock)
 }
 
 
-void ctdb_release_lock(struct ctdb_lock *lock)
+void ctdb_release_lock(struct ctdb_db *ctdb_db, struct ctdb_lock *lock)
 {
        if (lock->held_magic != lock_magic(lock)) {
-               DEBUG(lock->ctdb_db->ctdb, LOG_CRIT,
+               DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
                      "ctdb_release_lock invalid lock %p", lock);
+       } else if (lock->ctdb_db != ctdb_db) {
+               errno = EBADF;
+               DEBUG(ctdb_db->ctdb, LOG_ALERT,
+                     "ctdb_release_lock: wrong ctdb_db.");
        } else {
                tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
                DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
@@ -682,6 +789,14 @@ static bool try_readrecordlock(struct ctdb_lock *lock, TDB_DATA *data)
        }
 
        hdr = ctdb_local_fetch(lock->ctdb_db->tdb, lock->key, data);
+       if (hdr && lock->readonly && (hdr->flags & CTDB_REC_RO_HAVE_READONLY) ) {
+               DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
+                     "ctdb_readrecordlock_async: got local lock for ro");
+               lock->held_magic = lock_magic(lock);
+               lock->hdr = hdr;
+               add_lock(lock->ctdb_db->ctdb, lock);
+               return true;
+       }
        if (hdr && hdr->dmaster == lock->ctdb_db->ctdb->pnn) {
                DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
                      "ctdb_readrecordlock_async: got local lock");
@@ -691,6 +806,13 @@ static bool try_readrecordlock(struct ctdb_lock *lock, TDB_DATA *data)
                return true;
        }
 
+       /* We don't have the record locally:
+        * drop to a write lock to force a migration.
+        */
+       if (!hdr && lock->readonly) {
+               lock->readonly = false;
+       }
+
        tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
        free(hdr);
        return NULL;
@@ -710,16 +832,16 @@ static void readrecordlock_retry(struct ctdb_connection *ctdb,
        struct ctdb_reply_call *reply;
        TDB_DATA data;
 
-       /* OK, we've received reply to noop migration */
-       reply = unpack_reply_call(ctdb, req, CTDB_NULL_FUNC);
+       /* OK, we've received reply to fetch-with-header migration */
+       reply = unpack_reply_call(req, CTDB_FETCH_WITH_HEADER_FUNC);
        if (!reply || reply->status != 0) {
                if (reply) {
                        DEBUG(ctdb, LOG_ERR,
                              "ctdb_readrecordlock_async(async):"
-                             " NULL_FUNC returned %i", reply->status);
+                             " FETCH_WITH_HEADER_FUNC returned %i", reply->status);
                }
                lock->callback(lock->ctdb_db, NULL, tdb_null, private);
-               ctdb_request_free(ctdb, req); /* Also frees lock. */
+               ctdb_request_free(req); /* Also frees lock. */
                return;
        }
 
@@ -728,6 +850,7 @@ static void readrecordlock_retry(struct ctdb_connection *ctdb,
                /* Now it's their responsibility to free lock & request! */
                req->extra_destructor = NULL;
                lock->callback(lock->ctdb_db, lock, data, private);
+               ctdb_request_free(req);
                return;
        }
 
@@ -736,16 +859,17 @@ static void readrecordlock_retry(struct ctdb_connection *ctdb,
        DLIST_ADD(ctdb->outq, req);
 }
 
-bool
-ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
-                         ctdb_rrl_callback_t callback, void *cbdata)
+static bool
+ctdb_readrecordlock_internal(struct ctdb_db *ctdb_db, TDB_DATA key,
+                            bool readonly,
+                            ctdb_rrl_callback_t callback, void *cbdata)
 {
        struct ctdb_request *req;
        struct ctdb_lock *lock;
        TDB_DATA data;
 
        if (holding_lock(ctdb_db->ctdb)) {
-               DEBUG(ctdb_db->ctdb, LOG_ERR,
+               DEBUG(ctdb_db->ctdb, LOG_ALERT,
                      "ctdb_readrecordlock_async: already holding lock");
                return false;
        }
@@ -763,6 +887,7 @@ ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
        lock->ctdb_db = ctdb_db;
        lock->hdr = NULL;
        lock->held_magic = 0;
+       lock->readonly = readonly;
 
        /* Fast path. */
        if (try_readrecordlock(lock, &data)) {
@@ -771,8 +896,10 @@ ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
        }
 
        /* Slow path: create request. */
-       req = new_ctdb_request(offsetof(struct ctdb_req_call, data)
-                              + key.dsize, readrecordlock_retry, cbdata);
+       req = new_ctdb_request(
+               ctdb_db->ctdb,
+               offsetof(struct ctdb_req_call, data) + key.dsize,
+               readrecordlock_retry, cbdata);
        if (!req) {
                DEBUG(ctdb_db->ctdb, LOG_ERR,
                      "ctdb_readrecordlock_async: allocation failed");
@@ -787,9 +914,13 @@ ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
        io_elem_init_req_header(req->io, CTDB_REQ_CALL, CTDB_CURRENT_NODE,
                                new_reqid(ctdb_db->ctdb));
 
-       req->hdr.call->flags = CTDB_IMMEDIATE_MIGRATION;
+       if (lock->readonly) {
+               req->hdr.call->flags = CTDB_WANT_READONLY;
+       } else {
+               req->hdr.call->flags = CTDB_IMMEDIATE_MIGRATION;
+       }
        req->hdr.call->db_id = ctdb_db->id;
-       req->hdr.call->callid = CTDB_NULL_FUNC;
+       req->hdr.call->callid = CTDB_FETCH_WITH_HEADER_FUNC;
        req->hdr.call->hopcount = 0;
        req->hdr.call->keylen = key.dsize;
        req->hdr.call->calldatalen = 0;
@@ -798,22 +929,281 @@ ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
        return true;
 }
 
-int ctdb_writerecord(struct ctdb_lock *lock, TDB_DATA data)
+/*
+ * Ask for a record lock on 'key' without requesting read-only access.
+ *
+ * Thin wrapper: forwards to ctdb_readrecordlock_internal() with the
+ * readonly flag cleared and returns whatever that call returns.
+ */
+bool
+ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
+                         ctdb_rrl_callback_t callback, void *cbdata)
+{
+       const bool want_readonly = false;
+
+       return ctdb_readrecordlock_internal(ctdb_db, key, want_readonly,
+                                           callback, cbdata);
+}
+
+/*
+ * Ask for a read-only record lock on 'key'.
+ *
+ * Thin wrapper: forwards to ctdb_readrecordlock_internal() with the
+ * readonly flag set and returns whatever that call returns.
+ */
+bool
+ctdb_readonlyrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
+                         ctdb_rrl_callback_t callback, void *cbdata)
+{
+       const bool want_readonly = true;
+
+       return ctdb_readrecordlock_internal(ctdb_db, key, want_readonly,
+                                           callback, cbdata);
+}
+
+/*
+ * Store 'data' in the record protected by 'lock'.
+ *
+ * Returns true when ctdb_local_store() reports 0 (logged as an
+ * optimized-away noop write) or 1, and false otherwise.  Fails with
+ * errno set to EBADF when 'lock' belongs to a different ctdb_db or its
+ * held_magic no longer matches lock_magic(lock), and with EINVAL when
+ * ctdb_db is persistent.
+ *
+ * NOTE(review): lock->readonly is not consulted here -- confirm whether
+ * writes through a read-only lock should be rejected.
+ */
+bool ctdb_writerecord(struct ctdb_db *ctdb_db,
+                     struct ctdb_lock *lock, TDB_DATA data)
 {
+       /* The lock must have been taken on the database passed in. */
+       if (lock->ctdb_db != ctdb_db) {
+               errno = EBADF;
+               DEBUG(ctdb_db->ctdb, LOG_ALERT,
+                     "ctdb_writerecord: Can not write, wrong ctdb_db.");
+               return false;
+       }
+
        if (lock->held_magic != lock_magic(lock)) {
                errno = EBADF;
-               DEBUG(lock->ctdb_db->ctdb, LOG_ERR,
+               DEBUG(ctdb_db->ctdb, LOG_ALERT,
                      "ctdb_writerecord: Can not write. Lock has been released.");
-               return -1;
+               return false;
        }
                
-       if (lock->ctdb_db->persistent) {
+       if (ctdb_db->persistent) {
                errno = EINVAL;
-               DEBUG(lock->ctdb_db->ctdb, LOG_ERR,
+               DEBUG(ctdb_db->ctdb, LOG_ALERT,
                      "ctdb_writerecord: cannot write to persistent db");
-               return -1;
+               return false;
+       }
+
+       /* 0 and 1 both count as success; anything else is classified by errno. */
+       switch (ctdb_local_store(ctdb_db->tdb, lock->key, lock->hdr, data)) {
+       case 0:
+               DEBUG(ctdb_db->ctdb, LOG_DEBUG,
+                     "ctdb_writerecord: optimized away noop write.");
+               /* fall thru */
+       case 1:
+               return true;
+
+       default:
+               switch (errno) {
+               case ENOMEM:
+                       DEBUG(ctdb_db->ctdb, LOG_CRIT,
+                             "ctdb_writerecord: out of memory.");
+                       break;
+               case EINVAL:
+                       DEBUG(ctdb_db->ctdb, LOG_ALERT,
+                             "ctdb_writerecord: record changed under lock?");
+                       break;
+               default: /* TDB already logged. */
+                       break;
+               }
+               return false;
+       }
+}
+
+
+/*
+ * Bookkeeping for one traverse started by ctdb_traverse_async().  It is
+ * passed as private data to the traverse callbacks and to the message
+ * handler.
+ */
+struct ctdb_traverse_state {
+       /* Request currently tracked for this traverse; NULL once freed. */
+       struct ctdb_request *handle;
+       /* Database being traversed. */
+       struct ctdb_db *ctdb_db;
+       /* Service id the traverse message handler is registered under. */
+       uint64_t srvid;
+
+       /* User callback and the opaque argument handed back to it. */
+       ctdb_traverse_callback_t callback;
+       void *cbdata;
+};
+
+/*
+ * Completion callback for the remove-message-handler request that ends
+ * a traverse.  Reports TRAVERSE_STATUS_ERROR to the user callback if the
+ * removal failed, then frees the request and the traverse state.
+ *
+ * 'req' is unused; the request is reached through state->handle.
+ */
+static void traverse_remhnd_cb(struct ctdb_connection *ctdb,
+                        struct ctdb_request *req, void *private_data)
+{
+       struct ctdb_traverse_state *state = private_data;
+
+       if (!ctdb_remove_message_handler_recv(ctdb, state->handle)) {
+               DEBUG(ctdb, LOG_ERR,
+                               "Failed to remove message handler for"
+                               " traverse.");
+               state->callback(state->ctdb_db->ctdb, state->ctdb_db,
+                               TRAVERSE_STATUS_ERROR,
+                               tdb_null, tdb_null,
+                               state->cbdata);
        }
+       /* The state is released whether or not the removal succeeded. */
+       ctdb_request_free(state->handle);
+       state->handle = NULL;
+       free(state);
+}
+       
+/*
+ * Message handler receiving the records of a running traverse.
+ *
+ * 'data' is expected to hold one marshalled struct ctdb_rec_data:
+ *  - a malformed message reports TRAVERSE_STATUS_ERROR and ends the
+ *    traverse;
+ *  - a message with empty key and empty data reports
+ *    TRAVERSE_STATUS_FINISHED and ends the traverse;
+ *  - a record whose data is no larger than a struct ctdb_ltdb_header is
+ *    treated as deleted and skipped;
+ *  - any other record is passed to the user callback with the
+ *    ctdb_ltdb_header stripped; a non-zero return ends the traverse.
+ *
+ * Ending the traverse means requesting removal of this handler, with
+ * traverse_remhnd_cb() as the completion callback.  'srvid' is unused.
+ */
+static void msg_h(struct ctdb_connection *ctdb, uint64_t srvid,
+          TDB_DATA data, void *private_data)
+{
+       struct ctdb_traverse_state *state = private_data;
+       struct ctdb_db *ctdb_db = state->ctdb_db;
+       struct ctdb_rec_data *d = (struct ctdb_rec_data *)data.dptr;
+       const size_t hdrlen = offsetof(struct ctdb_rec_data, data);
+       TDB_DATA key;
+
+       /*
+        * Validate before trusting any field: the fixed header must be
+        * complete, the advertised length must match what was received,
+        * and key + data must fit in the payload.  The checks are
+        * ordered so that none of the subtractions can wrap.
+        */
+       if (data.dsize < hdrlen ||
+           d->length != data.dsize ||
+           d->keylen > data.dsize - hdrlen ||
+           d->datalen > data.dsize - hdrlen - d->keylen) {
+               DEBUG(ctdb, LOG_ERR,
+                       "Bad data size %u in traverse_handler",
+                       (unsigned)data.dsize);
+               state->callback(state->ctdb_db->ctdb, state->ctdb_db,
+                               TRAVERSE_STATUS_ERROR,
+                               tdb_null, tdb_null,
+                               state->cbdata);
+               state->handle = ctdb_remove_message_handler_send(
+                               state->ctdb_db->ctdb, state->srvid,
+                               msg_h, state,
+                               traverse_remhnd_cb, state);
+               return;
+       }
+
+       /* Key bytes come first in d->data, the record data follows. */
+       key.dsize = d->keylen;
+       key.dptr  = &d->data[0];
+       data.dsize = d->datalen;
+       data.dptr = &d->data[d->keylen];
+
+       /* An empty key with empty data marks the end of the traverse. */
+       if (key.dsize == 0 && data.dsize == 0) {
+               state->callback(state->ctdb_db->ctdb, state->ctdb_db,
+                               TRAVERSE_STATUS_FINISHED,
+                               tdb_null, tdb_null,
+                               state->cbdata);
+               state->handle = ctdb_remove_message_handler_send(
+                               state->ctdb_db->ctdb, state->srvid,
+                               msg_h, state,
+                               traverse_remhnd_cb, state);
+               return;
+       }
+
+       if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
+               /* empty records are deleted records in ctdb */
+               return;
+       }
+
+       /* Hide the ctdb_ltdb_header from the user callback. */
+       data.dsize -= sizeof(struct ctdb_ltdb_header);
+       data.dptr  += sizeof(struct ctdb_ltdb_header);
+
+       /* A non-zero return from the callback stops the traverse. */
+       if (state->callback(ctdb, ctdb_db,
+                       TRAVERSE_STATUS_RECORD,
+                       key, data, state->cbdata) != 0) {
+               state->handle = ctdb_remove_message_handler_send(
+                               state->ctdb_db->ctdb, state->srvid,
+                               msg_h, state,
+                               traverse_remhnd_cb, state);
+               return;
+       }
+}
+
+/*
+ * Completion callback for the CTDB_CONTROL_TRAVERSE_START request:
+ * releases the request held in state->handle and clears the pointer.
+ *
+ * NOTE(review): the outcome of the control is not inspected here, and
+ * 'ctdb' and 'req' are unused -- confirm that a failed traverse start
+ * is reported some other way.
+ */
+static void traverse_start_cb(struct ctdb_connection *ctdb,
+                        struct ctdb_request *req, void *private_data)
+{
+       struct ctdb_traverse_state *state = private_data;
 
-       return ctdb_local_store(lock->ctdb_db->tdb, lock->key, lock->hdr,
-                               data);
+        ctdb_request_free(state->handle);
+       state->handle = NULL;
 }
+
+/*
+ * Completion callback for the set-message-handler request issued by
+ * ctdb_traverse_async().
+ *
+ * If registration failed, the user callback gets TRAVERSE_STATUS_ERROR
+ * and the traverse state is freed.  Otherwise a
+ * CTDB_CONTROL_TRAVERSE_START control is created for CTDB_CURRENT_NODE;
+ * if that request cannot be created the error is reported and removal
+ * of the message handler is requested.  'req' is unused.
+ */
+static void traverse_msghnd_cb(struct ctdb_connection *ctdb,
+                        struct ctdb_request *req, void *private_data)
+{
+       struct ctdb_traverse_state *state = private_data;
+       struct ctdb_traverse_start start;
+       bool registered;
+
+       registered = ctdb_set_message_handler_recv(ctdb, state->handle);
+       if (!registered) {
+               DEBUG(ctdb, LOG_ERR,
+                               "Failed to register message handler for"
+                               " traverse.");
+               state->callback(state->ctdb_db->ctdb, state->ctdb_db,
+                               TRAVERSE_STATUS_ERROR,
+                               tdb_null, tdb_null,
+                               state->cbdata);
+       }
+
+       /* The set-handler request is released on both outcomes. */
+       ctdb_request_free(state->handle);
+       state->handle = NULL;
+
+       if (!registered) {
+               free(state);
+               return;
+       }
+
+       start.db_id = state->ctdb_db->id;
+       start.srvid = state->srvid;
+       start.reqid = 0;
+
+       state->handle = new_ctdb_control_request(ctdb,
+                               CTDB_CONTROL_TRAVERSE_START,
+                               CTDB_CURRENT_NODE,
+                               &start, sizeof(start),
+                               traverse_start_cb, state);
+       if (state->handle != NULL) {
+               return;
+       }
+
+       /* Could not create the control: report it and unregister msg_h. */
+       DEBUG(ctdb, LOG_ERR,
+                       "ctdb_traverse_async:"
+                       " failed to send traverse_start control");
+       state->callback(state->ctdb_db->ctdb, state->ctdb_db,
+                       TRAVERSE_STATUS_ERROR,
+                       tdb_null, tdb_null,
+                       state->cbdata);
+       state->handle = ctdb_remove_message_handler_send(
+                       state->ctdb_db->ctdb, state->srvid,
+                       msg_h, state,
+                       traverse_remhnd_cb, state);
+}
+
+/*
+ * Start an asynchronous traverse of ctdb_db.
+ *
+ * Allocates the traverse state and asks for msg_h to be registered as
+ * message handler on a per-call srvid; traverse_msghnd_cb() continues
+ * once that request completes.  'callback' and 'cbdata' are stored in
+ * the state for later use by the traverse callbacks.
+ *
+ * Returns true once the registration request has been created, false if
+ * the state could not be allocated or the request could not be created.
+ */
+bool ctdb_traverse_async(struct ctdb_db *ctdb_db,
+                        ctdb_traverse_callback_t callback, void *cbdata)
+{
+       /* Bumped once per call and OR-ed into the traverse srvid range. */
+       static uint32_t traverse_seq = 0;
+       struct ctdb_connection *ctdb = ctdb_db->ctdb;
+       struct ctdb_traverse_state *state = malloc(sizeof(*state));
+
+       if (!state) {
+               DEBUG(ctdb, LOG_ERR,
+                               "ctdb_traverse_async: no memory."
+                               " allocate state failed");
+               return false;
+       }
+
+       state->ctdb_db  = ctdb_db;
+       state->callback = callback;
+       state->cbdata   = cbdata;
+       state->srvid    = CTDB_SRVID_TRAVERSE_RANGE | ++traverse_seq;
+
+       state->handle = ctdb_set_message_handler_send(ctdb,
+                               state->srvid,
+                               msg_h, state,
+                               traverse_msghnd_cb, state);
+       if (state->handle != NULL) {
+               return true;
+       }
+
+       DEBUG(ctdb, LOG_ERR,
+               "ctdb_traverse_async:"
+               " failed ctdb_set_message_handler_send");
+       free(state);
+       return false;
+}
+
+/* Count the requests currently linked on ctdb->outq. */
+int ctdb_num_out_queue(struct ctdb_connection *ctdb)
+{
+       const struct ctdb_request *cur = ctdb->outq;
+       int count = 0;
+
+       while (cur != NULL) {
+               count++;
+               cur = cur->next;
+       }
+
+       return count;
+}
+
+/* Count the requests currently linked on ctdb->doneq. */
+int ctdb_num_in_flight(struct ctdb_connection *ctdb)
+{
+       const struct ctdb_request *cur = ctdb->doneq;
+       int count = 0;
+
+       while (cur != NULL) {
+               count++;
+               cur = cur->next;
+       }
+
+       return count;
+}
+
+/*
+ * Sum of ctdb_num_out_queue() and ctdb_num_in_flight() for this
+ * connection.
+ */
+int ctdb_num_active(struct ctdb_connection *ctdb)
+{
+       int queued = ctdb_num_out_queue(ctdb);
+       int in_flight = ctdb_num_in_flight(ctdb);
+
+       return queued + in_flight;
+}
+