Merge branch 'master-readonly-records' into foo
[sahlberg/ctdb.git] / client / ctdb_client.c
index 96214353988cc234e2ff850ff8e513ada562c671..89eeb4836a14bca81a422765b558854f1fb777e0 100644 (file)
@@ -22,7 +22,7 @@
 #include "db_wrap.h"
 #include "lib/tdb/include/tdb.h"
 #include "lib/util/dlinklist.h"
-#include "lib/events/events.h"
+#include "lib/tevent/tevent.h"
 #include "system/network.h"
 #include "system/filesys.h"
 #include "system/locale.h"
@@ -30,6 +30,8 @@
 #include "../include/ctdb_private.h"
 #include "lib/util/dlinklist.h"
 
+pid_t ctdbd_pid;
+
 /*
   allocate a packet for use in client<->daemon communication
  */
@@ -70,7 +72,7 @@ struct ctdb_req_header *_ctdbd_allocate_pkt(struct ctdb_context *ctdb,
 */
 int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
                    struct ctdb_ltdb_header *header, TALLOC_CTX *mem_ctx,
-                   TDB_DATA *data, uint32_t caller)
+                   TDB_DATA *data, bool updatetdb)
 {
        struct ctdb_call_info *c;
        struct ctdb_registered_call *fn;
@@ -87,6 +89,7 @@ int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
        c->new_data = NULL;
        c->reply_data = NULL;
        c->status = 0;
+       c->header = header;
 
        for (fn=ctdb_db->calls;fn;fn=fn->next) {
                if (fn->id == call->call_id) break;
@@ -103,19 +106,12 @@ int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
                return -1;
        }
 
-       if (header->laccessor != caller) {
-               header->lacount = 0;
-       }
-       header->laccessor = caller;
-       header->lacount++;
-
-       /* we need to force the record to be written out if this was a remote access,
-          so that the lacount is updated */
-       if (c->new_data == NULL && header->laccessor != ctdb->pnn) {
+       /* we need to force the record to be written out if this was a remote access */
+       if (c->new_data == NULL) {
                c->new_data = &c->record_data;
        }
 
-       if (c->new_data) {
+       if (c->new_data && updatetdb) {
                /* XXX check that we always have the lock here? */
                if (ctdb_ltdb_store(ctdb_db, call->key, header, *c->new_data) != 0) {
                        ctdb_set_error(ctdb, "ctdb_call tdb_store failed\n");
@@ -191,7 +187,7 @@ static void ctdb_client_reply_control(struct ctdb_context *ctdb, struct ctdb_req
 /*
   this is called in the client, when data comes in from the daemon
  */
-static void ctdb_client_read_cb(uint8_t *data, size_t cnt, void *args)
+void ctdb_client_read_cb(uint8_t *data, size_t cnt, void *args)
 {
        struct ctdb_context *ctdb = talloc_get_type(args, struct ctdb_context);
        struct ctdb_req_header *hdr = (struct ctdb_req_header *)data;
@@ -251,16 +247,41 @@ done:
 }
 
 /*
-  connect to a unix domain socket
+  connect with exponential backoff, thanks Stevens
 */
-int ctdb_socket_connect(struct ctdb_context *ctdb)
+/* largest value, in seconds, the doubling retry interval may reach */
+#define CONNECT_MAXSLEEP 64
+/*
+  connect ctdb->daemon.sd to the unix domain socket named by
+  ctdb->daemon.name, retrying for as long as connect() fails with EAGAIN.
+  The wait between attempts doubles (1, 2, 4, ... seconds); the attempt
+  made when secs reaches CONNECT_MAXSLEEP is the last one and is not
+  followed by a sleep.
+  Returns the result of the last connect() call; nothing else is called
+  between that connect() and the return, so errno is left as it set it.
+ */
+static int ctdb_connect_retry(struct ctdb_context *ctdb)
 {
        struct sockaddr_un addr;
+       int secs;
+       int ret = 0;
 
        memset(&addr, 0, sizeof(addr));
        addr.sun_family = AF_UNIX;
+       /* NOTE(review): strncpy() leaves sun_path without a terminating NUL
+          when daemon.name is at least sizeof(addr.sun_path) long - confirm
+          the name is always shorter */
        strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path));
 
+       for (secs = 1; secs <= CONNECT_MAXSLEEP; secs *= 2) {
+               ret = connect(ctdb->daemon.sd, (struct sockaddr *)&addr,
+                             sizeof(addr));
+               /* stop on success; errors other than EAGAIN are not retried */
+               if ((ret == 0) || (errno != EAGAIN)) {
+                       break;
+               }
+
+               /* only log and sleep when another attempt will follow */
+               if (secs <= (CONNECT_MAXSLEEP / 2)) {
+                       DEBUG(DEBUG_ERR,("connect failed: %s, retry in %d second(s)\n",
+                                        strerror(errno), secs));
+                       sleep(secs);
+               }
+       }
+
+       return ret;
+}
+
+/*
+  connect to a unix domain socket
+*/
+int ctdb_socket_connect(struct ctdb_context *ctdb)
+{
        ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
        if (ctdb->daemon.sd == -1) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to open client socket. Errno:%s(%d)\n", strerror(errno), errno));
@@ -269,17 +290,17 @@ int ctdb_socket_connect(struct ctdb_context *ctdb)
 
        set_nonblocking(ctdb->daemon.sd);
        set_close_on_exec(ctdb->daemon.sd);
-       
-       if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+
+       if (ctdb_connect_retry(ctdb) == -1) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to connect client socket to daemon. Errno:%s(%d)\n", strerror(errno), errno));
                close(ctdb->daemon.sd);
                ctdb->daemon.sd = -1;
-               DEBUG(DEBUG_ERR,(__location__ " Failed to connect client socket to daemon. Errno:%s(%d)\n", strerror(errno), errno));
                return -1;
        }
 
        ctdb->daemon.queue = ctdb_queue_setup(ctdb, ctdb, ctdb->daemon.sd, 
                                              CTDB_DS_ALIGNMENT, 
-                                             ctdb_client_read_cb, ctdb);
+                                             ctdb_client_read_cb, ctdb, "to-ctdbd");
        return 0;
 }
 
@@ -325,7 +346,7 @@ int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call)
        call->status = state->call->status;
        talloc_free(state);
 
-       return 0;
+       return call->status;
 }
 
 
@@ -366,7 +387,7 @@ static struct ctdb_client_call_state *ctdb_client_call_local_send(struct ctdb_db
        *(state->call) = *call;
        state->ctdb_db = ctdb_db;
 
-       ret = ctdb_call_local(ctdb_db, state->call, header, state, data, ctdb->pnn);
+       ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true);
 
        return state;
 }
@@ -401,6 +422,10 @@ struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db,
 
        ret = ctdb_ltdb_fetch(ctdb_db, call->key, &header, ctdb_db, &data);
 
+       if ((call->flags & CTDB_IMMEDIATE_MIGRATION) && (header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+               ret = -1;
+       }
+
        if (ret == 0 && header.dmaster == ctdb->pnn) {
                state = ctdb_client_call_local_send(ctdb_db, call, &header, &data);
                talloc_free(data.dptr);
@@ -472,8 +497,8 @@ int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
   tell the daemon what messaging srvid we will use, and register the message
   handler function in the client
 */
-int ctdb_set_message_handler(struct ctdb_context *ctdb, uint64_t srvid, 
-                            ctdb_message_fn_t handler,
+int ctdb_client_set_message_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                            ctdb_msg_fn_t handler,
                             void *private_data)
                                    
 {
@@ -494,7 +519,7 @@ int ctdb_set_message_handler(struct ctdb_context *ctdb, uint64_t srvid,
 /*
   tell the daemon we no longer want a srvid
 */
-int ctdb_remove_message_handler(struct ctdb_context *ctdb, uint64_t srvid, void *private_data)
+int ctdb_client_remove_message_handler(struct ctdb_context *ctdb, uint64_t srvid, void *private_data)
 {
        int res;
        int32_t status;
@@ -515,7 +540,7 @@ int ctdb_remove_message_handler(struct ctdb_context *ctdb, uint64_t srvid, void
 /*
   send a message - from client context
  */
-int ctdb_send_message(struct ctdb_context *ctdb, uint32_t pnn,
+int ctdb_client_send_message(struct ctdb_context *ctdb, uint32_t pnn,
                      uint64_t srvid, TDB_DATA data)
 {
        struct ctdb_req_message *r;
@@ -563,6 +588,48 @@ static int ctdb_client_force_migration(struct ctdb_db_context *ctdb_db, TDB_DATA
        return ctdb_call(ctdb_db, &call);
 }
 
+/*
+  try to fetch a readonly copy of a record
+
+  Issues a CTDB_FETCH_WITH_HEADER_FUNC call with CTDB_WANT_READONLY set.
+  The reply is expected to be a struct ctdb_ltdb_header immediately
+  followed by the record data; both parts are copied onto mem_ctx and
+  returned through *hdr and *data.
+
+  Returns 0 on success, -1 if the call fails, if the reply is too short
+  to hold a header, or if a copy cannot be allocated. If copying the
+  record data fails, the header copy is released again and *hdr is reset
+  to NULL.
+ */
+static int
+ctdb_client_fetch_readonly(struct ctdb_db_context *ctdb_db, TDB_DATA key, TALLOC_CTX *mem_ctx, struct ctdb_ltdb_header **hdr, TDB_DATA *data)
+{
+       int ret;
+       const size_t hdr_len = sizeof(struct ctdb_ltdb_header);
+       struct ctdb_call call;
+
+       ZERO_STRUCT(call);
+
+       call.call_id = CTDB_FETCH_WITH_HEADER_FUNC;
+       call.call_data.dptr = NULL;
+       call.call_data.dsize = 0;
+       call.key = key;
+       call.flags = CTDB_WANT_READONLY;
+       ret = ctdb_call(ctdb_db, &call);
+
+       if (ret != 0) {
+               return -1;
+       }
+       if (call.reply_data.dsize < hdr_len) {
+               /* an unusable reply still has to be released by us */
+               talloc_free(call.reply_data.dptr);
+               return -1;
+       }
+
+       *hdr = talloc_memdup(mem_ctx, &call.reply_data.dptr[0], hdr_len);
+       if (*hdr == NULL) {
+               talloc_free(call.reply_data.dptr);
+               return -1;
+       }
+
+       data->dsize = call.reply_data.dsize - hdr_len;
+       data->dptr  = talloc_memdup(mem_ctx, &call.reply_data.dptr[hdr_len], data->dsize);
+       if (data->dptr == NULL) {
+               talloc_free(call.reply_data.dptr);
+               /* free the header copy, not hdr itself: hdr is only the
+                  caller's out-parameter */
+               talloc_free(*hdr);
+               *hdr = NULL;
+               data->dsize = 0;
+               return -1;
+       }
+
+       /* both parts have been copied, the raw reply is no longer needed */
+       talloc_free(call.reply_data.dptr);
+
+       return 0;
+}
+
 /*
   get a lock on a record, and return the records data. Blocks until it gets the lock
  */
@@ -639,6 +706,185 @@ again:
        return h;
 }
 
+/*
+  get a readonly lock on a record, and return the records data. Blocks until it gets the lock
+
+  read_only == 0 requests the record for read/write: the call keeps
+  forcing a migration until this node is dmaster and the record has no
+  readonly delegations.
+  read_only != 0 is also satisfied by a readonly copy of a record whose
+  dmaster is another node; such a copy is fetched with
+  ctdb_client_fetch_readonly() and stored in the local tdb.
+
+  Returns a handle allocated on mem_ctx with the record data in *data, or
+  NULL on failure. On success the lock taken with ctdb_ltdb_lock() is
+  still held when the function returns.
+ */
+struct ctdb_record_handle *
+ctdb_fetch_readonly_lock(
+       struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
+       TDB_DATA key, TDB_DATA *data,
+       int read_only)
+{
+       int ret;
+       struct ctdb_record_handle *h;
+       struct ctdb_ltdb_header *roheader = NULL;
+
+       h = talloc_zero(mem_ctx, struct ctdb_record_handle);
+       if (h == NULL) {
+               return NULL;
+       }
+
+       h->ctdb_db = ctdb_db;
+       h->key     = key;
+       /* the handle keeps its own copy of the key bytes */
+       h->key.dptr = talloc_memdup(h, key.dptr, key.dsize);
+       if (h->key.dptr == NULL) {
+               talloc_free(h);
+               return NULL;
+       }
+       h->data    = data;
+
+       data->dptr = NULL;
+       data->dsize = 0;
+
+
+again:
+       /* every retry starts from a clean slate */
+       talloc_free(roheader);
+       roheader = NULL;
+
+       talloc_free(data->dptr);
+       data->dptr = NULL;
+       data->dsize = 0;
+
+       /* Lock the record/chain */
+       ret = ctdb_ltdb_lock(ctdb_db, key);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to lock ltdb record\n"));
+               talloc_free(h);
+               return NULL;
+       }
+
+       /* NOTE(review): from here on talloc_free(h) runs
+          fetch_lock_destructor; several error paths below call
+          ctdb_ltdb_unlock() first and then talloc_free(h) - confirm the
+          destructor does not unlock a second time */
+       talloc_set_destructor(h, fetch_lock_destructor);
+
+       /* Check if record exists yet in the TDB */
+       ret = ctdb_ltdb_fetch_readonly(ctdb_db, key, &h->header, h, data);
+       if (ret != 0) {
+               ctdb_ltdb_unlock(ctdb_db, key);
+               ret = ctdb_client_force_migration(ctdb_db, key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                       talloc_free(h);
+                       return NULL;
+               }
+               goto again;
+       }
+
+       /* if this is a request for read/write and we have delegations
+          we have to revoke all delegations first
+       */
+       if ((read_only == 0)
+       &&  (h->header.dmaster == ctdb_db->ctdb->pnn)
+       &&  (h->header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+               ctdb_ltdb_unlock(ctdb_db, key);
+               ret = ctdb_client_force_migration(ctdb_db, key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                       talloc_free(h);
+                       return NULL;
+               }
+               goto again;
+       }
+
+       /* if we are dmaster, just return the handle */
+       if (h->header.dmaster == ctdb_db->ctdb->pnn) {
+               return h;
+       }
+
+       if (read_only != 0) {
+               TDB_DATA rodata = {NULL, 0};
+
+               /* the local record already carries one of the readonly
+                  flags, so it can be handed out as it is */
+               if ((h->header.flags & CTDB_REC_RO_HAVE_READONLY)
+               ||  (h->header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+                       return h;
+               }
+
+               /* the lock is dropped while the readonly copy is requested */
+               ctdb_ltdb_unlock(ctdb_db, key);
+               ret = ctdb_client_fetch_readonly(ctdb_db, key, h, &roheader, &rodata);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("ctdb_fetch_readonly_lock:  failed. force migration and try again\n"));
+                       ret = ctdb_client_force_migration(ctdb_db, key);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                               talloc_free(h);
+                               return NULL;
+                       }
+
+                       goto again;
+               }
+
+               /* the reply is only usable if it is marked as a readonly copy */
+               if (!(roheader->flags&CTDB_REC_RO_HAVE_READONLY)) {
+                       ret = ctdb_client_force_migration(ctdb_db, key);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                               talloc_free(h);
+                               return NULL;
+                       }
+
+                       goto again;
+               }
+
+               /* take the lock again and re-read the local record before
+                  replacing it with the fetched copy */
+               ret = ctdb_ltdb_lock(ctdb_db, key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " failed to lock ltdb record\n"));
+                       talloc_free(h);
+                       return NULL;
+               }
+
+               ret = ctdb_ltdb_fetch_readonly(ctdb_db, key, &h->header, h, data);
+               if (ret != 0) {
+                       ctdb_ltdb_unlock(ctdb_db, key);
+
+                       ret = ctdb_client_force_migration(ctdb_db, key);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                               talloc_free(h);
+                               return NULL;
+                       }
+
+                       goto again;
+               }
+
+               /* the fetched copy must carry a higher rsn than the local
+                  record, otherwise start over */
+               if (h->header.rsn >= roheader->rsn) {
+                       DEBUG(DEBUG_ERR,("READONLY RECORD: Too small RSN, migrate and try again\n"));
+                       ctdb_ltdb_unlock(ctdb_db, key);
+
+                       ret = ctdb_client_force_migration(ctdb_db, key);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                               talloc_free(h);
+                               return NULL;
+                       }
+
+                       goto again;
+               }
+
+               /* write the fetched header and data to the local tdb */
+               if (ctdb_ltdb_store(ctdb_db, key, roheader, rodata) != 0) {
+                       ctdb_ltdb_unlock(ctdb_db, key);
+
+                       ret = ctdb_client_force_migration(ctdb_db, key);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                               talloc_free(h);
+                               return NULL;
+                       }
+
+                       goto again;
+               }
+               /* NOTE(review): h->header and *data still hold what was read
+                  from the local tdb before the store above, not roheader and
+                  rodata - confirm that this is what callers expect */
+               return h;
+       }
+
+       /* we are not dmaster and this was not a request for a readonly lock
+        * so unlock the record, migrate it and try again
+        */
+       ctdb_ltdb_unlock(ctdb_db, key);
+       ret = ctdb_client_force_migration(ctdb_db, key);
+       if (ret != 0) {
+               DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+               talloc_free(h);
+               return NULL;
+       }
+       goto again;
+}
+
 /*
   store some data to the record that was locked with ctdb_fetch_lock()
 */
@@ -664,6 +910,7 @@ int ctdb_fetch(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
        call.call_id = CTDB_FETCH_FUNC;
        call.call_data.dptr = NULL;
        call.call_data.dsize = 0;
+       call.key = key;
 
        ret = ctdb_call(ctdb_db, &call);
 
@@ -764,7 +1011,9 @@ static void control_timeout_func(struct event_context *ev, struct timed_event *t
 {
        struct ctdb_client_control_state *state = talloc_get_type(private_data, struct ctdb_client_control_state);
 
-       DEBUG(DEBUG_ERR,("control timed out. reqid:%d opcode:%d dstnode:%d\n", state->reqid, state->c->opcode, state->c->hdr.destnode));
+       DEBUG(DEBUG_ERR,(__location__ " control timed out. reqid:%u opcode:%u "
+                        "dstnode:%u\n", state->reqid, state->c->opcode,
+                        state->c->hdr.destnode));
 
        state->state = CTDB_CONTROL_TIMEOUT;
 
@@ -1493,6 +1742,44 @@ int ctdb_ctrl_getdbname(struct ctdb_context *ctdb, struct timeval timeout, uint3
        return 0;
 }
 
+/*
+  get the health status of a db
+
+  Sends CTDB_CONTROL_DB_GET_HEALTH for dbid to destnode. An empty reply
+  sets *reason to NULL; otherwise *reason is set to a copy of the reply,
+  allocated on mem_ctx.
+
+  Returns 0 on success, -1 if the control fails or reports a non-zero
+  status, or if the copy cannot be allocated.
+ */
+int ctdb_ctrl_getdbhealth(struct ctdb_context *ctdb,
+                         struct timeval timeout,
+                         uint32_t destnode,
+                         uint32_t dbid, TALLOC_CTX *mem_ctx,
+                         const char **reason)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA data;
+
+       data.dptr = (uint8_t *)&dbid;
+       data.dsize = sizeof(dbid);
+
+       /* data carries the request in and is overwritten with the reply */
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_DB_GET_HEALTH, 0, data,
+                          mem_ctx, &data, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               return -1;
+       }
+
+       if (data.dsize == 0) {
+               (*reason) = NULL;
+               return 0;
+       }
+
+       (*reason) = talloc_strndup(mem_ctx, (const char *)data.dptr, data.dsize);
+
+       /* release the reply buffer before checking the copy, so it is not
+          left behind on mem_ctx when the copy failed */
+       talloc_free(data.dptr);
+
+       if ((*reason) == NULL) {
+               return -1;
+       }
+
+       return 0;
+}
+
 /*
   create a database
  */
@@ -1633,10 +1920,35 @@ static int ctdb_fetch_func(struct ctdb_call_info *call)
        return 0;
 }
 
+/*
+  plain fetch call that is registered for every database: the reply is
+  the ltdb header of the record followed directly by the record data
+*/
+static int ctdb_fetch_with_header_func(struct ctdb_call_info *call)
+{
+       const size_t hdr_len = sizeof(struct ctdb_ltdb_header);
+       TDB_DATA *reply;
+
+       reply = talloc(call, TDB_DATA);
+       call->reply_data = reply;
+       if (reply == NULL) {
+               return -1;
+       }
+
+       reply->dsize = hdr_len + call->record_data.dsize;
+       reply->dptr  = talloc_size(reply, reply->dsize);
+       if (reply->dptr == NULL) {
+               return -1;
+       }
+
+       /* header first, record data right behind it */
+       memcpy(reply->dptr, call->header, hdr_len);
+       memcpy(reply->dptr + hdr_len, call->record_data.dptr, call->record_data.dsize);
+
+       return 0;
+}
+
 /*
   attach to a specific database - client call
 */
-struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb, const char *name, bool persistent, uint32_t tdb_flags)
+struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb,
+                                   struct timeval timeout,
+                                   const char *name,
+                                   bool persistent,
+                                   uint32_t tdb_flags)
 {
        struct ctdb_db_context *ctdb_db;
        TDB_DATA data;
@@ -1671,7 +1983,7 @@ struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb, const char *name,
        ctdb_db->db_id = *(uint32_t *)data.dptr;
        talloc_free(data.dptr);
 
-       ret = ctdb_ctrl_getdbpath(ctdb, timeval_current_ofs(2, 0), CTDB_CURRENT_NODE, ctdb_db->db_id, ctdb_db, &ctdb_db->db_path);
+       ret = ctdb_ctrl_getdbpath(ctdb, timeout, CTDB_CURRENT_NODE, ctdb_db->db_id, ctdb_db, &ctdb_db->db_path);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,("Failed to get dbpath for database '%s'\n", name));
                talloc_free(ctdb_db);
@@ -1679,9 +1991,10 @@ struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb, const char *name,
        }
 
        tdb_flags = persistent?TDB_DEFAULT:TDB_NOSYNC;
-       if (!ctdb->do_setsched) {
+       if (ctdb->valgrinding) {
                tdb_flags |= TDB_NOMMAP;
        }
+       tdb_flags |= TDB_DISALLOW_NESTING;
 
        ctdb_db->ltdb = tdb_wrap_open(ctdb, ctdb_db->db_path, 0, tdb_flags, O_RDWR, 0);
        if (ctdb_db->ltdb == NULL) {
@@ -1697,6 +2010,7 @@ struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb, const char *name,
        /* add well known functions */
        ctdb_set_call(ctdb_db, ctdb_null_func, CTDB_NULL_FUNC);
        ctdb_set_call(ctdb_db, ctdb_fetch_func, CTDB_FETCH_FUNC);
+       ctdb_set_call(ctdb_db, ctdb_fetch_with_header_func, CTDB_FETCH_WITH_HEADER_FUNC);
 
        return ctdb_db;
 }
@@ -1806,7 +2120,7 @@ int ctdb_traverse(struct ctdb_db_context *ctdb_db, ctdb_traverse_func fn, void *
        state.private_data = private_data;
        state.fn = fn;
 
-       ret = ctdb_set_message_handler(ctdb_db->ctdb, srvid, traverse_handler, &state);
+       ret = ctdb_client_set_message_handler(ctdb_db->ctdb, srvid, traverse_handler, &state);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,("Failed to setup traverse handler\n"));
                return -1;
@@ -1823,7 +2137,7 @@ int ctdb_traverse(struct ctdb_db_context *ctdb_db, ctdb_traverse_func fn, void *
                           data, NULL, NULL, &status, NULL, NULL);
        if (ret != 0 || status != 0) {
                DEBUG(DEBUG_ERR,("ctdb_traverse_all failed\n"));
-               ctdb_remove_message_handler(ctdb_db->ctdb, srvid, &state);
+               ctdb_client_remove_message_handler(ctdb_db->ctdb, srvid, &state);
                return -1;
        }
 
@@ -1831,7 +2145,7 @@ int ctdb_traverse(struct ctdb_db_context *ctdb_db, ctdb_traverse_func fn, void *
                event_loop_once(ctdb_db->ctdb->ev);
        }
 
-       ret = ctdb_remove_message_handler(ctdb_db->ctdb, srvid, &state);
+       ret = ctdb_client_remove_message_handler(ctdb_db->ctdb, srvid, &state);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,("Failed to remove ctdb_traverse handler\n"));
                return -1;
@@ -1840,19 +2154,16 @@ int ctdb_traverse(struct ctdb_db_context *ctdb_db, ctdb_traverse_func fn, void *
        return state.count;
 }
 
-#define ISASCII(x) ((x>31)&&(x<128))
+#define ISASCII(x) (isprint(x) && !strchr("\"\\", (x)))
 /*
   called on each key during a catdb
  */
-static int dumpdb_fn(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, void *p)
+int ctdb_dumpdb_record(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, void *p)
 {
        int i;
        FILE *f = (FILE *)p;
        struct ctdb_ltdb_header *h = (struct ctdb_ltdb_header *)data.dptr;
 
-       fprintf(f, "dmaster: %u\n", h->dmaster);
-       fprintf(f, "rsn: %llu\n", (unsigned long long)h->rsn);
-
        fprintf(f, "key(%u) = \"", (unsigned)key.dsize);
        for (i=0;i<key.dsize;i++) {
                if (ISASCII(key.dptr[i])) {
@@ -1863,7 +2174,19 @@ static int dumpdb_fn(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, voi
        }
        fprintf(f, "\"\n");
 
-       fprintf(f, "data(%u) = \"", (unsigned)data.dsize);
+       fprintf(f, "dmaster: %u\n", h->dmaster);
+       fprintf(f, "rsn: %llu\n", (unsigned long long)h->rsn);
+       /* the flag names must go to f like the rest of the record; printing
+          them with printf() would send them to stdout instead */
+       fprintf(f, "flags: 0x%08x", h->flags);
+       if (h->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) fprintf(f, " MIGRATED_WITH_DATA");
+       if (h->flags & CTDB_REC_FLAG_VACUUM_MIGRATED) fprintf(f, " VACUUM_MIGRATED");
+       if (h->flags & CTDB_REC_FLAG_AUTOMATIC) fprintf(f, " AUTOMATIC");
+       if (h->flags & CTDB_REC_RO_HAVE_DELEGATIONS) fprintf(f, " RO_HAVE_DELEGATIONS");
+       if (h->flags & CTDB_REC_RO_HAVE_READONLY) fprintf(f, " RO_HAVE_READONLY");
+       if (h->flags & CTDB_REC_RO_REVOKING_READONLY) fprintf(f, " RO_REVOKING_READONLY");
+       if (h->flags & CTDB_REC_RO_REVOKE_COMPLETE) fprintf(f, " RO_REVOKE_COMPLETE");
+       fprintf(f, "\n");
+
+       fprintf(f, "data(%u) = \"", (unsigned)(data.dsize - sizeof(*h)));
        for (i=sizeof(*h);i<data.dsize;i++) {
                if (ISASCII(data.dptr[i])) {
                        fprintf(f, "%c", data.dptr[i]);
@@ -1873,6 +2196,8 @@ static int dumpdb_fn(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, voi
        }
        fprintf(f, "\"\n");
 
+       fprintf(f, "\n");
+
        return 0;
 }
 
@@ -1881,7 +2206,7 @@ static int dumpdb_fn(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, voi
  */
 int ctdb_dump_db(struct ctdb_db_context *ctdb_db, FILE *f)
 {
-       return ctdb_traverse(ctdb_db, dumpdb_fn, f);
+       /* f is handed to ctdb_dumpdb_record() as its private data pointer;
+          the result of ctdb_traverse() is returned unchanged */
+       return ctdb_traverse(ctdb_db, ctdb_dumpdb_record, f);
 }
 
 /*
@@ -1910,9 +2235,9 @@ int ctdb_ctrl_getpid(struct ctdb_context *ctdb, struct timeval timeout, uint32_t
   async freeze send control
  */
 struct ctdb_client_control_state *
-ctdb_ctrl_freeze_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+ctdb_ctrl_freeze_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t priority)
 {
-       return ctdb_control_send(ctdb, destnode, 0
+       return ctdb_control_send(ctdb, destnode, priority
                           CTDB_CONTROL_FREEZE, 0, tdb_null, 
                           mem_ctx, &timeout, NULL);
 }
@@ -1935,30 +2260,43 @@ int ctdb_ctrl_freeze_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct
 }
 
 /*
-  freeze a node
+  freeze databases of a certain priority
+
+  synchronous wrapper: issues the request with ctdb_ctrl_freeze_send() and
+  collects the result with ctdb_ctrl_freeze_recv(), using a temporary
+  talloc context that is freed before returning.
+  Returns whatever ctdb_ctrl_freeze_recv() returns.
  */
-int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+int ctdb_ctrl_freeze_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t priority)
 {
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        struct ctdb_client_control_state *state;
        int ret;
 
-       state = ctdb_ctrl_freeze_send(ctdb, tmp_ctx, timeout, destnode);
+       state = ctdb_ctrl_freeze_send(ctdb, tmp_ctx, timeout, destnode, priority);
        ret = ctdb_ctrl_freeze_recv(ctdb, tmp_ctx, state);
        talloc_free(tmp_ctx);
 
        return ret;
 }
 
+/* Freeze all databases, one priority level after the other */
+int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       int prio = 1;
+
+       /* walk priorities 1..NUM_DB_PRIORITIES and give up on the first failure */
+       while (prio <= NUM_DB_PRIORITIES) {
+               int rc = ctdb_ctrl_freeze_priority(ctdb, timeout, destnode, prio);
+
+               if (rc != 0) {
+                       return -1;
+               }
+               prio++;
+       }
+
+       return 0;
+}
+
 /*
-  thaw a node
+  thaw databases of a certain priority
  */
-int ctdb_ctrl_thaw(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+int ctdb_ctrl_thaw_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t priority)
 {
        int ret;
        int32_t res;
 
-       ret = ctdb_control(ctdb, destnode, 0,
+       /* priority takes the argument slot that used to be hard-coded to 0 */
+       ret = ctdb_control(ctdb, destnode, priority,
                           CTDB_CONTROL_THAW, 0, tdb_null, 
                           NULL, NULL, &res, &timeout, NULL);
        if (ret != 0 || res != 0) {
@@ -1969,6 +2307,12 @@ int ctdb_ctrl_thaw(struct ctdb_context *ctdb, struct timeval timeout, uint32_t d
        return 0;
 }
 
+/* thaw all databases */
+int ctdb_ctrl_thaw(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       /* priority 0 is passed on; presumably it selects every priority
+          level - confirm against the THAW control handler */
+       const uint32_t priority = 0;
+
+       return ctdb_ctrl_thaw_priority(ctdb, timeout, destnode, priority);
+}
+
 /*
   get pnn of a node, or -1
  */
@@ -2254,16 +2598,18 @@ int ctdb_ctrl_list_tunables(struct ctdb_context *ctdb,
 }
 
 
-int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb, 
-                       struct timeval timeout, uint32_t destnode, 
-                       TALLOC_CTX *mem_ctx, struct ctdb_all_public_ips **ips)
+int ctdb_ctrl_get_public_ips_flags(struct ctdb_context *ctdb,
+                                  struct timeval timeout, uint32_t destnode,
+                                  TALLOC_CTX *mem_ctx,
+                                  uint32_t flags,
+                                  struct ctdb_all_public_ips **ips)
 {
        int ret;
        TDB_DATA outdata;
        int32_t res;
 
        ret = ctdb_control(ctdb, destnode, 0, 
-                          CTDB_CONTROL_GET_PUBLIC_IPS, 0, tdb_null, 
+                          CTDB_CONTROL_GET_PUBLIC_IPS, flags, tdb_null,
                           mem_ctx, &outdata, &res, &timeout, NULL);
        if (ret == 0 && res == -1) {
                DEBUG(DEBUG_ERR,(__location__ " ctdb_control to get public ips failed, falling back to ipv4-only version\n"));
@@ -2280,6 +2626,16 @@ int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb,
        return 0;
 }
 
+/*
+  get the public ips of a node: same as ctdb_ctrl_get_public_ips_flags()
+  called with flags set to 0
+ */
+int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb,
+                            struct timeval timeout, uint32_t destnode,
+                            TALLOC_CTX *mem_ctx,
+                            struct ctdb_all_public_ips **ips)
+{
+       const uint32_t no_flags = 0;
+
+       return ctdb_ctrl_get_public_ips_flags(ctdb, timeout, destnode,
+                                             mem_ctx, no_flags, ips);
+}
+
 int ctdb_ctrl_get_public_ipsv4(struct ctdb_context *ctdb, 
                        struct timeval timeout, uint32_t destnode, 
                        TALLOC_CTX *mem_ctx, struct ctdb_all_public_ips **ips)
@@ -2313,44 +2669,200 @@ int ctdb_ctrl_get_public_ipsv4(struct ctdb_context *ctdb,
        return 0;
 }
 
-/*
-  set/clear the permanent disabled bit on a remote node
- */
-int ctdb_ctrl_modflags(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, 
-                      uint32_t set, uint32_t clear)
+/*
+  send CTDB_CONTROL_GET_PUBLIC_IP_INFO for the address "addr" to destnode.
+
+  On success 0 is returned and *_info is a talloc_memdup() copy of the
+  reply, allocated on mem_ctx.  -1 is returned if the control fails, if
+  the reply is too short for the number of interfaces it claims to
+  carry, or if the copy cannot be allocated.
+ */
+int ctdb_ctrl_get_public_ip_info(struct ctdb_context *ctdb,
+                                struct timeval timeout, uint32_t destnode,
+                                TALLOC_CTX *mem_ctx,
+                                const ctdb_sock_addr *addr,
+                                struct ctdb_control_public_ip_info **_info)
 {
        int ret;
-       TDB_DATA data;
-       struct ctdb_node_map *nodemap=NULL;
-       struct ctdb_node_flag_change c;
-       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
-       uint32_t recmaster;
-       uint32_t *nodes;
+       TDB_DATA indata;
+       TDB_DATA outdata;
+       int32_t res;
+       struct ctdb_control_public_ip_info *info;
+       uint64_t len;
+       uint32_t i;
 
+       indata.dptr = discard_const_p(uint8_t, addr);
+       indata.dsize = sizeof(*addr);
 
-       /* find the recovery master */
-       ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, timeout, CTDB_CURRENT_NODE, &recmaster);
-       if (ret != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
-               talloc_free(tmp_ctx);
-               return ret;
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_PUBLIC_IP_INFO, 0, indata,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "failed ret:%d res:%d\n",
+                               ret, res));
+               return -1;
        }
 
-
-       /* read the node flags from the recmaster */
-       ret = ctdb_ctrl_getnodemap(ctdb, timeout, recmaster, tmp_ctx, &nodemap);
-       if (ret != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", destnode));
-               talloc_free(tmp_ctx);
+       /* the reply must at least hold the fixed-size head of the structure */
+       len = offsetof(struct ctdb_control_public_ip_info, ifaces);
+       if (len > outdata.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "returned invalid data with size %u > %u\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)len));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               /* do not leave the rejected reply hanging off mem_ctx */
+               talloc_free(outdata.dptr);
                return -1;
        }
-       if (destnode >= nodemap->num) {
-               DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", destnode));
-               talloc_free(tmp_ctx);
+
+       info = (struct ctdb_control_public_ip_info *)outdata.dptr;
+       /*
+        * compute the required size in 64 bits: with a 32-bit len a huge
+        * "num" in the reply could wrap around, slip past the check below
+        * and make the termination loop write outside the reply buffer
+        */
+       len += (uint64_t)info->num * sizeof(struct ctdb_control_iface_info);
+
+       if (len > outdata.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "returned invalid data with size %u > %u\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)len));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               talloc_free(outdata.dptr);
                return -1;
        }
 
-       c.pnn       = destnode;
+       /* make sure we null terminate the returned strings */
+       for (i=0; i < info->num; i++) {
+               info->ifaces[i].name[CTDB_IFACE_SIZE] = '\0';
+       }
+
+       /* hand the caller its own copy and release the control's buffer */
+       *_info = (struct ctdb_control_public_ip_info *)talloc_memdup(mem_ctx,
+                                                               outdata.dptr,
+                                                               outdata.dsize);
+       talloc_free(outdata.dptr);
+       if (*_info == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "talloc_memdup size %u failed\n",
+                               (unsigned int)outdata.dsize));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  send CTDB_CONTROL_GET_IFACES to destnode.
+
+  On success 0 is returned and *_ifaces is a talloc_memdup() copy of the
+  reply, allocated on mem_ctx.  -1 is returned if the control fails, if
+  the reply is too short for the number of interfaces it claims to
+  carry, or if the copy cannot be allocated.
+ */
+int ctdb_ctrl_get_ifaces(struct ctdb_context *ctdb,
+                        struct timeval timeout, uint32_t destnode,
+                        TALLOC_CTX *mem_ctx,
+                        struct ctdb_control_get_ifaces **_ifaces)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+       struct ctdb_control_get_ifaces *ifaces;
+       uint64_t len;
+       uint32_t i;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_IFACES, 0, tdb_null,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "failed ret:%d res:%d\n",
+                               ret, res));
+               return -1;
+       }
+
+       /* the reply must at least hold the fixed-size head of the structure */
+       len = offsetof(struct ctdb_control_get_ifaces, ifaces);
+       if (len > outdata.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "returned invalid data with size %u > %u\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)len));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               /* do not leave the rejected reply hanging off mem_ctx */
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       ifaces = (struct ctdb_control_get_ifaces *)outdata.dptr;
+       /*
+        * compute the required size in 64 bits: with a 32-bit len a huge
+        * "num" in the reply could wrap around, slip past the check below
+        * and make the termination loop write outside the reply buffer
+        */
+       len += (uint64_t)ifaces->num * sizeof(struct ctdb_control_iface_info);
+
+       if (len > outdata.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "returned invalid data with size %u > %u\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)len));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       /* make sure we null terminate the returned strings */
+       for (i=0; i < ifaces->num; i++) {
+               ifaces->ifaces[i].name[CTDB_IFACE_SIZE] = '\0';
+       }
+
+       /* hand the caller its own copy and release the control's buffer */
+       *_ifaces = (struct ctdb_control_get_ifaces *)talloc_memdup(mem_ctx,
+                                                                 outdata.dptr,
+                                                                 outdata.dsize);
+       talloc_free(outdata.dptr);
+       if (*_ifaces == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "talloc_memdup size %u failed\n",
+                               (unsigned int)outdata.dsize));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  send the interface description "info" to destnode with
+  CTDB_CONTROL_SET_IFACE_LINK_STATE.
+
+  returns 0 when both the control call and the status it reports are 0,
+  -1 otherwise
+ */
+int ctdb_ctrl_set_iface_link(struct ctdb_context *ctdb,
+                            struct timeval timeout, uint32_t destnode,
+                            TALLOC_CTX *mem_ctx,
+                            const struct ctdb_control_iface_info *info)
+{
+       TDB_DATA request;
+       int32_t res;
+       int ret;
+
+       /* the control payload is the raw iface info structure */
+       request.dsize = sizeof(*info);
+       request.dptr = discard_const_p(uint8_t, info);
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_SET_IFACE_LINK_STATE, 0, request,
+                          mem_ctx, NULL, &res, &timeout, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set iface link "
+                       "failed ret:%d res:%d\n",
+                       ret, res));
+       return -1;
+}
+
+/*
+  set/clear the permanent disabled bit on a remote node
+ */
+int ctdb_ctrl_modflags(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, 
+                      uint32_t set, uint32_t clear)
+{
+       int ret;
+       TDB_DATA data;
+       struct ctdb_node_map *nodemap=NULL;
+       struct ctdb_node_flag_change c;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       uint32_t recmaster;
+       uint32_t *nodes;
+
+
+       /* find the recovery master */
+       ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, timeout, CTDB_CURRENT_NODE, &recmaster);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+
+       /* read the node flags from the recmaster */
+       ret = ctdb_ctrl_getnodemap(ctdb, timeout, recmaster, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", destnode));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       if (destnode >= nodemap->num) {
+               DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", destnode));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       c.pnn       = destnode;
        c.old_flags = nodemap->nodes[destnode].flags;
        c.new_flags = c.old_flags;
        c.new_flags |= set;
@@ -2363,11 +2875,11 @@ int ctdb_ctrl_modflags(struct ctdb_context *ctdb, struct timeval timeout, uint32
        nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
 
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
-                                       nodes,
+                                       nodes, 0,
                                        timeout, false, data,
                                        NULL, NULL,
                                        NULL) != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " ctdb_control to disable node failed\n"));
+               DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
 
                talloc_free(tmp_ctx);
                return -1;
@@ -2684,6 +3196,8 @@ struct ctdb_context *ctdb_init(struct event_context *ev)
        }
        ctdb->ev  = ev;
        ctdb->idr = idr_init(ctdb);
+       /* Wrap early to exercise code. */
+       ctdb->lastid = INT_MAX-200;
        CTDB_NO_MEMORY_NULL(ctdb, ctdb->idr);
 
        ret = ctdb_set_socketname(ctdb, CTDB_PATH);
@@ -2693,6 +3207,8 @@ struct ctdb_context *ctdb_init(struct event_context *ev)
                return NULL;
        }
 
+       ctdb->statistics.statistics_start_time = timeval_current();
+
        return ctdb;
 }
 
@@ -2716,6 +3232,11 @@ int ctdb_set_socketname(struct ctdb_context *ctdb, const char *socketname)
        return 0;
 }
 
+/*
+  return the name stored in ctdb->daemon.name
+ */
+const char *ctdb_get_socketname(struct ctdb_context *ctdb)
+{
+       const char *name = ctdb->daemon.name;
+
+       return name;
+}
+
 /*
   return the pnn of this node
 */
@@ -2868,6 +3389,7 @@ int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *
 int ctdb_client_async_control(struct ctdb_context *ctdb,
                                enum ctdb_controls opcode,
                                uint32_t *nodes,
+                               uint64_t srvid,
                                struct timeval timeout,
                                bool dont_log_errors,
                                TDB_DATA data,
@@ -2893,7 +3415,7 @@ int ctdb_client_async_control(struct ctdb_context *ctdb,
        for (j=0; j<num_nodes; j++) {
                uint32_t pnn = nodes[j];
 
-               state = ctdb_control_send(ctdb, pnn, 0, opcode, 
+               state = ctdb_control_send(ctdb, pnn, srvid, opcode, 
                                          0, data, async_data, &timeout, NULL);
                if (state == NULL) {
                        DEBUG(DEBUG_ERR,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
@@ -2975,6 +3497,40 @@ uint32_t *list_of_active_nodes(struct ctdb_context *ctdb,
        return nodes;
 }
 
+/*
+  build an array, allocated on mem_ctx, holding the pnn of every entry
+  in node_map that does not have NODE_FLAGS_INACTIVE set and whose pnn
+  differs from the "pnn" argument
+ */
+uint32_t *list_of_active_nodes_except_pnn(struct ctdb_context *ctdb,
+                               struct ctdb_node_map *node_map,
+                               TALLOC_CTX *mem_ctx,
+                               uint32_t pnn)
+{
+       int idx, out, wanted;
+       uint32_t *nodes;
+
+       /* first pass: count how many entries will be returned */
+       wanted = 0;
+       for (idx = 0; idx < node_map->num; idx++) {
+               if (!(node_map->nodes[idx].flags & NODE_FLAGS_INACTIVE) &&
+                   node_map->nodes[idx].pnn != pnn) {
+                       wanted++;
+               }
+       }
+
+       nodes = talloc_array(mem_ctx, uint32_t, wanted);
+       CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+       /* second pass: same filter, this time storing the pnns */
+       out = 0;
+       for (idx = 0; idx < node_map->num; idx++) {
+               if (!(node_map->nodes[idx].flags & NODE_FLAGS_INACTIVE) &&
+                   node_map->nodes[idx].pnn != pnn) {
+                       nodes[out] = node_map->nodes[idx].pnn;
+                       out++;
+               }
+       }
+
+       return nodes;
+}
+
 uint32_t *list_of_connected_nodes(struct ctdb_context *ctdb,
                                struct ctdb_node_map *node_map,
                                TALLOC_CTX *mem_ctx,
@@ -3085,12 +3641,42 @@ int ctdb_ctrl_getcapabilities(struct ctdb_context *ctdb, struct timeval timeout,
        return ret;
 }
 
+/**
+ * send CTDB_CONTROL_TRANS2_ACTIVE for the database db_id to destnode.
+ *
+ * returns the status reported by the control, or -1 if the control
+ * call itself failed
+ */
+int32_t ctdb_ctrl_transaction_active(struct ctdb_context *ctdb,
+                                    uint32_t destnode,
+                                    uint32_t db_id)
+{
+       TDB_DATA request;
+       int32_t status;
+
+       /* the control payload is just the database id */
+       request.dsize = sizeof(db_id);
+       request.dptr = (uint8_t *)&db_id;
+
+       if (ctdb_control(ctdb, destnode, 0,
+                        CTDB_CONTROL_TRANS2_ACTIVE,
+                        0, request, NULL, NULL, &status,
+                        NULL, NULL) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " ctdb control for transaction_active failed\n"));
+               return -1;
+       }
+
+       return status;
+}
+
+
 struct ctdb_transaction_handle {
        struct ctdb_db_context *ctdb_db;
        bool in_replay;
-       /* we store the reads and writes done under a transaction one
-          list stores both reads and writes, the other just writes
-       */
+       /*
+        * we store the reads and writes done under a transaction:
+        * - one list stores both reads and writes (m_all),
+        * - the other just writes (m_write)
+        */
        struct ctdb_marshall_buffer *m_all;
        struct ctdb_marshall_buffer *m_write;
 };
@@ -3114,6 +3700,7 @@ static int ctdb_transaction_fetch_start(struct ctdb_transaction_handle *h)
        int ret;
        struct ctdb_db_context *ctdb_db = h->ctdb_db;
        pid_t pid;
+       int32_t status;
 
        key.dptr = discard_const(keyname);
        key.dsize = strlen(keyname);
@@ -3128,10 +3715,25 @@ again:
 
        rh = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, NULL);
        if (rh == NULL) {
-               DEBUG(DEBUG_ERR,(__location__ " Failed to fetch_lock database\n"));             
+               DEBUG(DEBUG_ERR,(__location__ " Failed to fetch_lock database\n"));
                talloc_free(tmp_ctx);
                return -1;
        }
+
+       status = ctdb_ctrl_transaction_active(ctdb_db->ctdb,
+                                             CTDB_CURRENT_NODE,
+                                             ctdb_db->db_id);
+       if (status == 1) {
+               unsigned long int usec = (1000 + random()) % 100000;
+               DEBUG(DEBUG_DEBUG, (__location__ " transaction is active "
+                                   "on db_id[0x%08x]. waiting for %lu "
+                                   "microseconds\n",
+                                   ctdb_db->db_id, usec));
+               talloc_free(tmp_ctx);
+               usleep(usec);
+               goto again;
+       }
+
        /*
         * store the pid in the database:
         * it is not enough that the node is dmaster...
@@ -3139,6 +3741,8 @@ again:
        pid = getpid();
        data.dptr = (unsigned char *)&pid;
        data.dsize = sizeof(pid_t);
+       rh->header.rsn++;
+       rh->header.dmaster = ctdb_db->ctdb->pnn;
        ret = ctdb_ltdb_store(ctdb_db, key, &(rh->header), data);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Failed to store pid in "
@@ -3157,13 +3761,25 @@ again:
        }
 
        ret = ctdb_ltdb_fetch(ctdb_db, key, &header, tmp_ctx, &data);
-       if (ret != 0 || header.dmaster != ctdb_db->ctdb->pnn) {
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to re-fetch transaction "
+                                "lock record inside transaction\n"));
+               tdb_transaction_cancel(ctdb_db->ltdb->tdb);
+               talloc_free(tmp_ctx);
+               goto again;
+       }
+
+       if (header.dmaster != ctdb_db->ctdb->pnn) {
+               DEBUG(DEBUG_DEBUG,(__location__ " not dmaster any more on "
+                                  "transaction lock record\n"));
                tdb_transaction_cancel(ctdb_db->ltdb->tdb);
                talloc_free(tmp_ctx);
                goto again;
        }
 
        if ((data.dsize != sizeof(pid_t)) || (*(pid_t *)(data.dptr) != pid)) {
+               DEBUG(DEBUG_DEBUG, (__location__ " my pid is not stored in "
+                                   "the transaction lock record\n"));
                tdb_transaction_cancel(ctdb_db->ltdb->tdb);
                talloc_free(tmp_ctx);
                goto again;
@@ -3397,6 +4013,9 @@ again:
                           &timeout, NULL);
        if (ret != 0 || status != 0) {
                tdb_transaction_cancel(h->ctdb_db->ltdb->tdb);
+               DEBUG(DEBUG_NOTICE, (__location__ " transaction commit%s failed"
+                                    ", retrying after 1 second...\n",
+                                    (retries==0)?"":"retry "));
                sleep(1);
 
                if (ret != 0) {
@@ -3416,7 +4035,7 @@ again:
                        }
                }
 
-               if (++retries == 10) {
+               if (++retries == 100) {
                        DEBUG(DEBUG_ERR,(__location__ " Giving up transaction on db 0x%08x after %d retries failure_control=%u\n", 
                                         h->ctdb_db->db_id, retries, (unsigned)failure_control));
                        ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id, 
@@ -3427,7 +4046,11 @@ again:
                }               
 
                if (ctdb_replay_transaction(h) != 0) {
-                       DEBUG(DEBUG_ERR,(__location__ " Failed to replay transaction\n"));
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to replay "
+                                         "transaction on db 0x%08x, "
+                                         "failure control =%u\n",
+                                         h->ctdb_db->db_id,
+                                         (unsigned)failure_control));
                        ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id, 
                                     failure_control, CTDB_CTRL_FLAG_NOREPLY, 
                                     tdb_null, NULL, NULL, NULL, NULL, NULL);           
@@ -3442,7 +4065,11 @@ again:
        /* do the real commit locally */
        ret = tdb_transaction_commit(h->ctdb_db->ltdb->tdb);
        if (ret != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Failed to commit transaction\n"));
+               DEBUG(DEBUG_ERR, (__location__ " Failed to commit transaction "
+                                 "on db id 0x%08x locally, "
+                                 "failure_control=%u\n",
+                                 h->ctdb_db->db_id,
+                                 (unsigned)failure_control));
                ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id, 
                             failure_control, CTDB_CTRL_FLAG_NOREPLY, 
                             tdb_null, NULL, NULL, NULL, NULL, NULL);           
@@ -3480,9 +4107,15 @@ int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb)
  * to the daemon as a client process, this function can be used to change
  * the ctdb context from daemon into client mode
  */
-int switch_from_server_to_client(struct ctdb_context *ctdb)
+int switch_from_server_to_client(struct ctdb_context *ctdb, const char *fmt, ...)
 {
        int ret;
+       va_list ap;
+
+       /* Add extra information so we can identify this in the logs */
+       va_start(ap, fmt);
+       debug_extra = talloc_strdup_append(talloc_vasprintf(NULL, fmt, ap), ":");
+       va_end(ap);
 
        /* shutdown the transport */
        if (ctdb->methods) {
@@ -3492,6 +4125,7 @@ int switch_from_server_to_client(struct ctdb_context *ctdb)
        /* get a new event context */
        talloc_free(ctdb->ev);
        ctdb->ev = event_context_init(ctdb);
+       tevent_loop_allow_nesting(ctdb->ev);
 
        close(ctdb->daemon.sd);
        ctdb->daemon.sd = -1;
@@ -3512,129 +4146,35 @@ int switch_from_server_to_client(struct ctdb_context *ctdb)
 }
 
 /*
-  tell the main daemon we are starting a new monitor event script
- */
-int ctdb_ctrl_event_script_init(struct ctdb_context *ctdb)
-{
-       int ret;
-       int32_t res;
-
-       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_EVENT_SCRIPT_INIT, 0, tdb_null, 
-                          ctdb, NULL, &res, NULL, NULL);
-       if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send event_script_init\n"));
-               return -1;
-       }
-
-       return 0;
-}
-
-/*
-  tell the main daemon we are starting a new monitor event script
- */
-int ctdb_ctrl_event_script_finished(struct ctdb_context *ctdb)
-{
-       int ret;
-       int32_t res;
-
-       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_EVENT_SCRIPT_FINISHED, 0, tdb_null, 
-                          ctdb, NULL, &res, NULL, NULL);
-       if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send event_script_init\n"));
-               return -1;
-       }
-
-       return 0;
-}
-
-/*
-  tell the main daemon we are starting to run an eventscript
- */
-int ctdb_ctrl_event_script_start(struct ctdb_context *ctdb, const char *name)
-{
-       int ret;
-       int32_t res;
-       TDB_DATA data;
-
-       data.dptr = discard_const(name);
-       data.dsize = strlen(name)+1;
-
-       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_EVENT_SCRIPT_START, 0, data, 
-                          ctdb, NULL, &res, NULL, NULL);
-       if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send event_script_start\n"));
-               return -1;
-       }
-
-       return 0;
-}
-
-/*
-  tell the main daemon the status of the script we ran
- */
-int ctdb_ctrl_event_script_stop(struct ctdb_context *ctdb, int32_t result)
-{
-       int ret;
-       int32_t res;
-       TDB_DATA data;
-
-       data.dptr = (uint8_t *)&result;
-       data.dsize = sizeof(result);
-
-       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_EVENT_SCRIPT_STOP, 0, data, 
-                          ctdb, NULL, &res, NULL, NULL);
-       if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send event_script_stop\n"));
-               return -1;
-       }
-
-       return 0;
-}
-
-/*
-  tell the main daemon a script was disabled
- */
-int ctdb_ctrl_event_script_disabled(struct ctdb_context *ctdb, const char *name)
-{
-       int ret;
-       int32_t res;
-       TDB_DATA data;
-
-       data.dptr = discard_const(name);
-       data.dsize = strlen(name)+1;
-
-       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_EVENT_SCRIPT_DISABLED, 0, data, 
-                          ctdb, NULL, &res, NULL, NULL);
-       if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send event_script_disabeld\n"));
-               return -1;
-       }
-
-       return 0;
-}
-
-/*
-  get the status of running the monitor eventscripts
+  get the status of running the monitor eventscripts: NULL means never run.
  */
 int ctdb_ctrl_getscriptstatus(struct ctdb_context *ctdb, 
                struct timeval timeout, uint32_t destnode, 
-               TALLOC_CTX *mem_ctx,
-               struct ctdb_monitoring_wire **script_status)
+               TALLOC_CTX *mem_ctx, enum ctdb_eventscript_call type,
+               struct ctdb_scripts_wire **script_status)
 {
        int ret;
-       TDB_DATA outdata;
+       TDB_DATA outdata, indata;
        int32_t res;
+       uint32_t uinttype = type;
+
+       indata.dptr = (uint8_t *)&uinttype;
+       indata.dsize = sizeof(uinttype);
 
        ret = ctdb_control(ctdb, destnode, 0, 
-                          CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS, 0, tdb_null, 
+                          CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS, 0, indata,
                           mem_ctx, &outdata, &res, &timeout, NULL);
-       if (ret != 0 || res != 0 || outdata.dsize == 0) {
+       if (ret != 0 || res != 0) {
                DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getscriptstatus failed ret:%d res:%d\n", ret, res));
                return -1;
        }
 
-       *script_status = (struct ctdb_monitoring_wire *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
-       talloc_free(outdata.dptr);
+       if (outdata.dsize == 0) {
+               *script_status = NULL;
+       } else {
+               *script_status = (struct ctdb_scripts_wire *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+               talloc_free(outdata.dptr);
+       }
                    
        return 0;
 }
@@ -3910,3 +4450,192 @@ int ctdb_ctrl_get_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_
 }
 
 
+/*
+  send the structure db_prio to destnode with CTDB_CONTROL_SET_DB_PRIORITY.
+
+  returns 0 when both the control call and the status it reports are 0,
+  -1 otherwise
+ */
+int ctdb_ctrl_set_db_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, struct ctdb_db_priority *db_prio)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       TDB_DATA request;
+       int32_t res;
+       int ret;
+       int result = 0;
+
+       request.dsize = sizeof(*db_prio);
+       request.dptr = (uint8_t*)db_prio;
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_SET_DB_PRIORITY, 0, request,
+                          tmp_ctx, NULL, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set_db_priority failed\n"));
+               result = -1;
+       }
+
+       /* single exit: the temporary context is released on every path */
+       talloc_free(tmp_ctx);
+
+       return result;
+}
+
+/*
+  send CTDB_CONTROL_GET_DB_PRIORITY for the database db_id to destnode.
+
+  The control's status is the answer: a negative status (or a failed
+  control call) makes this function return -1, otherwise the status is
+  stored in *priority (when priority is not NULL) and 0 is returned.
+ */
+int ctdb_ctrl_get_db_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t db_id, uint32_t *priority)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA data;
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+       data.dptr = (uint8_t*)&db_id;
+       data.dsize = sizeof(db_id);
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_DB_PRIORITY, 0, data,
+                          tmp_ctx, NULL, &res, &timeout, NULL);
+       if (ret != 0 || res < 0) {
+               /* name the control that actually failed (the message used to say set_db_priority) */
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get_db_priority failed\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (priority) {
+               *priority = res;
+       }
+
+       talloc_free(tmp_ctx);
+
+       return 0;
+}
+
+/*
+  send CTDB_CONTROL_GET_STAT_HISTORY to destnode.
+
+  On success 0 is returned and *stats is a talloc_memdup() copy of the
+  reply, allocated on mem_ctx.  -1 is returned if the control fails,
+  reports a non-zero status, returns no data, or if the copy cannot be
+  allocated.
+ */
+int ctdb_ctrl_getstathistory(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_statistics_wire **stats)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_STAT_HISTORY, 0, tdb_null, 
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0 || outdata.dsize == 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getstathistory failed ret:%d res:%d\n", ret, res));
+               return -1;
+       }
+
+       *stats = (struct ctdb_statistics_wire *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+       talloc_free(outdata.dptr);
+       /* do not report success with a NULL result when the copy failed */
+       if (*stats == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getstathistory "
+                               "talloc_memdup size %u failed\n",
+                               (unsigned int)outdata.dsize));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  return a pointer to the header embedded in the record handle h,
+  or NULL when h itself is NULL
+ */
+struct ctdb_ltdb_header *ctdb_header_from_record_handle(struct ctdb_record_handle *h)
+{
+       return (h != NULL) ? &h->header : NULL;
+}
+
+
+/*
+  marshall a single record (key, header, data) of ctdb_db into a
+  ctdb_marshall_buffer and send it to destnode with
+  CTDB_CONTROL_UPDATE_RECORD.
+
+  returns the control state produced by ctdb_control_send(), or NULL if
+  the marshall buffer could not be built.  The temporary marshall buffer
+  is freed before returning on every path.
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_updaterecord_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+       struct ctdb_client_control_state *handle;
+       struct ctdb_marshall_buffer *m;
+       struct ctdb_marshall_buffer *expanded;
+       struct ctdb_rec_data *rec;
+       TDB_DATA outdata;
+
+       m = talloc_zero(mem_ctx, struct ctdb_marshall_buffer);
+       if (m == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to allocate marshall buffer for update record\n"));
+               return NULL;
+       }
+
+       m->db_id = ctdb_db->db_id;
+
+       /* rec is allocated with m as its talloc context, so freeing m releases it too */
+       rec = ctdb_marshall_record(m, 0, key, header, data);
+       if (rec == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to marshall record for update record\n"));
+               talloc_free(m);
+               return NULL;
+       }
+
+       /*
+        * keep the old pointer until the realloc has succeeded: assigning
+        * the result straight to m would lose (and leak) the original
+        * buffer when the reallocation fails
+        */
+       expanded = talloc_realloc_size(mem_ctx, m, rec->length + offsetof(struct ctdb_marshall_buffer, data));
+       if (expanded == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata\n"));
+               talloc_free(m);
+               return NULL;
+       }
+       m = expanded;
+
+       m->count++;
+       memcpy((uint8_t *)m + offsetof(struct ctdb_marshall_buffer, data), rec, rec->length);
+
+
+       outdata.dptr = (uint8_t *)m;
+       outdata.dsize = talloc_get_size(m);
+
+       handle = ctdb_control_send(ctdb, destnode, 0, 
+                          CTDB_CONTROL_UPDATE_RECORD, 0, outdata,
+                          mem_ctx, &timeout, NULL);
+       talloc_free(m);
+       return handle;
+}
+
+/*
+  collect the result of a control started with
+  ctdb_ctrl_updaterecord_send().
+
+  returns 0 when both ctdb_control_recv() and the status it reports are
+  0, -1 otherwise
+ */
+int ctdb_ctrl_updaterecord_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state)
+{
+       int32_t res;
+       int ret;
+
+       /* state is passed both as the control state and as the memory context */
+       ret = ctdb_control_recv(ctdb, state, state, NULL, &res, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_update_record_recv failed\n"));
+       return -1;
+}
+
+/*
+  synchronous form: ctdb_ctrl_updaterecord_send() immediately followed
+  by ctdb_ctrl_updaterecord_recv() on the state it returned
+ */
+int
+ctdb_ctrl_updaterecord(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+       return ctdb_ctrl_updaterecord_recv(ctdb,
+                       ctdb_ctrl_updaterecord_send(ctdb, mem_ctx, timeout,
+                                                   destnode, ctdb_db, key,
+                                                   header, data));
+}
+
+
+
+
+
+
+/*
+  start a CTDB_CONTROL_SET_DB_READONLY control for database dbid on
+  destnode; the payload is the database id and the control state is
+  allocated on ctdb. Returns whatever ctdb_control_send() returns.
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_set_db_readonly_send(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid)
+{
+       struct ctdb_client_control_state *state;
+       TDB_DATA request;
+
+       request.dsize = sizeof(dbid);
+       request.dptr = (uint8_t *)&dbid;
+
+       state = ctdb_control_send(ctdb, destnode, 0, 
+                          CTDB_CONTROL_SET_DB_READONLY, 0, request, 
+                          ctdb, NULL, NULL);
+
+       return state;
+}
+
+/*
+  collect the result of a control started with
+  ctdb_ctrl_set_db_readonly_send().
+
+  returns 0 when both ctdb_control_recv() and the status it reports are
+  0, -1 otherwise
+ */
+int ctdb_ctrl_set_db_readonly_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state)
+{
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control_recv(ctdb, state, ctdb, NULL, &res, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_set_db_readonly_recv failed  ret:%d res:%d\n", ret, res));
+       return -1;
+}
+
+/*
+  synchronous form: ctdb_ctrl_set_db_readonly_send() immediately
+  followed by ctdb_ctrl_set_db_readonly_recv() on the state it returned
+ */
+int ctdb_ctrl_set_db_readonly(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid)
+{
+       return ctdb_ctrl_set_db_readonly_recv(ctdb,
+                       ctdb_ctrl_set_db_readonly_send(ctdb, destnode, dbid));
+}