ctdb-readonly: Add an early return to simplify code

[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_call.c
diff --git a/ctdb/server/ctdb_call.c b/ctdb/server/ctdb_call.c

index a94097ea64539e61f6941c9f7d16418a92c2fbed..7affe44a79545983ff83905532c5473d606b3723 100644 (file)
--- a/ctdb/server/ctdb_call.c
+++ b/ctdb/server/ctdb_call.c
@@ -21,12 +21,18 @@
    protocol design and packet details
  */
  #include "includes.h"
-#include "lib/tevent/tevent.h"
-#include "lib/tdb/include/tdb.h"
+#include "tdb.h"
  #include "lib/util/dlinklist.h"
  #include "system/network.h"
  #include "system/filesys.h"
  #include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+
+struct ctdb_sticky_record {
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       TDB_CONTEXT *pindown;
+};
  
  /*
    find the ctdb_db from a db index
@@ -43,7 +49,6 @@
         return ctdb_db;
  }
  
-
  /*
    a varient of input packet that can be used in lock requeue
  */
@@ -108,36 +113,33 @@ static void ctdb_send_error(struct ctdb_context *ctdb,
   * to its local ctdb (ctdb_request_call). If the node is not itself
   * the record's DMASTER, it first redirects the packet to  the
   * record's LMASTER. The LMASTER then redirects the call packet to
- * the current DMASTER. But there is a race: The record may have
- * been migrated off the DMASTER while the redirected packet is
- * on the wire (or in the local queue). So in case the record has
- * migrated off the new destinaton of the call packet, instead of
- * going back to the LMASTER to get the new DMASTER, we try to
- * reduce rountrips by fist chasing the record a couple of times
- * before giving up the direct chase and finally going back to the
- * LMASTER (again). Note that this works because auf this: When
+ * the current DMASTER. Note that this works because of this: When
   * a record is migrated off a node, then the new DMASTER is stored
   * in the record's copy on the former DMASTER.
- *
- * The maxiumum number of attempts for direct chase to make before
- * going back to the LMASTER is configurable by the tunable
- * "MaxRedirectCount".
   */
-static void ctdb_call_send_redirect(struct ctdb_context *ctdb, 
+static void ctdb_call_send_redirect(struct ctdb_context *ctdb,
+                                   struct ctdb_db_context *ctdb_db,
                                     TDB_DATA key,
                                     struct ctdb_req_call *c, 
                                     struct ctdb_ltdb_header *header)
  {
-       
         uint32_t lmaster = ctdb_lmaster(ctdb, &key);
+
+       c->hdr.destnode = lmaster;
         if (ctdb->pnn == lmaster) {
                 c->hdr.destnode = header->dmaster;
-       } else if ((c->hopcount % ctdb->tunable.max_redirect_count) == 0) {
-               c->hdr.destnode = lmaster;
-       } else {
-               c->hdr.destnode = header->dmaster;
         }
         c->hopcount++;
+
+       if (c->hopcount%100 > 95) {
+               DEBUG(DEBUG_WARNING,("High hopcount %d dbid:%s "
+                       "key:0x%08x reqid=%08x pnn:%d src:%d lmaster:%d "
+                       "header->dmaster:%d dst:%d\n",
+                       c->hopcount, ctdb_db->db_name, ctdb_hash(&key),
+                       c->hdr.reqid, ctdb->pnn, c->hdr.srcnode, lmaster,
+                       header->dmaster, c->hdr.destnode));
+       }
+
         ctdb_queue_packet(ctdb, &c->hdr);
  }
  
@@ -259,6 +261,57 @@ static void ctdb_call_send_dmaster(struct ctdb_db_context *ctdb_db,
         talloc_free(r);
  }
  
+static void ctdb_sticky_pindown_timeout(struct event_context *ev, struct timed_event *te, 
+                                      struct timeval t, void *private_data)
+{
+       struct ctdb_sticky_record *sr = talloc_get_type(private_data, 
+                                                      struct ctdb_sticky_record);
+
+       DEBUG(DEBUG_ERR,("Pindown timeout db:%s  unstick record\n", sr->ctdb_db->db_name));
+       if (sr->pindown != NULL) {
+               talloc_free(sr->pindown);
+               sr->pindown = NULL;
+       }
+}
+
+static int
+ctdb_set_sticky_pindown(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       uint32_t *k;
+       struct ctdb_sticky_record *sr;
+
+       k = talloc_zero_size(tmp_ctx, ((key.dsize + 3) & 0xfffffffc) + 4);
+       if (k == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       k[0] = (key.dsize + 3) / 4 + 1;
+       memcpy(&k[1], key.dptr, key.dsize);
+
+       sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]);
+       if (sr == NULL) {
+               talloc_free(tmp_ctx);
+               return 0;
+       }
+
+       talloc_free(tmp_ctx);
+
+       if (sr->pindown == NULL) {
+               DEBUG(DEBUG_ERR,("Pinning down record in %s for %d ms\n", ctdb_db->db_name, ctdb->tunable.sticky_pindown));
+               sr->pindown = talloc_new(sr);
+               if (sr->pindown == NULL) {
+                       DEBUG(DEBUG_ERR,("Failed to allocate pindown context for sticky record\n"));
+                       return -1;
+               }
+               event_add_timed(ctdb->ev, sr->pindown, timeval_current_ofs(ctdb->tunable.sticky_pindown / 1000, (ctdb->tunable.sticky_pindown * 1000) % 1000000), ctdb_sticky_pindown_timeout, sr);
+       }
+
+       return 0;
+}
+
  /*
    called when a CTDB_REPLY_DMASTER packet comes in, or when the lmaster
    gets a CTDB_REQUEST_DMASTER for itself. We become the dmaster.
@@ -278,8 +331,23 @@ static void ctdb_become_dmaster(struct ctdb_db_context *ctdb_db,
         DEBUG(DEBUG_DEBUG,("pnn %u dmaster response %08x\n", ctdb->pnn, ctdb_hash(&key)));
  
         ZERO_STRUCT(header);
-       header.rsn = rsn + 1;
+       header.rsn = rsn;
         header.dmaster = ctdb->pnn;
+       header.flags = record_flags;
+
+       state = ctdb_reqid_find(ctdb, hdr->reqid, struct ctdb_call_state);
+
+       if (state) {
+               if (state->call->flags & CTDB_CALL_FLAG_VACUUM_MIGRATION) {
+                       /*
+                        * We temporarily add the VACUUM_MIGRATED flag to
+                        * the record flags, so that ctdb_ltdb_store can
+                        * decide whether the record should be stored or
+                        * deleted.
+                        */
+                       header.flags |= CTDB_REC_FLAG_VACUUM_MIGRATED;
+               }
+       }
  
         if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
                 ctdb_fatal(ctdb, "ctdb_reply_dmaster store failed\n");
@@ -291,7 +359,13 @@ static void ctdb_become_dmaster(struct ctdb_db_context *ctdb_db,
                 return;
         }
  
-       state = ctdb_reqid_find(ctdb, hdr->reqid, struct ctdb_call_state);
+       /* we just became DMASTER and this database is "sticky",
+          see if the record is flagged as "hot" and set up a pin-down
+          context to stop migrations for a little while if so
+       */
+       if (ctdb_db->sticky) {
+               ctdb_set_sticky_pindown(ctdb, ctdb_db, key);
+       }
  
         if (state == NULL) {
                 DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_become_dmaster from node %u\n",
@@ -325,7 +399,7 @@ static void ctdb_become_dmaster(struct ctdb_db_context *ctdb_db,
                 return;
         }
  
-       ctdb_call_local(ctdb_db, state->call, &header, state, &data);
+       ctdb_call_local(ctdb_db, state->call, &header, state, &data, true);
  
         ret = ctdb_ltdb_unlock(ctdb_db, state->call->key);
         if (ret != 0) {
@@ -376,7 +450,7 @@ void ctdb_request_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr
         
         /* fetch the current record */
         ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header, hdr, &data2,
-                                          ctdb_call_input_pkt, ctdb, False);
+                                          ctdb_call_input_pkt, ctdb, false);
         if (ret == -1) {
                 ctdb_fatal(ctdb, "ctdb_req_dmaster failed to fetch record");
                 return;
@@ -407,6 +481,7 @@ void ctdb_request_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr
                         DEBUG(DEBUG_ERR,("ctdb_req_dmaster from non-master. Force a recovery.\n"));
  
                         ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+                       ctdb_ltdb_unlock(ctdb_db, key);
                         return;
                 }
         }
@@ -438,6 +513,208 @@ void ctdb_request_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr
         }
  }
  
+static void ctdb_sticky_record_timeout(struct event_context *ev, struct timed_event *te, 
+                                      struct timeval t, void *private_data)
+{
+       struct ctdb_sticky_record *sr = talloc_get_type(private_data, 
+                                                      struct ctdb_sticky_record);
+       talloc_free(sr);
+}
+
+static void *ctdb_make_sticky_record_callback(void *parm, void *data)
+{
+        if (data) {
+               DEBUG(DEBUG_ERR,("Already have sticky record registered. Free old %p and create new %p\n", data, parm));
+                talloc_free(data);
+        }
+        return parm;
+}
+
+static int
+ctdb_make_record_sticky(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       uint32_t *k;
+       struct ctdb_sticky_record *sr;
+
+       k = talloc_zero_size(tmp_ctx, ((key.dsize + 3) & 0xfffffffc) + 4);
+       if (k == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       k[0] = (key.dsize + 3) / 4 + 1;
+       memcpy(&k[1], key.dptr, key.dsize);
+
+       sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]);
+       if (sr != NULL) {
+               talloc_free(tmp_ctx);
+               return 0;
+       }
+
+       sr = talloc(ctdb_db->sticky_records, struct ctdb_sticky_record);
+       if (sr == NULL) {
+               talloc_free(tmp_ctx);
+               DEBUG(DEBUG_ERR,("Failed to allocate sticky record structure\n"));
+               return -1;
+       }
+
+       sr->ctdb    = ctdb;
+       sr->ctdb_db = ctdb_db;
+       sr->pindown = NULL;
+
+       DEBUG(DEBUG_ERR,("Make record sticky for %d seconds in db %s key:0x%08x.\n",
+                        ctdb->tunable.sticky_duration,
+                        ctdb_db->db_name, ctdb_hash(&key)));
+
+       trbt_insertarray32_callback(ctdb_db->sticky_records, k[0], &k[0], ctdb_make_sticky_record_callback, sr);
+
+       event_add_timed(ctdb->ev, sr, timeval_current_ofs(ctdb->tunable.sticky_duration, 0), ctdb_sticky_record_timeout, sr);
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+struct pinned_down_requeue_handle {
+       struct ctdb_context *ctdb;
+       struct ctdb_req_header *hdr;
+};
+
+struct pinned_down_deferred_call {
+       struct ctdb_context *ctdb;
+       struct ctdb_req_header *hdr;
+};
+
+static void pinned_down_requeue(struct event_context *ev, struct timed_event *te, 
+                      struct timeval t, void *private_data)
+{
+       struct pinned_down_requeue_handle *handle = talloc_get_type(private_data, struct pinned_down_requeue_handle);
+       struct ctdb_context *ctdb = handle->ctdb;
+
+       talloc_steal(ctdb, handle->hdr);
+       ctdb_call_input_pkt(ctdb, handle->hdr);
+
+       talloc_free(handle);
+}
+
+static int pinned_down_destructor(struct pinned_down_deferred_call *pinned_down)
+{
+       struct ctdb_context *ctdb = pinned_down->ctdb;
+       struct pinned_down_requeue_handle *handle = talloc(ctdb, struct pinned_down_requeue_handle);
+
+       handle->ctdb = pinned_down->ctdb;
+       handle->hdr  = pinned_down->hdr;
+       talloc_steal(handle, handle->hdr);
+
+       event_add_timed(ctdb->ev, handle, timeval_zero(), pinned_down_requeue, handle);
+
+       return 0;
+}
+
+static int
+ctdb_defer_pinned_down_request(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       uint32_t *k;
+       struct ctdb_sticky_record *sr;
+       struct pinned_down_deferred_call *pinned_down;
+
+       k = talloc_zero_size(tmp_ctx, ((key.dsize + 3) & 0xfffffffc) + 4);
+       if (k == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       k[0] = (key.dsize + 3) / 4 + 1;
+       memcpy(&k[1], key.dptr, key.dsize);
+
+       sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]);
+       if (sr == NULL) {
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+
+       if (sr->pindown == NULL) {
+               return -1;
+       }
+       
+       pinned_down = talloc(sr->pindown, struct pinned_down_deferred_call);
+       if (pinned_down == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate structure for deferred pinned down request\n"));
+               return -1;
+       }
+
+       pinned_down->ctdb = ctdb;
+       pinned_down->hdr  = hdr;
+
+       talloc_set_destructor(pinned_down, pinned_down_destructor);
+       talloc_steal(pinned_down, hdr);
+
+       return 0;
+}
+
+static void
+ctdb_update_db_stat_hot_keys(struct ctdb_db_context *ctdb_db, TDB_DATA key, int hopcount)
+{
+       int i, id;
+
+       /* smallest value is always at index 0 */
+       if (hopcount <= ctdb_db->statistics.hot_keys[0].count) {
+               return;
+       }
+
+       /* see if we already know this key */
+       for (i = 0; i < MAX_HOT_KEYS; i++) {
+               if (key.dsize != ctdb_db->statistics.hot_keys[i].key.dsize) {
+                       continue;
+               }
+               if (memcmp(key.dptr, ctdb_db->statistics.hot_keys[i].key.dptr, key.dsize)) {
+                       continue;
+               }
+               /* found an entry for this key */
+               if (hopcount <= ctdb_db->statistics.hot_keys[i].count) {
+                       return;
+               }
+               ctdb_db->statistics.hot_keys[i].count = hopcount;
+               goto sort_keys;
+       }
+
+       if (ctdb_db->statistics.num_hot_keys < MAX_HOT_KEYS) {
+               id = ctdb_db->statistics.num_hot_keys;
+               ctdb_db->statistics.num_hot_keys++;
+       } else {
+               id = 0;
+       }
+
+       if (ctdb_db->statistics.hot_keys[id].key.dptr != NULL) {
+               talloc_free(ctdb_db->statistics.hot_keys[id].key.dptr);
+       }
+       ctdb_db->statistics.hot_keys[id].key.dsize = key.dsize;
+       ctdb_db->statistics.hot_keys[id].key.dptr  = talloc_memdup(ctdb_db, key.dptr, key.dsize);
+       ctdb_db->statistics.hot_keys[id].count = hopcount;
+       DEBUG(DEBUG_NOTICE,("Updated hot key database=%s key=0x%08x id=%d hop_count=%d\n",
+                           ctdb_db->db_name, ctdb_hash(&key), id, hopcount));
+
+sort_keys:
+       for (i = 1; i < MAX_HOT_KEYS; i++) {
+               if (ctdb_db->statistics.hot_keys[i].count == 0) {
+                       continue;
+               }
+               if (ctdb_db->statistics.hot_keys[i].count < ctdb_db->statistics.hot_keys[0].count) {
+                       hopcount = ctdb_db->statistics.hot_keys[i].count;
+                       ctdb_db->statistics.hot_keys[i].count = ctdb_db->statistics.hot_keys[0].count;
+                       ctdb_db->statistics.hot_keys[0].count = hopcount;
+
+                       key = ctdb_db->statistics.hot_keys[i].key;
+                       ctdb_db->statistics.hot_keys[i].key = ctdb_db->statistics.hot_keys[0].key;
+                       ctdb_db->statistics.hot_keys[0].key = key;
+               }
+       }
+}
  
  /*
    called when a CTDB_REQ_CALL packet comes in
@@ -451,6 +728,7 @@ void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
         struct ctdb_ltdb_header header;
         struct ctdb_call *call;
         struct ctdb_db_context *ctdb_db;
+       int tmp_count, bucket;
  
         if (ctdb->methods == NULL) {
                 DEBUG(DEBUG_INFO,(__location__ " Failed ctdb_request_call. Transport is DOWN\n"));
@@ -474,43 +752,196 @@ void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
         call->key.dsize = c->keylen;
         call->call_data.dptr = c->data + c->keylen;
         call->call_data.dsize = c->calldatalen;
+       call->reply_data.dptr  = NULL;
+       call->reply_data.dsize = 0;
+
+
+       /* If this record is pinned down we should defer the
+          request until the pindown times out
+       */
+       if (ctdb_db->sticky) {
+               if (ctdb_defer_pinned_down_request(ctdb, ctdb_db, call->key, hdr) == 0) {
+                       DEBUG(DEBUG_WARNING,
+                             ("Defer request for pinned down record in %s\n", ctdb_db->db_name));
+                       talloc_free(call);
+                       return;
+               }
+       }
+
  
         /* determine if we are the dmaster for this key. This also
            fetches the record data (if any), thus avoiding a 2nd fetch of the data 
            if the call will be answered locally */
  
         ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, call->key, &header, hdr, &data,
-                                          ctdb_call_input_pkt, ctdb, False);
+                                          ctdb_call_input_pkt, ctdb, false);
         if (ret == -1) {
                 ctdb_send_error(ctdb, hdr, ret, "ltdb fetch failed in ctdb_request_call");
+               talloc_free(call);
                 return;
         }
         if (ret == -2) {
                 DEBUG(DEBUG_INFO,(__location__ " deferred ctdb_request_call\n"));
+               talloc_free(call);
                 return;
         }
  
-       /* if we are not the dmaster, then send a redirect to the
-          requesting node */
-       if (header.dmaster != ctdb->pnn) {
+       /* Dont do READONLY if we dont have a tracking database */
+       if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db->readonly) {
+               c->flags &= ~CTDB_WANT_READONLY;
+       }
+
+       if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
+               header.flags &= ~CTDB_REC_RO_FLAGS;
+               CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
+               CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
+               if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
+               }
+               /* and clear out the tracking data */
+               if (tdb_delete(ctdb_db->rottdb, call->key) != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
+               }
+       }
+
+       /* if we are revoking, we must defer all other calls until the revoke
+        * had completed.
+        */
+       if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
                 talloc_free(data.dptr);
-               ctdb_call_send_redirect(ctdb, call->key, c, &header);
+               ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+
+               if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) {
+                       ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+               }
+               talloc_free(call);
+               return;
+       }
+
+       /*
+        * If we are not the dmaster and are not hosting any delegations,
+        * then we redirect the request to the node than can answer it
+        * (the lmaster or the dmaster).
+        */
+       if ((header.dmaster != ctdb->pnn) 
+           && (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) ) {
+               talloc_free(data.dptr);
+               ctdb_call_send_redirect(ctdb, ctdb_db, call->key, c, &header);
+
+               ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+               }
+               talloc_free(call);
+               return;
+       }
+
+       if ( (!(c->flags & CTDB_WANT_READONLY))
+       && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
+               header.flags   |= CTDB_REC_RO_REVOKING_READONLY;
+               if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+               }
+               ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+
+               if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, call->key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to start record revoke");
+               }
+               talloc_free(data.dptr);
+
+               if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) {
+                       ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+               }
+               talloc_free(call);
+
+               return;
+       }               
+
+       /* If this is the first request for delegation. bump rsn and set
+        * the delegations flag
+        */
+       if ((c->flags & CTDB_WANT_READONLY)
+       &&  (c->callid == CTDB_FETCH_WITH_HEADER_FUNC)
+       &&  (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS))) {
+               header.rsn     += 3;
+               header.flags   |= CTDB_REC_RO_HAVE_DELEGATIONS;
+               if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+               }
+       }
+       if ((c->flags & CTDB_WANT_READONLY) 
+       &&  (call->call_id == CTDB_FETCH_WITH_HEADER_FUNC)) {
+               TDB_DATA tdata;
+
+               tdata = tdb_fetch(ctdb_db->rottdb, call->key);
+               if (ctdb_trackingdb_add_pnn(ctdb, &tdata, c->hdr.srcnode) != 0) {
+                       ctdb_fatal(ctdb, "Failed to add node to trackingdb");
+               }
+               if (tdb_store(ctdb_db->rottdb, call->key, tdata, TDB_REPLACE) != 0) {
+                       ctdb_fatal(ctdb, "Failed to store trackingdb data");
+               }
+               free(tdata.dptr);
  
                 ret = ctdb_ltdb_unlock(ctdb_db, call->key);
                 if (ret != 0) {
                         DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
                 }
+
+               len = offsetof(struct ctdb_reply_call, data) + data.dsize + sizeof(struct ctdb_ltdb_header);
+               r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CALL, len, 
+                                           struct ctdb_reply_call);
+               CTDB_NO_MEMORY_FATAL(ctdb, r);
+               r->hdr.destnode  = c->hdr.srcnode;
+               r->hdr.reqid     = c->hdr.reqid;
+               r->status        = 0;
+               r->datalen       = data.dsize + sizeof(struct ctdb_ltdb_header);
+               header.rsn      -= 2;
+               header.flags   |= CTDB_REC_RO_HAVE_READONLY;
+               header.flags   &= ~CTDB_REC_RO_HAVE_DELEGATIONS;
+               memcpy(&r->data[0], &header, sizeof(struct ctdb_ltdb_header));
+
+               if (data.dsize) {
+                       memcpy(&r->data[sizeof(struct ctdb_ltdb_header)], data.dptr, data.dsize);
+               }
+
+               ctdb_queue_packet(ctdb, &r->hdr);
+               CTDB_INCREMENT_STAT(ctdb, total_ro_delegations);
+               CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_delegations);
+
+               talloc_free(r);
+               talloc_free(call);
                 return;
         }
  
         CTDB_UPDATE_STAT(ctdb, max_hop_count, c->hopcount);
+       tmp_count = c->hopcount;
+       bucket = 0;
+       while (tmp_count) {
+               tmp_count >>= 2;
+               bucket++;
+       }
+       if (bucket >= MAX_COUNT_BUCKETS) {
+               bucket = MAX_COUNT_BUCKETS - 1;
+       }
+       CTDB_INCREMENT_STAT(ctdb, hop_count_bucket[bucket]);
+       CTDB_INCREMENT_DB_STAT(ctdb_db, hop_count_bucket[bucket]);
+       ctdb_update_db_stat_hot_keys(ctdb_db, call->key, c->hopcount);
+
+       /* If this database supports sticky records, then check if the
+          hopcount is big. If it is it means the record is hot and we
+          should make it sticky.
+       */
+       if (ctdb_db->sticky && c->hopcount >= ctdb->tunable.hopcount_make_sticky) {
+               ctdb_make_record_sticky(ctdb, ctdb_db, call->key);
+       }
+
  
         /* Try if possible to migrate the record off to the caller node.
          * From the clients perspective a fetch of the data is just as 
          * expensive as a migration.
          */
         if (c->hdr.srcnode != ctdb->pnn) {
-               if (ctdb_db->transaction_active) {
+               if (ctdb_db->persistent_state) {
                         DEBUG(DEBUG_INFO, (__location__ " refusing migration"
                               " of key %s while transaction is active\n",
                               (char *)call->key.dptr));
@@ -524,11 +955,16 @@ void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
                         if (ret != 0) {
                                 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
                         }
-                       return;
                 }
+               talloc_free(call);
+               return;
         }
  
-       ctdb_call_local(ctdb_db, call, &header, hdr, &data);
+       ret = ctdb_call_local(ctdb_db, call, &header, hdr, &data, true);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_call_local failed\n"));
+               call->status = -1;
+       }
  
         ret = ctdb_ltdb_unlock(ctdb_db, call->key);
         if (ret != 0) {
@@ -550,14 +986,15 @@ void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
         ctdb_queue_packet(ctdb, &r->hdr);
  
         talloc_free(r);
+       talloc_free(call);
  }
  
-/*
-  called when a CTDB_REPLY_CALL packet comes in
-
-  This packet comes in response to a CTDB_REQ_CALL request packet. It
-  contains any reply data from the call
-*/
+/**
+ * called when a CTDB_REPLY_CALL packet comes in
+ *
+ * This packet comes in response to a CTDB_REQ_CALL request packet. It
+ * contains any reply data from the call
+ */
  void ctdb_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
  {
         struct ctdb_reply_call *c = (struct ctdb_reply_call *)hdr;
@@ -575,6 +1012,66 @@ void ctdb_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
                 return;
         }
  
+
+       /* read only delegation processing */
+       /* If we got a FETCH_WITH_HEADER we should check if this is a ro
+        * delegation since we may need to update the record header
+        */
+       if (state->c->callid == CTDB_FETCH_WITH_HEADER_FUNC) {
+               struct ctdb_db_context *ctdb_db = state->ctdb_db;
+               struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)&c->data[0];
+               struct ctdb_ltdb_header oldheader;
+               TDB_DATA key, data, olddata;
+               int ret;
+
+               if (!(header->flags & CTDB_REC_RO_HAVE_READONLY)) {
+                       goto finished_ro;
+                       return;
+               }
+
+               key.dsize = state->c->keylen;
+               key.dptr  = state->c->data;
+               ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr,
+                                    ctdb_call_input_pkt, ctdb, false);
+               if (ret == -2) {
+                       return;
+               }
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to get lock in ctdb_reply_call\n"));
+                       return;
+               }
+
+               ret = ctdb_ltdb_fetch(ctdb_db, key, &oldheader, state, &olddata);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Failed to fetch old record in ctdb_reply_call\n"));
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       goto finished_ro;
+               }                       
+
+               if (header->rsn <= oldheader.rsn) {
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       goto finished_ro;
+               }
+
+               if (c->datalen < sizeof(struct ctdb_ltdb_header)) {
+                       DEBUG(DEBUG_ERR,(__location__ " Got FETCH_WITH_HEADER reply with too little data: %d bytes\n", c->datalen));
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       goto finished_ro;
+               }
+
+               data.dsize = c->datalen - sizeof(struct ctdb_ltdb_header);
+               data.dptr  = &c->data[sizeof(struct ctdb_ltdb_header)];
+               ret = ctdb_ltdb_store(ctdb_db, key, header, data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Failed to store new record in ctdb_reply_call\n"));
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       goto finished_ro;
+               }                       
+
+               ctdb_ltdb_unlock(ctdb_db, key);
+       }
+finished_ro:
+
         state->call->reply_data.dptr = c->data;
         state->call->reply_data.dsize = c->datalen;
         state->call->status = c->status;
@@ -588,13 +1085,13 @@ void ctdb_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
  }
  
  
-/*
-  called when a CTDB_REPLY_DMASTER packet comes in
-
-  This packet comes in from the lmaster response to a CTDB_REQ_CALL
-  request packet. It means that the current dmaster wants to give us
-  the dmaster role
-*/
+/**
+ * called when a CTDB_REPLY_DMASTER packet comes in
+ *
+ * This packet comes in from the lmaster in response to a CTDB_REQ_CALL
+ * request packet. It means that the current dmaster wants to give us
+ * the dmaster role.
+ */
  void ctdb_reply_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
  {
         struct ctdb_reply_dmaster *c = (struct ctdb_reply_dmaster *)hdr;
@@ -621,7 +1118,7 @@ void ctdb_reply_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
         }
  
         ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr,
-                                    ctdb_call_input_pkt, ctdb, False);
+                                    ctdb_call_input_pkt, ctdb, false);
         if (ret == -2) {
                 return;
         }
@@ -751,7 +1248,10 @@ struct ctdb_call_state *ctdb_call_local_send(struct ctdb_db_context *ctdb_db,
         *(state->call) = *call;
         state->ctdb_db = ctdb_db;
  
-       ret = ctdb_call_local(ctdb_db, state->call, header, state, data);
+       ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true);
+       if (ret != 0) {
+               DEBUG(DEBUG_DEBUG,("ctdb_call_local() failed, ignoring return code %d\n", ret));
+       }
  
         event_add_timed(ctdb->ev, state, timeval_zero(), call_local_trigger, state);
  
@@ -875,3 +1375,365 @@ void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode)
  
         talloc_free(r);
  }
+
+
+
+struct revokechild_deferred_call {
+       struct ctdb_context *ctdb;
+       struct ctdb_req_header *hdr;
+       deferred_requeue_fn fn;
+       void *ctx;
+};
+
+struct revokechild_handle {
+       struct revokechild_handle *next, *prev;
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       struct fd_event *fde;
+       int status;
+       int fd[2];
+       pid_t child;
+       TDB_DATA key;
+};
+
+struct revokechild_requeue_handle {
+       struct ctdb_context *ctdb;
+       struct ctdb_req_header *hdr;
+       deferred_requeue_fn fn;
+       void *ctx;
+};
+
+static void deferred_call_requeue(struct event_context *ev, struct timed_event *te, 
+                      struct timeval t, void *private_data)
+{
+       struct revokechild_requeue_handle *requeue_handle = talloc_get_type(private_data, struct revokechild_requeue_handle);
+
+       requeue_handle->fn(requeue_handle->ctx, requeue_handle->hdr);
+       talloc_free(requeue_handle);
+}
+
+static int deferred_call_destructor(struct revokechild_deferred_call *deferred_call)
+{
+       struct ctdb_context *ctdb = deferred_call->ctdb;
+       struct revokechild_requeue_handle *requeue_handle = talloc(ctdb, struct revokechild_requeue_handle);
+       struct ctdb_req_call *c = (struct ctdb_req_call *)deferred_call->hdr;
+
+       requeue_handle->ctdb = ctdb;
+       requeue_handle->hdr  = deferred_call->hdr;
+       requeue_handle->fn   = deferred_call->fn;
+       requeue_handle->ctx  = deferred_call->ctx;
+       talloc_steal(requeue_handle, requeue_handle->hdr);
+
+       /* when revoking, any READONLY requests have 1 second grace to let read/write finish first */
+       event_add_timed(ctdb->ev, requeue_handle, timeval_current_ofs(c->flags & CTDB_WANT_READONLY ? 1 : 0, 0), deferred_call_requeue, requeue_handle);
+
+       return 0;
+}
+
+
+static int revokechild_destructor(struct revokechild_handle *rc)
+{
+       if (rc->fde != NULL) {
+               talloc_free(rc->fde);
+       }
+
+       if (rc->fd[0] != -1) {
+               close(rc->fd[0]);
+       }
+       if (rc->fd[1] != -1) {
+               close(rc->fd[1]);
+       }
+       ctdb_kill(rc->ctdb, rc->child, SIGKILL);
+
+       DLIST_REMOVE(rc->ctdb_db->revokechild_active, rc);
+       return 0;
+}
+
+static void revokechild_handler(struct event_context *ev, struct fd_event *fde, 
+                            uint16_t flags, void *private_data)
+{
+       struct revokechild_handle *rc = talloc_get_type(private_data, 
+                                                    struct revokechild_handle);
+       int ret;
+       char c;
+
+       ret = read(rc->fd[0], &c, 1);
+       if (ret != 1) {
+               DEBUG(DEBUG_ERR,("Failed to read status from revokechild. errno:%d\n", errno));
+               rc->status = -1;
+               talloc_free(rc);
+               return;
+       }
+       if (c != 0) {
+               DEBUG(DEBUG_ERR,("revokechild returned failure. status:%d\n", c));
+               rc->status = -1;
+               talloc_free(rc);
+               return;
+       }
+
+       talloc_free(rc);
+}
+
+struct ctdb_revoke_state {
+       struct ctdb_db_context *ctdb_db;
+       TDB_DATA key;
+       struct ctdb_ltdb_header *header;
+       TDB_DATA data;
+       int count;
+       int status;
+       int finished;
+};
+
+static void update_record_cb(struct ctdb_client_control_state *state)
+{
+       struct ctdb_revoke_state *revoke_state;
+       int ret;
+       int32_t res;
+
+       if (state == NULL) {
+               return;
+       }
+       revoke_state = state->async.private_data;
+
+       state->async.fn = NULL;
+        ret = ctdb_control_recv(state->ctdb, state, state, NULL, &res, NULL);
+        if ((ret != 0) || (res != 0)) {
+               DEBUG(DEBUG_ERR,("Recv for revoke update record failed ret:%d res:%d\n", ret, res));
+               revoke_state->status = -1;
+       }
+
+       revoke_state->count--;
+       if (revoke_state->count <= 0) {
+               revoke_state->finished = 1;
+       }
+}
+
+static void revoke_send_cb(struct ctdb_context *ctdb, uint32_t pnn, void *private_data)
+{
+       struct ctdb_revoke_state *revoke_state = private_data;
+       struct ctdb_client_control_state *state;
+
+       state = ctdb_ctrl_updaterecord_send(ctdb, revoke_state, timeval_current_ofs(ctdb->tunable.control_timeout,0), pnn, revoke_state->ctdb_db, revoke_state->key, revoke_state->header, revoke_state->data);
+       if (state == NULL) {
+               DEBUG(DEBUG_ERR,("Failure to send update record to revoke readonly delegation\n"));
+               revoke_state->status = -1;
+               return;
+       }
+       state->async.fn           = update_record_cb;
+       state->async.private_data = revoke_state;
+
+       revoke_state->count++;
+
+}
+
+static void ctdb_revoke_timeout_handler(struct event_context *ev, struct timed_event *te, 
+                             struct timeval yt, void *private_data)
+{
+       struct ctdb_revoke_state *state = private_data;
+
+       DEBUG(DEBUG_ERR,("Timed out waiting for revoke to finish\n"));
+       state->finished = 1;
+       state->status   = -1;
+}
+
+static int ctdb_revoke_all_delegations(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA tdata, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+       struct ctdb_revoke_state *state = talloc_zero(ctdb, struct ctdb_revoke_state);
+       int status;
+       struct ctdb_ltdb_header new_header;
+       TDB_DATA new_data;
+
+       state->ctdb_db = ctdb_db;
+       state->key     = key;
+       state->header  = header;
+       state->data    = data;
+ 
+       ctdb_trackingdb_traverse(ctdb, tdata, revoke_send_cb, state);
+
+       event_add_timed(ctdb->ev, state, timeval_current_ofs(ctdb->tunable.control_timeout, 0), ctdb_revoke_timeout_handler, state);
+
+       while (state->finished == 0) {
+               event_loop_once(ctdb->ev);
+       }
+
+       status = state->status;
+       if (status != 0) {
+               talloc_free(state);
+               return staus;
+       }
+
+       if (ctdb_ltdb_lock(ctdb_db, key) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to chainlock the database in revokechild\n"));
+               talloc_free(state);
+               return -1;
+       }
+       if (ctdb_ltdb_fetch(ctdb_db, key, &new_header, state, &new_data) != 0) {
+               ctdb_ltdb_unlock(ctdb_db, key);
+               DEBUG(DEBUG_ERR,("Failed for fetch tdb record in revokechild\n"));
+               talloc_free(state);
+               return -1;
+       }
+       header->rsn++;
+       if (new_header.rsn > header->rsn) {
+               ctdb_ltdb_unlock(ctdb_db, key);
+               DEBUG(DEBUG_ERR,("RSN too high in tdb record in revokechild\n"));
+               talloc_free(state);
+               return -1;
+       }
+       if ( (new_header.flags & (CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS)) != (CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS) ) {
+               ctdb_ltdb_unlock(ctdb_db, key);
+               DEBUG(DEBUG_ERR,("Flags are wrong in tdb record in revokechild\n"));
+               talloc_free(state);
+               return -1;
+       }
+       new_header.rsn++;
+       new_header.flags |= CTDB_REC_RO_REVOKE_COMPLETE;
+       if (ctdb_ltdb_store(ctdb_db, key, &new_header, new_data) != 0) {
+               ctdb_ltdb_unlock(ctdb_db, key);
+               DEBUG(DEBUG_ERR,("Failed to write new record in revokechild\n"));
+               talloc_free(state);
+               return -1;
+       }
+       ctdb_ltdb_unlock(ctdb_db, key);
+
+       talloc_free(state);
+       return status;
+}
+
+
+int ctdb_start_revoke_ro_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+       TDB_DATA tdata;
+       struct revokechild_handle *rc;
+       pid_t parent = getpid();
+       int ret;
+
+       header->flags &= ~(CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY);
+       header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
+       header->rsn   -= 1;
+
+       if ((rc = talloc_zero(ctdb_db, struct revokechild_handle)) == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate revokechild_handle\n"));
+               return -1;
+       }
+
+       tdata = tdb_fetch(ctdb_db->rottdb, key);
+       if (tdata.dsize > 0) {
+               uint8_t *tmp;
+
+               tmp = tdata.dptr;
+               tdata.dptr = talloc_memdup(rc, tdata.dptr, tdata.dsize);
+               free(tmp);
+       }
+
+       rc->status    = 0;
+       rc->ctdb      = ctdb;
+       rc->ctdb_db   = ctdb_db;
+       rc->fd[0]     = -1;
+       rc->fd[1]     = -1;
+
+       talloc_set_destructor(rc, revokechild_destructor);
+
+       rc->key.dsize = key.dsize;
+       rc->key.dptr  = talloc_memdup(rc, key.dptr, key.dsize);
+       if (rc->key.dptr == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for revokechild_handle\n"));
+               talloc_free(rc);
+               return -1;
+       }
+
+       ret = pipe(rc->fd);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for revokechild_handle\n"));
+               talloc_free(rc);
+               return -1;
+       }
+
+
+       rc->child = ctdb_fork(ctdb);
+       if (rc->child == (pid_t)-1) {
+               DEBUG(DEBUG_ERR,("Failed to fork child for revokechild\n"));
+               talloc_free(rc);
+               return -1;
+       }
+
+       if (rc->child == 0) {
+               char c = 0;
+               close(rc->fd[0]);
+               debug_extra = talloc_asprintf(NULL, "revokechild-%s:", ctdb_db->db_name);
+
+               ctdb_set_process_name("ctdb_revokechild");
+               if (switch_from_server_to_client(ctdb, "revokechild-%s", ctdb_db->db_name) != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to switch from server to client for revokechild process\n"));
+                       c = 1;
+                       goto child_finished;
+               }
+
+               c = ctdb_revoke_all_delegations(ctdb, ctdb_db, tdata, key, header, data);
+
+child_finished:
+               write(rc->fd[1], &c, 1);
+               /* make sure we die when our parent dies */
+               while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
+                       sleep(5);
+               }
+               _exit(0);
+       }
+
+       close(rc->fd[1]);
+       rc->fd[1] = -1;
+       set_close_on_exec(rc->fd[0]);
+
+       /* This is an active revokechild child process */
+       DLIST_ADD_END(ctdb_db->revokechild_active, rc, NULL);
+
+       rc->fde = event_add_fd(ctdb->ev, rc, rc->fd[0],
+                                  EVENT_FD_READ, revokechild_handler,
+                                  (void *)rc);
+       if (rc->fde == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to set up fd event for revokechild process\n"));
+               talloc_free(rc);
+       }
+       tevent_fd_set_auto_close(rc->fde);
+
+       return 0;
+}
+
+int ctdb_add_revoke_deferred_call(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr, deferred_requeue_fn fn, void *call_context)
+{
+       struct revokechild_handle *rc;
+       struct revokechild_deferred_call *deferred_call;
+
+       for (rc = ctdb_db->revokechild_active; rc; rc = rc->next) {
+               if (rc->key.dsize == 0) {
+                       continue;
+               }
+               if (rc->key.dsize != key.dsize) {
+                       continue;
+               }
+               if (!memcmp(rc->key.dptr, key.dptr, key.dsize)) {
+                       break;
+               }
+       }
+
+       if (rc == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to add deferred call to revoke list. revoke structure not found\n"));
+               return -1;
+       }
+
+       deferred_call = talloc(rc, struct revokechild_deferred_call);
+       if (deferred_call == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate deferred call structure for revoking record\n"));
+               return -1;
+       }
+
+       deferred_call->ctdb = ctdb;
+       deferred_call->hdr  = hdr;
+       deferred_call->fn   = fn;
+       deferred_call->ctx  = call_context;
+
+       talloc_set_destructor(deferred_call, deferred_call_destructor);
+       talloc_steal(deferred_call, hdr);
+
+       return 0;
+}