dsdb: Remove dead code in partition_prep_request()
[samba.git] / source4 / dsdb / samdb / ldb_modules / partition.c
index 779b8b5a35b2737756fe855fc2d2ce1295de8a25..d44dc19320a33632a1a46e24b85d01f647c81d5d 100644 (file)
@@ -39,11 +39,12 @@ struct part_request {
 struct partition_context {
        struct ldb_module *module;
        struct ldb_request *req;
-       bool got_success;
 
        struct part_request *part_req;
-       int num_requests;
-       int finished_requests;
+       unsigned int num_requests;
+       unsigned int finished_requests;
+
+       const char **referrals;
 };
 
 static struct partition_context *partition_init_ctx(struct ldb_module *module, struct ldb_request *req)
@@ -63,60 +64,34 @@ static struct partition_context *partition_init_ctx(struct ldb_module *module, s
 }
 
 /*
- *    helper functions to call the next module in chain
- *    */
-
+ * helper functions to call the next module in chain
+ */
 int partition_request(struct ldb_module *module, struct ldb_request *request)
 {
-       int ret;
-       switch (request->operation) {
-       case LDB_SEARCH:
-               PARTITION_FIND_OP(module, search);
-               ret = module->ops->search(module, request);
-               break;
-       case LDB_ADD:
-               PARTITION_FIND_OP(module, add);
-               ret = module->ops->add(module, request);
-               break;
-       case LDB_MODIFY:
-               PARTITION_FIND_OP(module, modify);
-               ret = module->ops->modify(module, request);
-               break;
-       case LDB_DELETE:
-               PARTITION_FIND_OP(module, del);
-               ret = module->ops->del(module, request);
-               break;
-       case LDB_RENAME:
-               PARTITION_FIND_OP(module, rename);
-               ret = module->ops->rename(module, request);
-               break;
-       case LDB_EXTENDED:
-               PARTITION_FIND_OP(module, extended);
-               ret = module->ops->extended(module, request);
-               break;
-       default:
-               PARTITION_FIND_OP(module, request);
-               ret = module->ops->request(module, request);
-               break;
-       }
-       if (ret == LDB_SUCCESS) {
-               return ret;
-       }
-       if (!ldb_errstring(ldb_module_get_ctx(module))) {
-               /* Set a default error string, to place the blame somewhere */
-               ldb_asprintf_errstring(ldb_module_get_ctx(module),
-                                       "error in module %s: %s (%d)",
-                                       module->ops->name,
-                                       ldb_strerror(ret), ret);
+       if ((module && ldb_module_flags(ldb_module_get_ctx(module)) & LDB_FLG_ENABLE_TRACING)) { \
+               const struct dsdb_control_current_partition *partition = NULL;
+               struct ldb_control *partition_ctrl = ldb_request_get_control(request, DSDB_CONTROL_CURRENT_PARTITION_OID);
+               if (partition_ctrl) {
+                       partition = talloc_get_type(partition_ctrl->data,
+                                                   struct dsdb_control_current_partition);
+               }
+
+               if (partition != NULL) {
+                       ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_request() -> %s",
+                                 ldb_dn_get_linearized(partition->dn));                        
+               } else {
+                       ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_request() -> (metadata partition)");
+               }
        }
-       return ret;
+
+       return ldb_next_request(module, request);
 }
 
 static struct dsdb_partition *find_partition(struct partition_private_data *data,
                                             struct ldb_dn *dn,
                                             struct ldb_request *req)
 {
-       int i;
+       unsigned int i;
        struct ldb_control *partition_ctrl;
 
        /* see if the request has the partition DN specified in a
@@ -159,11 +134,9 @@ static int partition_req_callback(struct ldb_request *req,
        struct ldb_module *module;
        struct ldb_request *nreq;
        int ret;
-       struct partition_private_data *data;
        struct ldb_control *partition_ctrl;
 
        ac = talloc_get_type(req->context, struct partition_context);
-       data = talloc_get_type(ac->module->private_data, struct partition_private_data);
 
        if (!ares) {
                return ldb_module_done(ac->req, NULL, NULL,
@@ -174,26 +147,25 @@ static int partition_req_callback(struct ldb_request *req,
        if (partition_ctrl && (ac->num_requests == 1 || ares->type == LDB_REPLY_ENTRY)) {
                /* If we didn't fan this request out to multiple partitions,
                 * or this is an individual search result, we can
-                * deterministily tell the caller what partition this was
+                * deterministically tell the caller what partition this was
                 * written to (repl_meta_data likes to know) */
-                       ret = ldb_reply_add_control(ares,
-                                                   DSDB_CONTROL_CURRENT_PARTITION_OID,
-                                                   false, partition_ctrl->data);
-                       if (ret != LDB_SUCCESS) {
-                               return ldb_module_done(ac->req, NULL, NULL,
-                                                      ret);
-                       }
+               ret = ldb_reply_add_control(ares,
+                                           DSDB_CONTROL_CURRENT_PARTITION_OID,
+                                           false, partition_ctrl->data);
+               if (ret != LDB_SUCCESS) {
+                       return ldb_module_done(ac->req, NULL, NULL,
+                                              ret);
+               }
        }
 
-       if (ares->error != LDB_SUCCESS && !ac->got_success) {
+       if (ares->error != LDB_SUCCESS) {
                return ldb_module_done(ac->req, ares->controls,
                                        ares->response, ares->error);
        }
 
        switch (ares->type) {
        case LDB_REPLY_REFERRAL:
-               /* ignore referrals for now */
-               break;
+               return ldb_module_send_referral(ac->req, ares->referral);
 
        case LDB_REPLY_ENTRY:
                if (ac->req->operation != LDB_SEARCH) {
@@ -207,9 +179,6 @@ static int partition_req_callback(struct ldb_request *req,
                return ldb_module_send_entry(ac->req, ares->message, ares->controls);
 
        case LDB_REPLY_DONE:
-               if (ares->error == LDB_SUCCESS) {
-                       ac->got_success = true;
-               }
                if (ac->req->operation == LDB_EXTENDED) {
                        /* FIXME: check for ares->response, replmd does not fill it ! */
                        if (ares->response) {
@@ -227,10 +196,23 @@ static int partition_req_callback(struct ldb_request *req,
 
                ac->finished_requests++;
                if (ac->finished_requests == ac->num_requests) {
+                       /* Send back referrals if they do exist (search ops) */
+                       if (ac->referrals != NULL) {
+                               const char **ref;
+                               for (ref = ac->referrals; *ref != NULL; ++ref) {
+                                       ret = ldb_module_send_referral(ac->req,
+                                                                      talloc_strdup(ac->req, *ref));
+                                       if (ret != LDB_SUCCESS) {
+                                               return ldb_module_done(ac->req, NULL, NULL,
+                                                                      ret);
+                                       }
+                               }
+                       }
+
                        /* this was the last one, call callback */
                        return ldb_module_done(ac->req, ares->controls,
                                               ares->response, 
-                                              ac->got_success?LDB_SUCCESS:ares->error);
+                                              ares->error);
                }
 
                /* not the last, now call the next one */
@@ -255,13 +237,14 @@ static int partition_prep_request(struct partition_context *ac,
 {
        int ret;
        struct ldb_request *req;
+       struct ldb_control *partition_ctrl = NULL;
+       void *part_data = NULL;
 
        ac->part_req = talloc_realloc(ac, ac->part_req,
                                        struct part_request,
                                        ac->num_requests + 1);
        if (ac->part_req == NULL) {
-               ldb_oom(ldb_module_get_ctx(ac->module));
-               return LDB_ERR_OPERATIONS_ERROR;
+               return ldb_oom(ldb_module_get_ctx(ac->module));
        }
 
        switch (ac->req->operation) {
@@ -275,6 +258,7 @@ static int partition_prep_request(struct partition_context *ac,
                                        ac->req->controls,
                                        ac, partition_req_callback,
                                        ac->req);
+               LDB_REQ_SET_LOCATION(req);
                break;
        case LDB_ADD:
                ret = ldb_build_add_req(&req, ldb_module_get_ctx(ac->module), ac->part_req,
@@ -282,6 +266,7 @@ static int partition_prep_request(struct partition_context *ac,
                                        ac->req->controls,
                                        ac, partition_req_callback,
                                        ac->req);
+               LDB_REQ_SET_LOCATION(req);
                break;
        case LDB_MODIFY:
                ret = ldb_build_mod_req(&req, ldb_module_get_ctx(ac->module), ac->part_req,
@@ -289,6 +274,7 @@ static int partition_prep_request(struct partition_context *ac,
                                        ac->req->controls,
                                        ac, partition_req_callback,
                                        ac->req);
+               LDB_REQ_SET_LOCATION(req);
                break;
        case LDB_DELETE:
                ret = ldb_build_del_req(&req, ldb_module_get_ctx(ac->module), ac->part_req,
@@ -296,6 +282,7 @@ static int partition_prep_request(struct partition_context *ac,
                                        ac->req->controls,
                                        ac, partition_req_callback,
                                        ac->req);
+               LDB_REQ_SET_LOCATION(req);
                break;
        case LDB_RENAME:
                ret = ldb_build_rename_req(&req, ldb_module_get_ctx(ac->module), ac->part_req,
@@ -304,6 +291,7 @@ static int partition_prep_request(struct partition_context *ac,
                                        ac->req->controls,
                                        ac, partition_req_callback,
                                        ac->req);
+               LDB_REQ_SET_LOCATION(req);
                break;
        case LDB_EXTENDED:
                ret = ldb_build_extended_req(&req, ldb_module_get_ctx(ac->module),
@@ -313,6 +301,7 @@ static int partition_prep_request(struct partition_context *ac,
                                        ac->req->controls,
                                        ac, partition_req_callback,
                                        ac->req);
+               LDB_REQ_SET_LOCATION(req);
                break;
        default:
                ldb_set_errstring(ldb_module_get_ctx(ac->module),
@@ -327,40 +316,43 @@ static int partition_prep_request(struct partition_context *ac,
        ac->part_req[ac->num_requests].req = req;
 
        if (ac->req->controls) {
-               req->controls = talloc_memdup(req, ac->req->controls,
-                                       talloc_get_size(ac->req->controls));
-               if (req->controls == NULL) {
-                       ldb_oom(ldb_module_get_ctx(ac->module));
-                       return LDB_ERR_OPERATIONS_ERROR;
+               /* Duplicate everything except the current partition control */
+               partition_ctrl = ldb_request_get_control(ac->req,
+                                                        DSDB_CONTROL_CURRENT_PARTITION_OID);
+               if (!ldb_save_controls(partition_ctrl, req, NULL)) {
+                       return ldb_module_oom(ac->module);
                }
        }
 
-       if (partition) {
-               ac->part_req[ac->num_requests].module = partition->module;
+       part_data = partition->ctrl;
 
-               if (!ldb_request_get_control(req, DSDB_CONTROL_CURRENT_PARTITION_OID)) {
-                       ret = ldb_request_add_control(req,
-                                                     DSDB_CONTROL_CURRENT_PARTITION_OID,
-                                                     false, partition->ctrl);
-                       if (ret != LDB_SUCCESS) {
-                               return ret;
-                       }
+       ac->part_req[ac->num_requests].module = partition->module;
+
+       if (partition_ctrl != NULL) {
+               if (partition_ctrl->data != NULL) {
+                       part_data = partition_ctrl->data;
                }
 
-               if (req->operation == LDB_SEARCH) {
-                       /* If the search is for 'more' than this partition,
-                        * then change the basedn, so a remote LDAP server
-                        * doesn't object */
-                       if (ldb_dn_compare_base(partition->ctrl->dn,
-                                               req->op.search.base) != 0) {
-                               req->op.search.base = partition->ctrl->dn;
-                       }
+               /*
+                * If the provided current partition control is without
+                * data then use the calculated one.
+                */
+               ret = ldb_request_add_control(req,
+                                             DSDB_CONTROL_CURRENT_PARTITION_OID,
+                                             false, part_data);
+               if (ret != LDB_SUCCESS) {
+                       return ret;
                }
+       }
 
-       } else {
-               /* make sure you put the NEXT module here, or
-                * partition_request() will simply loop forever on itself */
-               ac->part_req[ac->num_requests].module = ac->module->next;
+       if (req->operation == LDB_SEARCH) {
+               /* If the search is for 'more' than this partition,
+                * then change the basedn, so a remote LDAP server
+                * doesn't object */
+               if (ldb_dn_compare_base(partition->ctrl->dn,
+                                       req->op.search.base) != 0) {
+                       req->op.search.base = partition->ctrl->dn;
+               }
        }
 
        ac->num_requests++;
@@ -374,19 +366,17 @@ static int partition_call_first(struct partition_context *ac)
 }
 
 /**
- * Send a request down to all the partitions
+ * Send a request down to all the partitions (but not the sam.ldb file)
  */
 static int partition_send_all(struct ldb_module *module, 
                              struct partition_context *ac, 
                              struct ldb_request *req) 
 {
-       int i;
-       struct partition_private_data *data = talloc_get_type(module->private_data, 
+       unsigned int i;
+       struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
                                                              struct partition_private_data);
-       int ret = partition_prep_request(ac, NULL);
-       if (ret != LDB_SUCCESS) {
-               return ret;
-       }
+       int ret;
+
        for (i=0; data && data->partitions && data->partitions[i]; i++) {
                ret = partition_prep_request(ac, data->partitions[i]);
                if (ret != LDB_SUCCESS) {
@@ -398,6 +388,296 @@ static int partition_send_all(struct ldb_module *module,
        return partition_call_first(ac);
 }
 
+struct partition_copy_context { /* filled in by partition_copy_all(), read back from req->context by partition_copy_all_callback_handler() */
+       struct ldb_module *module; /* the module partition_copy_all() was called with */
+       struct partition_context *partition_context; /* the partition context partition_copy_all() was called with */
+       struct ldb_request *request; /* the original request; replies are sent against it */
+       struct ldb_dn *dn; /* the DN that is re-read and copied once the operation is done */
+};
+
+/*
+ * A special DN has been updated in the primary partition. Now propagate those
+ * changes to the remaining partitions.
+ *
+ * Note that the operations are asynchronous and this function is called
+ *       from partition_copy_all_callback_handler in response to an async
+ *       callback.
+ *
+ * @param module the module whose private data (struct partition_private_data)
+ *               lists the partitions to update
+ * @param ac     the partition context; only passed on to
+ *               dsdb_module_search_dn() here
+ * @param req    the original request; passed on to the dsdb_module_* calls,
+ *               used as first argument of ldb_msg_copy() and completed
+ *               with ldb_module_done() on the success paths
+ * @param dn     the DN to re-read and copy to every partition
+ *
+ * @return the first non-tolerated LDB error, otherwise the return value of
+ *         ldb_module_done(req, NULL, NULL, LDB_SUCCESS)
+ */
+static int partition_copy_all_callback_action(
+       struct ldb_module *module,
+       struct partition_context *ac,
+       struct ldb_request *req,
+       struct ldb_dn *dn)
+
+{
+
+       unsigned int i;
+       struct partition_private_data *data =
+               talloc_get_type(
+                       ldb_module_get_private(module),
+                       struct partition_private_data);
+       int search_ret;
+       struct ldb_result *res;
+       /* now fetch the resulting object, and then copy it to all the
+        * other partitions. We need this approach to cope with the
+        * partitions getting out of sync. If for example the
+        * @ATTRIBUTES object exists on one partition but not the
+        * others then just doing each of the partitions in turn will
+        * lead to an error
+        */
+       search_ret = dsdb_module_search_dn(module, ac, &res, dn, NULL, DSDB_FLAG_NEXT_MODULE, req);
+       if (search_ret != LDB_SUCCESS) {
+               return search_ret;
+       }
+
+       /* now delete the object in the other partitions, if required.
+        *
+        * NOTE(review): the check directly above already returns for
+        * every search_ret other than LDB_SUCCESS, so this branch looks
+        * unreachable as written -- confirm how LDB_ERR_NO_SUCH_OBJECT
+        * is meant to be handled.
+       */
+       if (search_ret == LDB_ERR_NO_SUCH_OBJECT) {
+               for (i=0; data->partitions && data->partitions[i]; i++) {
+                       int pret;
+                       pret = dsdb_module_del(data->partitions[i]->module,
+                                              dn,
+                                              DSDB_FLAG_NEXT_MODULE,
+                                              req);
+                       if (pret != LDB_SUCCESS && pret != LDB_ERR_NO_SUCH_OBJECT) {
+                               /* we should only get success or no
+                                  such object from the other partitions */
+                               return pret;
+                       }
+               }
+
+               return ldb_module_done(req, NULL, NULL, LDB_SUCCESS);
+       }
+
+       /* now add/modify in the other partitions: try an add first and,
+        * only if the entry already exists, fall back to a modify that
+        * replaces every element */
+       for (i=0; data->partitions && data->partitions[i]; i++) {
+               struct ldb_message *modify_msg = NULL;
+               int pret;
+               unsigned int el_idx;
+
+               pret = dsdb_module_add(data->partitions[i]->module,
+                                      res->msgs[0],
+                                      DSDB_FLAG_NEXT_MODULE,
+                                      req);
+               if (pret == LDB_SUCCESS) {
+                       continue;
+               }
+
+               if (pret != LDB_ERR_ENTRY_ALREADY_EXISTS) {
+                       return pret;
+               }
+
+               modify_msg = ldb_msg_copy(req, res->msgs[0]);
+               if (modify_msg == NULL) {
+                       return ldb_module_oom(module);
+               }
+
+               /*
+                * mark all the message elements as
+                * LDB_FLAG_MOD_REPLACE
+                */
+               for (el_idx=0;
+                    el_idx < modify_msg->num_elements;
+                    el_idx++) {
+                       modify_msg->elements[el_idx].flags
+                               = LDB_FLAG_MOD_REPLACE;
+               }
+
+               if (req->operation == LDB_MODIFY) {
+                       const struct ldb_message *req_msg = req->op.mod.message;
+                       /*
+                        * mark elements to be removed: if they were
+                        * deleted entirely above we need to delete
+                        * them here too. An element of the request that
+                        * is flagged delete, or replace with no values,
+                        * and is absent from the fetched object is added
+                        * as an empty LDB_FLAG_MOD_REPLACE element.
+                        */
+                       for (el_idx=0; el_idx < req_msg->num_elements; el_idx++) {
+                               if (req_msg->elements[el_idx].flags & LDB_FLAG_MOD_DELETE
+                                   || ((req_msg->elements[el_idx].flags & LDB_FLAG_MOD_REPLACE) &&
+                                       req_msg->elements[el_idx].num_values == 0)) {
+                                       if (ldb_msg_find_element(modify_msg,
+                                                                req_msg->elements[el_idx].name) != NULL) {
+                                               continue;
+                                       }
+                                       pret = ldb_msg_add_empty(
+                                               modify_msg,
+                                               req_msg->elements[el_idx].name,
+                                               LDB_FLAG_MOD_REPLACE,
+                                               NULL);
+                                       if (pret != LDB_SUCCESS) {
+                                               return pret;
+                                       }
+                               }
+                       }
+               }
+
+               pret = dsdb_module_modify(data->partitions[i]->module,
+                                         modify_msg,
+                                         DSDB_FLAG_NEXT_MODULE,
+                                         req);
+
+               if (pret != LDB_SUCCESS) {
+                       return pret;
+               }
+       }
+
+       return ldb_module_done(req, NULL, NULL, LDB_SUCCESS);
+}
+
+
+/*
+ * @brief call back function for the ldb operations on special DN's.
+ *
+ * As the LDB operations are async, and we wish to use the result of
+ * the operations, a callback needs to be registered to process the results
+ * of the LDB operations.
+ *
+ * Entries and referrals are forwarded unchanged to the original request.
+ * When the operation is done and succeeded,
+ * partition_copy_all_callback_action() is run and its return value is
+ * reported as the final status; otherwise the error carried by the reply
+ * is reported.
+ *
+ * NOTE(review): partition_copy_all_callback_action() itself calls
+ * ldb_module_done() on the same request on its success paths, and this
+ * handler then calls ldb_module_done() again -- confirm this is intended.
+ *
+ * @param req the ldb request; req->context is read as a
+ *            struct partition_copy_context
+ * @param ares the result of the operation; a NULL reply completes the
+ *             original request with LDB_ERR_OPERATIONS_ERROR
+ *
+ * @return the LDB_STATUS
+ */
+static int partition_copy_all_callback_handler(
+       struct ldb_request *req,
+       struct ldb_reply *ares)
+{
+       struct partition_copy_context *ac = NULL;
+
+       ac = talloc_get_type(
+               req->context,
+               struct partition_copy_context);
+
+       if (!ares) {
+               return ldb_module_done(
+                       ac->request,
+                       NULL,
+                       NULL,
+                       LDB_ERR_OPERATIONS_ERROR);
+       }
+
+       /* pass on to the callback */
+       switch (ares->type) {
+       case LDB_REPLY_ENTRY:
+               return ldb_module_send_entry(
+                       ac->request,
+                       ares->message,
+                       ares->controls);
+
+       case LDB_REPLY_REFERRAL:
+               return ldb_module_send_referral(
+                       ac->request,
+                       ares->referral);
+
+       case LDB_REPLY_DONE: {
+               int error = ares->error;
+               if (error == LDB_SUCCESS) {
+                       error = partition_copy_all_callback_action(
+                               ac->module,
+                               ac->partition_context,
+                               ac->request,
+                               ac->dn);
+               }
+               return ldb_module_done(
+                       ac->request,
+                       ares->controls,
+                       ares->response,
+                       error);
+       }
+
+       default:
+               /* Can't happen */
+               return LDB_ERR_OPERATIONS_ERROR;
+       }
+}
+
+/**
+ * send an operation to the top partition, then copy the resulting
+ * object to all other partitions.
+ *
+ * A struct partition_copy_context is allocated on req and a new request of
+ * the same type as req (add, modify, delete or rename) is built with
+ * partition_copy_all_callback_handler as its callback, then handed to
+ * ldb_next_request(). The copying itself happens in that callback.
+ *
+ * @param module            the module the request arrived at
+ * @param partition_context the partition context, stored for the callback
+ * @param req               the original request
+ * @param dn                the DN being operated on, stored for the callback
+ *
+ * @return the result of ldb_oom() if the context cannot be allocated, the
+ *         error from building the new request, LDB_ERR_OPERATIONS_ERROR
+ *         for any other operation type, otherwise the result of
+ *         ldb_next_request()
+ */
+static int partition_copy_all(
+       struct ldb_module *module,
+       struct partition_context *partition_context,
+       struct ldb_request *req,
+       struct ldb_dn *dn)
+{
+       struct ldb_request *new_req = NULL;
+       struct ldb_context *ldb = NULL;
+       struct partition_copy_context *context = NULL;
+
+       int ret;
+
+       ldb = ldb_module_get_ctx(module);
+
+       context = talloc_zero(req, struct partition_copy_context);
+       if (context == NULL) {
+               return ldb_oom(ldb);
+       }
+       context->module = module;
+       context->request = req;
+       context->dn = dn;
+       context->partition_context = partition_context;
+
+       switch (req->operation) {
+       case LDB_ADD:
+               ret = ldb_build_add_req(
+                       &new_req,
+                       ldb,
+                       req,
+                       req->op.add.message,
+                       req->controls,
+                       context,
+                       partition_copy_all_callback_handler,
+                       req);
+               break;
+       case LDB_MODIFY:
+               ret = ldb_build_mod_req(
+                       &new_req,
+                       ldb,
+                       req,
+                       req->op.mod.message,
+                       req->controls,
+                       context,
+                       partition_copy_all_callback_handler,
+                       req);
+               break;
+       case LDB_DELETE:
+               ret = ldb_build_del_req(
+                       &new_req,
+                       ldb,
+                       req,
+                       req->op.del.dn,
+                       req->controls,
+                       context,
+                       partition_copy_all_callback_handler,
+                       req);
+               break;
+       case LDB_RENAME:
+               ret = ldb_build_rename_req(
+                       &new_req,
+                       ldb,
+                       req,
+                       req->op.rename.olddn,
+                       req->op.rename.newdn,
+                       req->controls,
+                       context,
+                       partition_copy_all_callback_handler,
+                       req);
+               break;
+       default:
+               /*
+                * Shouldn't happen: log the unexpected operation type
+                * and fail the request.
+                */
+               ldb_debug(
+                       ldb,
+                       LDB_DEBUG_ERROR,
+                       "Unexpected operation type (%d)\n", req->operation);
+               ret = LDB_ERR_OPERATIONS_ERROR;
+               break;
+       }
+       if (ret != LDB_SUCCESS) {
+               return ret;
+       }
+       return ldb_next_request(module, new_req);
+}
 /**
  * Figure out which backend a request needs to be aimed at.  Some
  * requests must be replicated to all backends
@@ -405,16 +685,18 @@ static int partition_send_all(struct ldb_module *module,
 static int partition_replicate(struct ldb_module *module, struct ldb_request *req, struct ldb_dn *dn) 
 {
        struct partition_context *ac;
-       unsigned i;
+       unsigned int i;
        int ret;
        struct dsdb_partition *partition;
-       struct partition_private_data *data = talloc_get_type(module->private_data, 
+       struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
                                                              struct partition_private_data);
+
+       /* if we aren't initialised yet, just pass the request on to the next module */
        if (!data || !data->partitions) {
                return ldb_next_request(module, req);
        }
 
-       if (req->operation != LDB_SEARCH) {
+       if (ldb_dn_is_special(dn)) {
                /* Is this a special DN, we need to replicate to every backend? */
                for (i=0; data->replicate && data->replicate[i]; i++) {
                        if (ldb_dn_compare(data->replicate[i], 
@@ -422,10 +704,10 @@ static int partition_replicate(struct ldb_module *module, struct ldb_request *re
                                
                                ac = partition_init_ctx(module, req);
                                if (!ac) {
-                                       return LDB_ERR_OPERATIONS_ERROR;
+                                       return ldb_operr(ldb_module_get_ctx(module));
                                }
                                
-                               return partition_send_all(module, ac, req);
+                               return partition_copy_all(module, ac, req, dn);
                        }
                }
        }
@@ -448,7 +730,7 @@ static int partition_replicate(struct ldb_module *module, struct ldb_request *re
 
        ac = partition_init_ctx(module, req);
        if (!ac) {
-               return LDB_ERR_OPERATIONS_ERROR;
+               return ldb_operr(ldb_module_get_ctx(module));
        }
 
        /* we need to add a control but we never touch the original request */
@@ -464,141 +746,221 @@ static int partition_replicate(struct ldb_module *module, struct ldb_request *re
 /* search */
 static int partition_search(struct ldb_module *module, struct ldb_request *req)
 {
-       int ret;
-       struct ldb_control **saved_controls;
        /* Find backend */
-       struct partition_private_data *data = talloc_get_type(module->private_data, 
+       struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
                                                              struct partition_private_data);
-
-       /* issue request */
-
-       /* (later) consider if we should be searching multiple
-        * partitions (for 'invisible' partition behaviour */
+       struct partition_context *ac;
+       struct ldb_context *ldb;
+       struct loadparm_context *lp_ctx;
 
        struct ldb_control *search_control = ldb_request_get_control(req, LDB_CONTROL_SEARCH_OPTIONS_OID);
        struct ldb_control *domain_scope_control = ldb_request_get_control(req, LDB_CONTROL_DOMAIN_SCOPE_OID);
+       struct ldb_control *no_gc_control = ldb_request_get_control(req, DSDB_CONTROL_NO_GLOBAL_CATALOG);
        
        struct ldb_search_options_control *search_options = NULL;
        struct dsdb_partition *p;
-       
-       ret = partition_reload_if_required(module, data);
-       if (ret != LDB_SUCCESS) {
-               return ret;
-       }
+       unsigned int i, j;
+       int ret;
+       bool domain_scope = false, phantom_root = false;
 
        p = find_partition(data, NULL, req);
        if (p != NULL) {
                /* the caller specified what partition they want the
                 * search - just pass it on
                 */
-               return ldb_next_request(p->module, req);                
+               return ldb_next_request(p->module, req);
        }
 
-
+       /* Get back the search options from the search control, and mark it as
+        * non-critical (to make backends and also dcpromo happy).
+        */
        if (search_control) {
                search_options = talloc_get_type(search_control->data, struct ldb_search_options_control);
+               search_control->critical = 0;
+
        }
 
-       /* Remove the domain_scope control, so we don't confuse a backend server */
-       if (domain_scope_control && !save_controls(domain_scope_control, req, &saved_controls)) {
-               ldb_oom(ldb_module_get_ctx(module));
-               return LDB_ERR_OPERATIONS_ERROR;
+       /* not initialised yet: pass the request to the next module */
+       if (!data || !data->partitions) {
+               return ldb_next_request(module, req);
        }
 
-       /*
-        * for now pass down the LDB_CONTROL_SEARCH_OPTIONS_OID control
-        * down as uncritical to make windows 2008 dcpromo happy.
-        */
-       if (search_control) {
-               search_control->critical = 0;
+       /* Special DNs without a specified partition go to the next module */
+       if (ldb_dn_is_special(req->op.search.base)) {
+               return ldb_next_request(module, req);
        }
 
-       /* TODO:
-          Generate referrals (look for a partition under this DN) if we don't have the above control specified
-       */
-       
-       if (search_options && (search_options->search_options & LDB_SEARCH_OPTION_PHANTOM_ROOT)) {
-               int i;
-               struct partition_context *ac;
-               if ((search_options->search_options & ~LDB_SEARCH_OPTION_PHANTOM_ROOT) == 0) {
-                       /* We have processed this flag, so we are done with this control now */
-
-                       /* Remove search control, so we don't confuse a backend server */
-                       if (search_control && !save_controls(search_control, req, &saved_controls)) {
-                               ldb_oom(ldb_module_get_ctx(module));
-                               return LDB_ERR_OPERATIONS_ERROR;
-                       }
-               }
-               ac = partition_init_ctx(module, req);
-               if (!ac) {
-                       return LDB_ERR_OPERATIONS_ERROR;
+       /* Locate the options */
+       domain_scope = (search_options
+               && (search_options->search_options & LDB_SEARCH_OPTION_DOMAIN_SCOPE))
+               || domain_scope_control;
+       phantom_root = search_options
+               && (search_options->search_options & LDB_SEARCH_OPTION_PHANTOM_ROOT);
+
+       /* Remove handled options from the search control flag */
+       if (search_options) {
+               search_options->search_options = search_options->search_options
+                       & ~LDB_SEARCH_OPTION_DOMAIN_SCOPE
+                       & ~LDB_SEARCH_OPTION_PHANTOM_ROOT;
+       }
+
+       ac = partition_init_ctx(module, req);
+       if (!ac) {
+               return ldb_operr(ldb_module_get_ctx(module));
+       }
+
+       ldb = ldb_module_get_ctx(ac->module);
+       lp_ctx = talloc_get_type(ldb_get_opaque(ldb, "loadparm"),
+                                               struct loadparm_context);
+
+       /* Search from the base DN */
+       if (ldb_dn_is_null(req->op.search.base)) {
+               if (!phantom_root) {
+                       return ldb_error(ldb, LDB_ERR_NO_SUCH_OBJECT, "empty base DN");
                }
+               return partition_send_all(module, ac, req);
+       }
 
-               /* Search from the base DN */
-               if (!req->op.search.base || ldb_dn_is_null(req->op.search.base)) {
-                       return partition_send_all(module, ac, req);
+       for (i=0; data->partitions[i]; i++) {
+               bool match = false, stop = false;
+
+               if (data->partitions[i]->partial_replica && no_gc_control != NULL) {
+                       if (ldb_dn_compare_base(data->partitions[i]->ctrl->dn,
+                                               req->op.search.base) == 0) {
+                               /* base DN is in a partial replica
+                                  with the NO_GLOBAL_CATALOG
+                                  control. This partition is invisible */
+                               /* DEBUG(0,("DENYING NON-GC OP: %s\n", ldb_module_call_chain(req, req))); */
+                               continue;
+                       }
                }
-               for (i=0; data && data->partitions && data->partitions[i]; i++) {
-                       bool match = false, stop = false;
-                       /* Find all partitions under the search base 
-                          
-                          we match if:
-
-                             1) the DN we are looking for exactly matches the partition
-                            or
-                             2) the DN we are looking for is a parent of the partition and it isn't
-                                 a scope base search
-                             or
-                             3) the DN we are looking for is a child of the partition
+
+               if (phantom_root) {
+                       /* Phantom root: Find all partitions under the
+                        * search base. We match if:
+                        *
+                        * 1) the DN we are looking for exactly matches a
+                        *    certain partition and always stop
+                        * 2) the DN we are looking for is a parent of certain
+                        *    partitions and it isn't a scope base search
+                        * 3) the DN we are looking for is a child of a certain
+                        *    partition and always stop
+                        *    - we don't need to go any further up in the
+                        *    hierarchy!
                         */
-                       if (ldb_dn_compare(data->partitions[i]->ctrl->dn, req->op.search.base) == 0) {
+                       if (ldb_dn_compare(data->partitions[i]->ctrl->dn,
+                                          req->op.search.base) == 0) {
                                match = true;
-                               if (req->op.search.scope == LDB_SCOPE_BASE) {
-                                       stop = true;
-                               }
+                               stop = true;
                        }
-                       if (!match && 
-                           (ldb_dn_compare_base(req->op.search.base, data->partitions[i]->ctrl->dn) == 0 &&
+                       if (!match &&
+                           (ldb_dn_compare_base(req->op.search.base,
+                                                data->partitions[i]->ctrl->dn) == 0 &&
                             req->op.search.scope != LDB_SCOPE_BASE)) {
                                match = true;
                        }
                        if (!match &&
-                           ldb_dn_compare_base(data->partitions[i]->ctrl->dn, req->op.search.base) == 0) {
+                           ldb_dn_compare_base(data->partitions[i]->ctrl->dn,
+                                               req->op.search.base) == 0) {
                                match = true;
                                stop = true; /* note that this relies on partition ordering */
                        }
-                       if (match) {
-                               ret = partition_prep_request(ac, data->partitions[i]);
-                               if (ret != LDB_SUCCESS) {
-                                       return ret;
+               } else {
+                       /* Domain scope: Find all partitions under the search
+                        * base.
+                        *
+                        * We generate referral candidates if domain scope was
+                        * not requested, the search scope is not base, and the
+                        * search base DN is a strict ancestor of the partition
+                        * DN. When a new referral candidate is nearer to the
+                        * search base than an existing one, the existing one is
+                        * deleted (we want to keep only the closest ones). Once
+                        * all partitions have been checked, the remaining
+                        * candidates are the final referrals.
+                        *
+                        * We match if the DN we are looking for is a child of
+                        * a certain partition or the partition
+                        * DN itself - we don't need to go any further
+                        * up in the hierarchy!
+                        */
+                       if ((!domain_scope) &&
+                           (req->op.search.scope != LDB_SCOPE_BASE) &&
+                           (ldb_dn_compare_base(req->op.search.base,
+                                                data->partitions[i]->ctrl->dn) == 0) &&
+                           (ldb_dn_compare(req->op.search.base,
+                                           data->partitions[i]->ctrl->dn) != 0)) {
+                               const char *scheme = ldb_get_opaque(
+                                   ldb, LDAP_REFERRAL_SCHEME_OPAQUE);
+                               char *ref = talloc_asprintf(
+                                       ac,
+                                       "%s://%s/%s%s",
+                                       scheme == NULL ? "ldap" : scheme,
+                                       lpcfg_dnsdomain(lp_ctx),
+                                       ldb_dn_get_linearized(
+                                           data->partitions[i]->ctrl->dn),
+                                       req->op.search.scope ==
+                                           LDB_SCOPE_ONELEVEL ? "??base" : "");
+
+                               if (ref == NULL) {
+                                       return ldb_oom(ldb);
                                }
-                       }
-                       if (stop) break;
-               }
 
-               /* Perhaps we didn't match any partitions.  Try the main partition, only */
-               if (ac->num_requests == 0) {
-                       talloc_free(ac);
-                       return ldb_next_request(module, req);
-               }
+                               /* Initialise the referrals list */
+                               if (ac->referrals == NULL) {
+                                       char **l = str_list_make_empty(ac);
+                                       ac->referrals = discard_const_p(const char *, l);
+                                       if (ac->referrals == NULL) {
+                                               return ldb_oom(ldb);
+                                       }
+                               }
+
+                               /* Check if the new referral candidate is
+                                * closer to the base DN than already
+                                * saved ones and delete those */
+                               j = 0;
+                               while (ac->referrals[j] != NULL) {
+                                       if (strstr(ac->referrals[j],
+                                                  ldb_dn_get_linearized(data->partitions[i]->ctrl->dn)) != NULL) {
+                                               str_list_remove(ac->referrals,
+                                                               ac->referrals[j]);
+                                       } else {
+                                               ++j;
+                                       }
+                               }
 
-               /* fire the first one */
-               return partition_call_first(ac);
+                               /* Add our new candidate */
+                               ac->referrals = str_list_add(ac->referrals, ref);
 
-       } else {
-               /* Handle this like all other requests */
-               if (search_control && (search_options->search_options & ~LDB_SEARCH_OPTION_PHANTOM_ROOT) == 0) {
-                       /* We have processed this flag, so we are done with this control now */
-
-                       /* Remove search control, so we don't confuse a backend server */
-                       if (search_control && !save_controls(search_control, req, &saved_controls)) {
-                               ldb_oom(ldb_module_get_ctx(module));
-                               return LDB_ERR_OPERATIONS_ERROR;
+                               talloc_free(ref);
+
+                               if (ac->referrals == NULL) {
+                                       return ldb_oom(ldb);
+                               }
+                       }
+                       if (ldb_dn_compare_base(data->partitions[i]->ctrl->dn, req->op.search.base) == 0) {
+                               match = true;
+                               stop = true; /* note that this relies on partition ordering */
                        }
                }
 
-               return partition_replicate(module, req, req->op.search.base);
+               if (match) {
+                       ret = partition_prep_request(ac, data->partitions[i]);
+                       if (ret != LDB_SUCCESS) {
+                               return ret;
+                       }
+               }
+
+               if (stop) break;
        }
+
+       /* Perhaps we didn't match any partitions. Try the main partition */
+       if (ac->num_requests == 0) {
+               talloc_free(ac);
+               return ldb_next_request(module, req);
+       }
+
+       /* fire the first one */
+       return partition_call_first(ac);
 }
 
 /* add */
@@ -625,12 +987,12 @@ static int partition_rename(struct ldb_module *module, struct ldb_request *req)
        /* Find backend */
        struct dsdb_partition *backend, *backend2;
        
-       struct partition_private_data *data = talloc_get_type(module->private_data, 
+       struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
                                                              struct partition_private_data);
 
        /* Skip the lot if 'data' isn't here yet (initialisation) */
        if (!data) {
-               return LDB_ERR_OPERATIONS_ERROR;
+               return ldb_operr(ldb_module_get_ctx(module));
        }
 
        backend = find_partition(data, req->op.rename.olddn, req);
@@ -655,278 +1017,337 @@ static int partition_rename(struct ldb_module *module, struct ldb_request *req)
 }
 
 /* start a transaction */
-static int partition_start_trans(struct ldb_module *module)
+int partition_start_trans(struct ldb_module *module)
 {
-       int i, ret;
-       struct partition_private_data *data = talloc_get_type(module->private_data, 
+       int i = 0;
+       int ret = 0;
+       struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
                                                              struct partition_private_data);
        /* Look at base DN */
        /* Figure out which partition it is under */
        /* Skip the lot if 'data' isn't here yet (initialization) */
+       if (ldb_module_flags(ldb_module_get_ctx(module)) & LDB_FLG_ENABLE_TRACING) {
+               ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_start_trans() -> (metadata partition)");
+       }
+
+       /*
+        * We start a transaction on metadata.tdb first and end it last in
+        * end_trans. This makes locking semantics follow TDB rather than MDB,
+        * and effectively locks all partitions at once.
+        * Detail:
+        * Samba AD is special in that the partitions module (this file)
+        * combines multiple independently locked databases into one overall
+        * transaction. Changes across multiple partition DBs in a single
+        * transaction must ALL be either visible or invisible.
+        * The way this is achieved is by taking out a write lock on
+        * metadata.tdb at the start of prepare_commit, while unlocking it at
+        * the end of end_trans. This is matched by read_lock, ensuring it
+        * can't progress until that write lock is released.
+        *
+        * metadata.tdb needs to be a TDB file because MDB uses independent
+        * locks, which means a read lock and a write lock can be held at the
+        * same time, whereas in TDB, the two locks block each other. The TDB
+        * behaviour is required to implement the functionality described
+        * above.
+        *
+        * An important additional detail here is that if prepare_commit is
+        * called on a TDB without any changes being made, no write lock is
+        * taken. We address this by storing a sequence number in metadata.tdb
+        * which is updated every time a replicated attribute is modified.
+        * The possibility of a few unreplicated attributes being out of date
+        * turns out not to be a problem.
+        * For this reason, a lock on sam.ldb (which is a TDB) won't achieve
+        * the same end as locking metadata.tdb, unless we made a modification
+        * to the @ records found there before every prepare_commit.
+        */
+       ret = partition_metadata_start_trans(module);
+       if (ret != LDB_SUCCESS) {
+               return ret;
+       }
+
        ret = ldb_next_start_trans(module);
        if (ret != LDB_SUCCESS) {
+               partition_metadata_del_trans(module);
                return ret;
        }
 
-       ret = partition_reload_if_required(module, data);
+       ret = partition_reload_if_required(module, data, NULL);
        if (ret != LDB_SUCCESS) {
+               ldb_next_del_trans(module);
+               partition_metadata_del_trans(module);
                return ret;
        }
 
+       /*
+        * The following per partition locks are required mostly because TDB
+        * and MDB require locks before read and write ops are permitted.
+        */
        for (i=0; data && data->partitions && data->partitions[i]; i++) {
-               struct ldb_module *next = data->partitions[i]->module;
-               PARTITION_FIND_OP(next, start_transaction);
-
-               ret = next->ops->start_transaction(next);
+               if ((module && ldb_module_flags(ldb_module_get_ctx(module)) & LDB_FLG_ENABLE_TRACING)) {
+                       ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_start_trans() -> %s",
+                                 ldb_dn_get_linearized(data->partitions[i]->ctrl->dn));
+               }
+               ret = ldb_next_start_trans(data->partitions[i]->module);
                if (ret != LDB_SUCCESS) {
                        /* Back it out, if it fails on one */
                        for (i--; i >= 0; i--) {
-                               next = data->partitions[i]->module;
-                               PARTITION_FIND_OP(next, del_transaction);
-
-                               next->ops->del_transaction(next);
+                               ldb_next_del_trans(data->partitions[i]->module);
                        }
                        ldb_next_del_trans(module);
+                       partition_metadata_del_trans(module);
                        return ret;
                }
        }
+
+       data->in_transaction++;
+
        return LDB_SUCCESS;
 }
 
 /* prepare for a commit */
-static int partition_prepare_commit(struct ldb_module *module)
+int partition_prepare_commit(struct ldb_module *module)
 {
-       int i;
-       struct partition_private_data *data = talloc_get_type(module->private_data, 
+       unsigned int i;
+       struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
                                                              struct partition_private_data);
+       int ret;
 
-       for (i=0; data && data->partitions && data->partitions[i]; i++) {
-               struct ldb_module *next_prepare = data->partitions[i]->module;
-               int ret;
+       /*
+        * Order of prepare_commit calls must match that in
+        * partition_start_trans. See comment in that function for detail.
+        */
+       ret = partition_metadata_prepare_commit(module);
+       if (ret != LDB_SUCCESS) {
+               return ret;
+       }
 
-               PARTITION_FIND_OP_NOERROR(next_prepare, prepare_commit);
-               if (next_prepare == NULL) {
-                       continue;
-               }
+       ret = ldb_next_prepare_commit(module);
+       if (ret != LDB_SUCCESS) {
+               return ret;
+       }
 
-               ret = next_prepare->ops->prepare_commit(next_prepare);
+       for (i=0; data && data->partitions && data->partitions[i]; i++) {
+               if ((module && ldb_module_flags(ldb_module_get_ctx(module)) & LDB_FLG_ENABLE_TRACING)) {
+                       ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_prepare_commit() -> %s",
+                                 ldb_dn_get_linearized(data->partitions[i]->ctrl->dn));
+               }
+               ret = ldb_next_prepare_commit(data->partitions[i]->module);
                if (ret != LDB_SUCCESS) {
+                       ldb_asprintf_errstring(ldb_module_get_ctx(module), "prepare_commit error on %s: %s",
+                                              ldb_dn_get_linearized(data->partitions[i]->ctrl->dn),
+                                              ldb_errstring(ldb_module_get_ctx(module)));
                        return ret;
                }
        }
 
-       return ldb_next_prepare_commit(module);
+       if ((module && ldb_module_flags(ldb_module_get_ctx(module)) & LDB_FLG_ENABLE_TRACING)) {
+               ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_prepare_commit() -> (metadata partition)");
+       }
+
+       return LDB_SUCCESS;
 }
 
 
 /* end a transaction */
-static int partition_end_trans(struct ldb_module *module)
+int partition_end_trans(struct ldb_module *module)
 {
+       int ret, ret2;
        int i;
-       struct partition_private_data *data = talloc_get_type(module->private_data, 
+       struct ldb_context *ldb = ldb_module_get_ctx(module);
+       struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
                                                              struct partition_private_data);
-       for (i=0; data && data->partitions && data->partitions[i]; i++) {
-               struct ldb_module *next_end = data->partitions[i]->module;
-               int ret;
+       bool trace = module && ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING;
 
-               PARTITION_FIND_OP(next_end, end_transaction);
+       ret = LDB_SUCCESS;
 
-               ret = next_end->ops->end_transaction(next_end);
-               if (ret != LDB_SUCCESS) {
-                       return ret;
+       if (data->in_transaction == 0) {
+               DEBUG(0,("partition end transaction mismatch\n"));
+               ret = LDB_ERR_OPERATIONS_ERROR;
+       } else {
+               data->in_transaction--;
+       }
+
+       /*
+        * Order of end_trans calls must be the reverse of that in
+        * partition_start_trans. See comment in that function for detail.
+        */
+       if (data && data->partitions) {
+               /* Just counting the partitions */
+               for (i=0; data->partitions[i]; i++) {}
+
+               /* now walk them backwards */
+               for (i--; i>=0; i--) {
+                       struct dsdb_partition *p = data->partitions[i];
+                       if (trace) {
+                               ldb_debug(ldb,
+                                         LDB_DEBUG_TRACE,
+                                         "partition_end_trans() -> %s",
+                                         ldb_dn_get_linearized(p->ctrl->dn));
+                       }
+                       ret2 = ldb_next_end_trans(p->module);
+                       if (ret2 != LDB_SUCCESS) {
+                               ldb_asprintf_errstring(ldb,
+                                       "end_trans error on %s: %s",
+                                       ldb_dn_get_linearized(p->ctrl->dn),
+                                       ldb_errstring(ldb));
+                               ret = ret2;
+                       }
                }
        }
 
-       return ldb_next_end_trans(module);
+       if (trace) {
+               ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_end_trans() -> (metadata partition)");
+       }
+       ret2 = ldb_next_end_trans(module);
+       if (ret2 != LDB_SUCCESS) {
+               ret = ret2;
+       }
+
+       ret2 = partition_metadata_end_trans(module);
+       if (ret2 != LDB_SUCCESS) {
+               ret = ret2;
+       }
+
+       return ret;
 }
 
 /* delete a transaction */
-static int partition_del_trans(struct ldb_module *module)
+int partition_del_trans(struct ldb_module *module)
 {
-       int i, ret, final_ret = LDB_SUCCESS;
-       struct partition_private_data *data = talloc_get_type(module->private_data, 
+       int ret, final_ret = LDB_SUCCESS;
+       int i;
+       struct ldb_context *ldb = ldb_module_get_ctx(module);
+       struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
                                                              struct partition_private_data);
-       for (i=0; data && data->partitions && data->partitions[i]; i++) {
-               struct ldb_module *next = data->partitions[i]->module;
-               PARTITION_FIND_OP(next, del_transaction);
+       bool trace = module && ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING;
 
-               ret = next->ops->del_transaction(next);
-               if (ret != LDB_SUCCESS) {
-                       final_ret = ret;
+       if (data == NULL) {
+               DEBUG(0,("partion delete transaction with no private data\n"));
+               return ldb_operr(ldb);
+       }
+
+       /*
+        * Order of del_trans calls must be the reverse of that in
+        * partition_start_trans. See comment in that function for detail.
+        */
+       if (data->partitions) {
+               /* Just counting the partitions */
+               for (i=0; data->partitions[i]; i++) {}
+
+               /* now walk them backwards */
+               for (i--; i>=0; i--) {
+                       struct dsdb_partition *p = data->partitions[i];
+                       if (trace) {
+                               ldb_debug(ldb,
+                                         LDB_DEBUG_TRACE,
+                                         "partition_del_trans() -> %s",
+                                         ldb_dn_get_linearized(p->ctrl->dn));
+                       }
+                       ret = ldb_next_del_trans(p->module);
+                       if (ret != LDB_SUCCESS) {
+                               ldb_asprintf_errstring(ldb,
+                                       "del_trans error on %s: %s",
+                                       ldb_dn_get_linearized(p->ctrl->dn),
+                                       ldb_errstring(ldb));
+                               final_ret = ret;
+                       }
                }
-       }       
+       }
 
+       if (trace) {
+               ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_del_trans() -> (metadata partition)");
+       }
        ret = ldb_next_del_trans(module);
        if (ret != LDB_SUCCESS) {
                final_ret = ret;
        }
+
+       ret = partition_metadata_del_trans(module);
+       if (ret != LDB_SUCCESS) {
+               final_ret = ret;
+       }
+
+       if (data->in_transaction == 0) {
+               DEBUG(0,("partition del transaction mismatch\n"));
+               return ldb_operr(ldb_module_get_ctx(module));
+       }
+       data->in_transaction--;
+
        return final_ret;
 }
 
 int partition_primary_sequence_number(struct ldb_module *module, TALLOC_CTX *mem_ctx, 
-                                    enum ldb_sequence_type type, uint64_t *seq_number) 
+                                     uint64_t *seq_number,
+                                     struct ldb_request *parent)
 {
        int ret;
        struct ldb_result *res;
        struct ldb_seqnum_request *tseq;
-       struct ldb_request *treq;
        struct ldb_seqnum_result *seqr;
-       res = talloc_zero(mem_ctx, struct ldb_result);
-       if (res == NULL) {
-               return LDB_ERR_OPERATIONS_ERROR;
-       }
-       tseq = talloc_zero(res, struct ldb_seqnum_request);
+
+       tseq = talloc_zero(mem_ctx, struct ldb_seqnum_request);
        if (tseq == NULL) {
-               talloc_free(res);
-               return LDB_ERR_OPERATIONS_ERROR;
+               return ldb_oom(ldb_module_get_ctx(module));
        }
-       tseq->type = type;
+       tseq->type = LDB_SEQ_HIGHEST_SEQ;
        
-       ret = ldb_build_extended_req(&treq, ldb_module_get_ctx(module), res,
-                                    LDB_EXTENDED_SEQUENCE_NUMBER,
-                                    tseq,
-                                    NULL,
-                                    res,
-                                    ldb_extended_default_callback,
-                                    NULL);
+       ret = dsdb_module_extended(module, tseq, &res,
+                                  LDB_EXTENDED_SEQUENCE_NUMBER,
+                                  tseq,
+                                  DSDB_FLAG_NEXT_MODULE,
+                                  parent);
        if (ret != LDB_SUCCESS) {
-               talloc_free(res);
+               talloc_free(tseq);
                return ret;
        }
        
-       ret = ldb_next_request(module, treq);
-       if (ret != LDB_SUCCESS) {
-               talloc_free(res);
-               return ret;
-       }
-       ret = ldb_wait(treq->handle, LDB_WAIT_ALL);
-       if (ret != LDB_SUCCESS) {
-               talloc_free(res);
-               return ret;
-       }
-       
-       seqr = talloc_get_type(res->extended->data,
-                              struct ldb_seqnum_result);
+       seqr = talloc_get_type_abort(res->extended->data,
+                                    struct ldb_seqnum_result);
        if (seqr->flags & LDB_SEQ_TIMESTAMP_SEQUENCE) {
-               ret = LDB_ERR_OPERATIONS_ERROR;
-               ldb_set_errstring(ldb_module_get_ctx(module), "Primary backend in partitions module returned a timestamp based seq number (must return a normal number)");
                talloc_free(res);
-               return ret;
-       } else {
-               *seq_number = seqr->seq_num;
+               return ldb_module_error(module, LDB_ERR_OPERATIONS_ERROR,
+                       "Primary backend in partition module returned a timestamp based seq");
        }
-       talloc_free(res);
+
+       *seq_number = seqr->seq_num;
+       talloc_free(tseq);
        return LDB_SUCCESS;
 }
 
-/* FIXME: This function is still semi-async */
-static int partition_sequence_number(struct ldb_module *module, struct ldb_request *req)
+
+/*
+ * Older version of sequence number as sum of sequence numbers for each partition
+ */
+int partition_sequence_number_from_partitions(struct ldb_module *module,
+                                             uint64_t *seqr)
 {
-       int i, ret;
+       int ret;
+       unsigned int i;
        uint64_t seq_number = 0;
-       uint64_t timestamp_sequence = 0;
-       uint64_t timestamp = 0;
-       struct partition_private_data *data = talloc_get_type(module->private_data, 
+       struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
                                                              struct partition_private_data);
-       struct ldb_seqnum_request *seq;
-       struct ldb_seqnum_result *seqr;
-       struct ldb_request *treq;
-       struct ldb_seqnum_request *tseq;
-       struct ldb_seqnum_result *tseqr;
-       struct ldb_extended *ext;
-       struct ldb_result *res;
-       struct dsdb_partition *p;
 
-       p = find_partition(data, NULL, req);
-       if (p != NULL) {
-               /* the caller specified what partition they want the
-                * sequence number operation on - just pass it on
-                */
-               return ldb_next_request(p->module, req);                
+       ret = partition_primary_sequence_number(module, data, &seq_number, NULL);
+       if (ret != LDB_SUCCESS) {
+               return ret;
        }
-
-       seq = talloc_get_type(req->op.extended.data, struct ldb_seqnum_request);
-
-       switch (seq->type) {
-       case LDB_SEQ_NEXT:
-       case LDB_SEQ_HIGHEST_SEQ:
-
-               ret = partition_primary_sequence_number(module, req, seq->type, &seq_number);
-               if (ret != LDB_SUCCESS) {
-                       return ret;
-               }
-
-               /* Skip the lot if 'data' isn't here yet (initialisation) */
-               for (i=0; data && data->partitions && data->partitions[i]; i++) {
-
-                       res = talloc_zero(req, struct ldb_result);
-                       if (res == NULL) {
-                               return LDB_ERR_OPERATIONS_ERROR;
-                       }
-                       tseq = talloc_zero(res, struct ldb_seqnum_request);
-                       if (tseq == NULL) {
-                               talloc_free(res);
-                               return LDB_ERR_OPERATIONS_ERROR;
-                       }
-                       tseq->type = seq->type;
-
-                       ret = ldb_build_extended_req(&treq, ldb_module_get_ctx(module), res,
-                                                    LDB_EXTENDED_SEQUENCE_NUMBER,
-                                                    tseq,
-                                                    NULL,
-                                                    res,
-                                                    ldb_extended_default_callback,
-                                                    NULL);
-                       if (ret != LDB_SUCCESS) {
-                               talloc_free(res);
-                               return ret;
-                       }
-
-                       if (!ldb_request_get_control(treq, DSDB_CONTROL_CURRENT_PARTITION_OID)) {
-                               ret = ldb_request_add_control(treq,
-                                                             DSDB_CONTROL_CURRENT_PARTITION_OID,
-                                                             false, data->partitions[i]->ctrl);
-                               if (ret != LDB_SUCCESS) {
-                                       talloc_free(res);
-                                       return ret;
-                               }
-                       }
-
-                       ret = partition_request(data->partitions[i]->module, treq);
-                       if (ret != LDB_SUCCESS) {
-                               talloc_free(res);
-                               return ret;
-                       }
-                       ret = ldb_wait(treq->handle, LDB_WAIT_ALL);
-                       if (ret != LDB_SUCCESS) {
-                               talloc_free(res);
-                               return ret;
-                       }
-                       tseqr = talloc_get_type(res->extended->data,
-                                               struct ldb_seqnum_result);
-                       if (tseqr->flags & LDB_SEQ_TIMESTAMP_SEQUENCE) {
-                               timestamp_sequence = MAX(timestamp_sequence,
-                                                        tseqr->seq_num);
-                       } else {
-                               seq_number += tseqr->seq_num;
-                       }
-                       talloc_free(res);
-               }
-               /* fall through */
-       case LDB_SEQ_HIGHEST_TIMESTAMP:
-
-               res = talloc_zero(req, struct ldb_result);
+       
+       /* Skip the lot if 'data' isn't here yet (initialisation) */
+       for (i=0; data && data->partitions && data->partitions[i]; i++) {
+               struct ldb_seqnum_request *tseq;
+               struct ldb_seqnum_result *tseqr;
+               struct ldb_request *treq;
+               struct ldb_result *res = talloc_zero(data, struct ldb_result);
                if (res == NULL) {
-                       return LDB_ERR_OPERATIONS_ERROR;
+                       return ldb_oom(ldb_module_get_ctx(module));
                }
-
                tseq = talloc_zero(res, struct ldb_seqnum_request);
                if (tseq == NULL) {
                        talloc_free(res);
-                       return LDB_ERR_OPERATIONS_ERROR;
+                       return ldb_oom(ldb_module_get_ctx(module));
                }
-               tseq->type = LDB_SEQ_HIGHEST_TIMESTAMP;
-
+               tseq->type = LDB_SEQ_HIGHEST_SEQ;
+               
                ret = ldb_build_extended_req(&treq, ldb_module_get_ctx(module), res,
                                             LDB_EXTENDED_SEQUENCE_NUMBER,
                                             tseq,
@@ -934,12 +1355,13 @@ static int partition_sequence_number(struct ldb_module *module, struct ldb_reque
                                             res,
                                             ldb_extended_default_callback,
                                             NULL);
+               LDB_REQ_SET_LOCATION(treq);
                if (ret != LDB_SUCCESS) {
                        talloc_free(res);
                        return ret;
                }
-
-               ret = ldb_next_request(module, treq);
+               
+               ret = partition_request(data->partitions[i]->module, treq);
                if (ret != LDB_SUCCESS) {
                        talloc_free(res);
                        return ret;
@@ -949,175 +1371,307 @@ static int partition_sequence_number(struct ldb_module *module, struct ldb_reque
                        talloc_free(res);
                        return ret;
                }
-
                tseqr = talloc_get_type(res->extended->data,
-                                          struct ldb_seqnum_result);
-               timestamp = tseqr->seq_num;
-
+                                       struct ldb_seqnum_result);
+               seq_number += tseqr->seq_num;
                talloc_free(res);
+       }
 
-               /* Skip the lot if 'data' isn't here yet (initialisation) */
-               for (i=0; data && data->partitions && data->partitions[i]; i++) {
-
-                       res = talloc_zero(req, struct ldb_result);
-                       if (res == NULL) {
-                               return LDB_ERR_OPERATIONS_ERROR;
-                       }
-
-                       tseq = talloc_zero(res, struct ldb_seqnum_request);
-                       if (tseq == NULL) {
-                               talloc_free(res);
-                               return LDB_ERR_OPERATIONS_ERROR;
-                       }
-                       tseq->type = LDB_SEQ_HIGHEST_TIMESTAMP;
-
-                       ret = ldb_build_extended_req(&treq, ldb_module_get_ctx(module), res,
-                                                    LDB_EXTENDED_SEQUENCE_NUMBER,
-                                                    tseq,
-                                                    NULL,
-                                                    res,
-                                                    ldb_extended_default_callback,
-                                                    NULL);
-                       if (ret != LDB_SUCCESS) {
-                               talloc_free(res);
-                               return ret;
-                       }
-
-                       if (!ldb_request_get_control(treq, DSDB_CONTROL_CURRENT_PARTITION_OID)) {
-                               ret = ldb_request_add_control(treq,
-                                                             DSDB_CONTROL_CURRENT_PARTITION_OID,
-                                                             false, data->partitions[i]->ctrl);
-                               if (ret != LDB_SUCCESS) {
-                                       talloc_free(res);
-                                       return ret;
-                               }
-                       }
+       *seqr = seq_number;
+       return LDB_SUCCESS;
+}
 
-                       ret = partition_request(data->partitions[i]->module, treq);
-                       if (ret != LDB_SUCCESS) {
-                               talloc_free(res);
-                               return ret;
-                       }
-                       ret = ldb_wait(treq->handle, LDB_WAIT_ALL);
-                       if (ret != LDB_SUCCESS) {
-                               talloc_free(res);
-                               return ret;
-                       }
 
-                       tseqr = talloc_get_type(res->extended->data,
-                                                 struct ldb_seqnum_result);
-                       timestamp = MAX(timestamp, tseqr->seq_num);
+/* Newer version of sequence number using metadata tdb.
+ * LDB_SEQ_NEXT calls partition_metadata_sequence_number_increment(), LDB_SEQ_HIGHEST_SEQ calls partition_metadata_sequence_number(), LDB_SEQ_HIGHEST_TIMESTAMP is rejected with LDB_ERR_OPERATIONS_ERROR.
+ * The value is returned in an ldb_seqnum_result flagged LDB_SEQ_GLOBAL_SEQUENCE. NOTE(review): the switch has no default, so seq_number stays unset for any other seq->type - confirm unreachable */
+static int partition_sequence_number(struct ldb_module *module, struct ldb_request *req)
+{
+       struct ldb_extended *ext;
+       struct ldb_seqnum_request *seq;
+       struct ldb_seqnum_result *seqr;
+       uint64_t seq_number;
+       int ret;
 
-                       talloc_free(res);
+       seq = talloc_get_type_abort(req->op.extended.data, struct ldb_seqnum_request);
+       switch (seq->type) {
+       case LDB_SEQ_NEXT:
+               ret = partition_metadata_sequence_number_increment(module, &seq_number);
+               if (ret != LDB_SUCCESS) {
+                       return ret;
                }
+               break;
 
+       case LDB_SEQ_HIGHEST_SEQ:
+               ret = partition_metadata_sequence_number(module, &seq_number);
+               if (ret != LDB_SUCCESS) {
+                       return ret;
+               }
                break;
+
+       case LDB_SEQ_HIGHEST_TIMESTAMP:
+               return ldb_module_error(module, LDB_ERR_OPERATIONS_ERROR,
+                                       "LDB_SEQ_HIGHEST_TIMESTAMP not supported");
        }
 
        ext = talloc_zero(req, struct ldb_extended);
        if (!ext) {
-               return LDB_ERR_OPERATIONS_ERROR;
+               return ldb_module_oom(module);
        }
        seqr = talloc_zero(ext, struct ldb_seqnum_result);
        if (seqr == NULL) {
                talloc_free(ext);
-               return LDB_ERR_OPERATIONS_ERROR;
+               return ldb_module_oom(module);
        }
        ext->oid = LDB_EXTENDED_SEQUENCE_NUMBER;
        ext->data = seqr;
 
-       switch (seq->type) {
-       case LDB_SEQ_NEXT:
-       case LDB_SEQ_HIGHEST_SEQ:
+       seqr->seq_num = seq_number;
+       seqr->flags |= LDB_SEQ_GLOBAL_SEQUENCE;
 
-               /* Has someone above set a timebase sequence? */
-               if (timestamp_sequence) {
-                       seqr->seq_num = (((unsigned long long)timestamp << 24) | (seq_number & 0xFFFFFF));
-               } else {
-                       seqr->seq_num = seq_number;
-               }
+       /* send request done: ext (with seqr allocated underneath it) is passed to ldb_module_done() along with LDB_SUCCESS */
+       return ldb_module_done(req, NULL, ext, LDB_SUCCESS);
+}
+
+/* lock all the backends: metadata first, then the next module, then each partition in array order; returns LDB_SUCCESS or the code of the first call that failed */
+int partition_read_lock(struct ldb_module *module)
+{
+       int i = 0;
+       int ret = 0;
+       int ret2 = 0;
+       struct ldb_context *ldb = ldb_module_get_ctx(module);
+       struct partition_private_data *data = \
+               talloc_get_type(ldb_module_get_private(module),
+                               struct partition_private_data);
+
+       if (ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING) {
+               ldb_debug(ldb, LDB_DEBUG_TRACE,
+                         "partition_read_lock() -> (metadata partition)");
+       }
+
+       /*
+        * It is important to only do this for LOCK because:
+        * - we don't want to unlock what we did not lock
+        *
+        * - we don't want to make a new lock on the sam.ldb
+        *   (triggered inside this routine due to the seq num check)
+        *   during an unlock phase as that will violate the lock
+        *   ordering
+        */
+
+       if (data == NULL) {
+               TALLOC_CTX *mem_ctx = talloc_new(module);
 
-               if (timestamp_sequence > seqr->seq_num) {
-                       seqr->seq_num = timestamp_sequence;
-                       seqr->flags |= LDB_SEQ_TIMESTAMP_SEQUENCE;
+               data = talloc_zero(mem_ctx, struct partition_private_data);
+               if (data == NULL) {
+                       talloc_free(mem_ctx);
+                       return ldb_operr(ldb);
                }
 
-               seqr->flags |= LDB_SEQ_GLOBAL_SEQUENCE;
-               break;
-       case LDB_SEQ_HIGHEST_TIMESTAMP:
-               seqr->seq_num = timestamp;
-               break;
+               /*
+                * When used from Samba4, this message is set by the
+                * samba4 module, as a fixed value not read from the
+                * DB.  This avoids listing modules in the DB
+                */
+               data->forced_module_msg = talloc_get_type(
+                       ldb_get_opaque(ldb,
+                                      DSDB_OPAQUE_PARTITION_MODULE_MSG_OPAQUE_NAME),
+                       struct ldb_message);
+
+               ldb_module_set_private(module, talloc_steal(module,
+                                                           data));
+               talloc_free(mem_ctx);
        }
 
-       if (seq->type == LDB_SEQ_NEXT) {
-               seqr->seq_num++;
+       /*
+        * This will lock sam.ldb and will also call event loops,
+        * so we do it before we get the whole db lock.
+        */
+       ret = partition_reload_if_required(module, data, NULL);
+       if (ret != LDB_SUCCESS) {
+               return ret;
        }
 
-       /* send request done */
-       return ldb_module_done(req, NULL, ext, LDB_SUCCESS);
-}
+       /*
+        * Order of read_lock calls must match that in partition_start_trans.
+        * See comment in that function for detail.
+        */
+       ret = partition_metadata_read_lock(module);
+       if (ret != LDB_SUCCESS) {
+               goto failed;
+       }
 
-static int partition_extended_schema_update_now(struct ldb_module *module, struct ldb_request *req)
-{
-       struct dsdb_partition *partition;
-       struct partition_private_data *data;
-       struct ldb_dn *schema_dn;
-       struct partition_context *ac;
-       int ret;
+       /*
+        * The top level DB (sam.ldb) lock is not enough to block another
+        * process in prepare_commit(), because if nothing was changed in the
+        * specific backend, then prepare_commit() is a no-op. Therefore the
+        * metadata.tdb lock is taken out above, as it is the best we can do
+        * right now. NOTE(review): the failure branch below returns without calling partition_metadata_read_unlock() - confirm intended.
+        */
+       ret = ldb_next_read_lock(module);
+       if (ret != LDB_SUCCESS) {
+               ldb_debug_set(ldb,
+                             LDB_DEBUG_FATAL,
+                             "Failed to lock db: %s / %s for metadata partition",
+                             ldb_errstring(ldb),
+                             ldb_strerror(ret));
 
-       schema_dn = talloc_get_type(req->op.extended.data, struct ldb_dn);
-       if (!schema_dn) {
-               ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_FATAL, "partition_extended: invalid extended data\n");
-               return LDB_ERR_PROTOCOL_ERROR;
+               return ret;
        }
 
-       data = talloc_get_type(module->private_data, struct partition_private_data);
-       if (!data) {
-               return LDB_ERR_OPERATIONS_ERROR;
+       /*
+        * The following per partition locks are required mostly because TDB
+        * and MDB require locks before reads are permitted.
+        */
+       for (i=0; data && data->partitions && data->partitions[i]; i++) {
+               if ((module && ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING)) {
+                       ldb_debug(ldb, LDB_DEBUG_TRACE,
+                                 "partition_read_lock() -> %s",
+                                 ldb_dn_get_linearized(
+                                         data->partitions[i]->ctrl->dn));
+               }
+               ret = ldb_next_read_lock(data->partitions[i]->module);
+               if (ret == LDB_SUCCESS) {
+                       continue;
+               }
+
+               ldb_debug_set(ldb,
+                             LDB_DEBUG_FATAL,
+                             "Failed to lock db: %s / %s for %s",
+                             ldb_errstring(ldb),
+                             ldb_strerror(ret),
+                             ldb_dn_get_linearized(
+                                     data->partitions[i]->ctrl->dn));
+
+               goto failed;
        }
-       
-       partition = find_partition( data, schema_dn, req);
-       if (!partition) {
-               return ldb_next_request(module, req);
+
+       return LDB_SUCCESS;
+
+failed:
+       /* Back out partitions [0, i), then the next module. NOTE(review): also reached (with i == 0) when partition_metadata_read_lock() fails, i.e. before ldb_next_read_lock(module) ran; partition_metadata_read_unlock() is not called here - confirm */
+       for (i--; i >= 0; i--) {
+               ret2 = ldb_next_read_unlock(data->partitions[i]->module);
+               if (ret2 != LDB_SUCCESS) {
+                       ldb_debug(ldb,
+                                 LDB_DEBUG_FATAL,
+                                 "Failed to unlock db: %s / %s",
+                                 ldb_errstring(ldb),
+                                 ldb_strerror(ret2));
+               }
        }
+       ret2 = ldb_next_read_unlock(module);
+       if (ret2 != LDB_SUCCESS) {
+               ldb_debug(ldb,
+                         LDB_DEBUG_FATAL,
+                         "Failed to unlock db: %s / %s",
+                         ldb_errstring(ldb),
+                         ldb_strerror(ret2));
+       }
+       return ret;
+}
 
-       ac = partition_init_ctx(module, req);
-       if (!ac) {
-               return LDB_ERR_OPERATIONS_ERROR;
+/* unlock all the backends: partitions (last to first), then the next module, then the metadata lock. Every unlock is attempted; returns LDB_SUCCESS or the first failure code seen */
+int partition_read_unlock(struct ldb_module *module)
+{
+       int i;
+       int ret = LDB_SUCCESS;
+       int ret2;
+       struct ldb_context *ldb = ldb_module_get_ctx(module);
+       struct partition_private_data *data = \
+               talloc_get_type(ldb_module_get_private(module),
+                               struct partition_private_data);
+       bool trace = module && ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING;
+
+       /*
+        * Order of read_unlock calls must be the reverse of that in
+        * partition_start_trans. See comment in that function for detail.
+        */
+       if (data && data->partitions) {
+               /* Just counting the partitions */
+               for (i=0; data->partitions[i]; i++) {}
+
+               /* now walk them backwards */
+               for (i--; i>=0; i--) {
+                       struct dsdb_partition *p = data->partitions[i];
+                       if (trace) {
+                               ldb_debug(ldb, LDB_DEBUG_TRACE,
+                                         "partition_read_unlock() -> %s",
+                                         ldb_dn_get_linearized(p->ctrl->dn));
+                       }
+                       ret2 = ldb_next_read_unlock(p->module);
+                       if (ret2 != LDB_SUCCESS) {
+                               ldb_debug_set(ldb,
+                                          LDB_DEBUG_FATAL,
+                                          "Failed to unlock db: %s / %s for %s",
+                                          ldb_errstring(ldb),
+                                          ldb_strerror(ret2),
+                                          ldb_dn_get_linearized(p->ctrl->dn));
+
+                               /*
+                                * Don't overwrite the original failure code
+                                * if there was one
+                                */
+                               if (ret == LDB_SUCCESS) {
+                                       ret = ret2;
+                               }
+                       }
+               }
        }
 
-       /* we need to add a control but we never touch the original request */
-       ret = partition_prep_request(ac, partition);
-       if (ret != LDB_SUCCESS) {
-               return ret;
+       if (trace) {
+               ldb_debug(ldb, LDB_DEBUG_TRACE,
+                         "partition_read_unlock() -> (metadata partition)");
        }
 
-       /* fire the first one */
-       ret = partition_call_first(ac);
+       ret2 = ldb_next_read_unlock(module);
+       if (ret2 != LDB_SUCCESS) {
+               ldb_debug_set(ldb,
+                             LDB_DEBUG_FATAL,
+                             "Failed to unlock db: %s / %s for metadata partition",
+                             ldb_errstring(ldb),
+                             ldb_strerror(ret2));
 
-       if (ret != LDB_SUCCESS){
-               return ret;
+               /*
+                * Don't overwrite the original failure code
+                * if there was one
+                */
+               if (ret == LDB_SUCCESS) {
+                       ret = ret2;
+               }
        }
 
-       return ldb_request_done(req, ret);
-}
+       ret2 = partition_metadata_read_unlock(module);
 
+       /*
+        * Don't overwrite the original failure code
+        * if there was one (the metadata result goes into ret2 so ret is kept)
+        */
+       if (ret == LDB_SUCCESS) {
+               ret = ret2;
+       }
+
+       return ret;
+}
 
 /* extended */
 static int partition_extended(struct ldb_module *module, struct ldb_request *req)
 {
-       struct partition_private_data *data;
+       struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
+                                                             struct partition_private_data);
        struct partition_context *ac;
        int ret;
 
-       data = talloc_get_type(module->private_data, struct partition_private_data);
+       /* if we aren't initialised yet, just pass the request on to the next module */
        if (!data) {
                return ldb_next_request(module, req);
        }
 
-       ret = partition_reload_if_required(module, data);
-       if (ret != LDB_SUCCESS) {
-               return ret;
+       if (strcmp(req->op.extended.oid, DSDB_EXTENDED_SCHEMA_UPDATE_NOW_OID) == 0) {
+               /* Update the metadata.tdb to increment the schema version if needed */
+               DEBUG(10, ("Incrementing the sequence_number after schema_update_now\n"));
+               ret = partition_metadata_inc_schema_sequence(module);
+               return ldb_module_done(req, NULL, NULL, ret);
        }
        
        if (strcmp(req->op.extended.oid, LDB_EXTENDED_SEQUENCE_NUMBER) == 0) {
@@ -1128,11 +1682,6 @@ static int partition_extended(struct ldb_module *module, struct ldb_request *req
                return partition_create(module, req);
        }
 
-       /* forward schemaUpdateNow operation to schema_fsmo module*/
-       if (strcmp(req->op.extended.oid, DSDB_EXTENDED_SCHEMA_UPDATE_NOW_OID) == 0) {
-               return partition_extended_schema_update_now( module, req );
-       }       
-
        /* 
         * as the extended operation has no dn
         * we need to send it to all partitions
@@ -1140,13 +1689,13 @@ static int partition_extended(struct ldb_module *module, struct ldb_request *req
 
        ac = partition_init_ctx(module, req);
        if (!ac) {
-               return LDB_ERR_OPERATIONS_ERROR;
+               return ldb_operr(ldb_module_get_ctx(module));
        }
 
        return partition_send_all(module, ac, req);
 }
 
-_PUBLIC_ const struct ldb_module_ops ldb_partition_module_ops = {
+static const struct ldb_module_ops ldb_partition_module_ops = {
        .name              = "partition",
        .init_context      = partition_init,
        .search            = partition_search,
@@ -1159,4 +1708,12 @@ _PUBLIC_ const struct ldb_module_ops ldb_partition_module_ops = {
        .prepare_commit    = partition_prepare_commit,
        .end_transaction   = partition_end_trans,
        .del_transaction   = partition_del_trans,
+       .read_lock         = partition_read_lock,
+       .read_unlock       = partition_read_unlock
 };
+
+int ldb_partition_module_init(const char *version)
+{
+       LDB_MODULE_CHECK_VERSION(version); /* macro defined elsewhere; presumably bails out on an ldb version mismatch - confirm */
+       return ldb_register_module(&ldb_partition_module_ops); /* registers the "partition" ops table and returns ldb_register_module()'s result */
+}