ctdb-locking: Update a comment
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_lock.c
index d6beed86b51ca9543f9b21120a1140c43fafcaf6..7d34830bc4c092f782dd4527a327de37418ed23f 100644 (file)
@@ -33,7 +33,7 @@
  * 2. Once the locks are obtained, signal parent process via fd.
  * 3. Invoke registered callback routine with locking status.
  * 4. If the child process cannot get locks within certain time,
- *    diagnose using /proc/locks and log warning message
+ *    execute an external script to debug.
  *
  * ctdb_lock_record()      - get a lock on a record
  * ctdb_lock_db()          - get a lock on a DB
@@ -43,9 +43,6 @@
  *  auto_mark              - whether to mark/unmark DBs in before/after callback
  */
 
-/* FIXME: Add a tunable max_lock_processes_per_db */
-#define MAX_LOCK_PROCESSES_PER_DB              (100)
-
 enum lock_type {
        LOCK_RECORD,
        LOCK_DB,
@@ -71,18 +68,18 @@ struct lock_context {
        TDB_DATA key;
        uint32_t priority;
        bool auto_mark;
-       struct lock_request *req_queue;
+       struct lock_request *request;
        pid_t child;
        int fd[2];
        struct tevent_fd *tfd;
        struct tevent_timer *ttimer;
        struct timeval start_time;
        uint32_t key_hash;
+       bool can_schedule;
 };
 
 /* lock_request is the client specific part for a lock request */
 struct lock_request {
-       struct lock_request *next, *prev;
        struct lock_context *lctx;
        void (*callback)(void *, bool);
        void *private_data;
@@ -282,14 +279,13 @@ static int ctdb_lock_context_destructor(struct lock_context *lock_ctx)
                        lock_ctx->ctdb_db->lock_num_current--;
                }
                CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_current);
-               if (lock_ctx->type == LOCK_RECORD || lock_ctx->type == LOCK_DB) {
+               if (lock_ctx->ctdb_db) {
                        CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
                }
        } else {
                DLIST_REMOVE(lock_ctx->ctdb->lock_pending, lock_ctx);
-               lock_ctx->ctdb->lock_num_pending--;
                CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
-               if (lock_ctx->type == LOCK_RECORD || lock_ctx->type == LOCK_DB) {
+               if (lock_ctx->ctdb_db) {
                        CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
                }
        }
@@ -305,11 +301,10 @@ static int ctdb_lock_context_destructor(struct lock_context *lock_ctx)
  */
 static int ctdb_lock_request_destructor(struct lock_request *lock_request)
 {
-       DLIST_REMOVE(lock_request->lctx->req_queue, lock_request);
+       lock_request->lctx->request = NULL;
        return 0;
 }
 
-
 void ctdb_lock_free_request_context(struct lock_request *lock_req)
 {
        struct lock_context *lock_ctx;
@@ -327,7 +322,7 @@ void ctdb_lock_free_request_context(struct lock_request *lock_req)
  */
 static void process_callbacks(struct lock_context *lock_ctx, bool locked)
 {
-       struct lock_request *request, *next;
+       struct lock_request *request;
 
        if (lock_ctx->auto_mark && locked) {
                switch (lock_ctx->type) {
@@ -349,19 +344,12 @@ static void process_callbacks(struct lock_context *lock_ctx, bool locked)
                }
        }
 
-       /* Iterate through all callbacks */
-       request = lock_ctx->req_queue;
-       while (request) {
-               if (lock_ctx->auto_mark) {
-                       /* Reset the destructor, so request is not removed from the list */
-                       talloc_set_destructor(request, NULL);
-               }
-
-               /* In case, callback frees the request, store next */
-               next = request->next;
-               request->callback(request->private_data, locked);
-               request = next;
+       request = lock_ctx->request;
+       if (lock_ctx->auto_mark) {
+               /* Reset the destructor, so freeing the request does not clear lock_ctx->request */
+               talloc_set_destructor(request, NULL);
        }
+       request->callback(request->private_data, locked);
 
        if (lock_ctx->auto_mark && locked) {
                switch (lock_ctx->type) {
@@ -456,10 +444,8 @@ static void ctdb_lock_handler(struct tevent_context *ev,
        }
 
        /* Update statistics */
-       CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
        CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_calls);
        if (lock_ctx->ctdb_db) {
-               CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
                CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_calls);
        }
 
@@ -505,7 +491,7 @@ static void ctdb_lock_timeout_handler(struct tevent_context *ev,
        lock_ctx = talloc_get_type_abort(private_data, struct lock_context);
        ctdb = lock_ctx->ctdb;
 
-       if (lock_ctx->type == LOCK_RECORD || lock_ctx->type == LOCK_DB) {
+       if (lock_ctx->ctdb_db) {
                DEBUG(DEBUG_WARNING,
                      ("Unable to get %s lock on database %s for %.0lf seconds\n",
                       (lock_ctx->type == LOCK_RECORD ? "RECORD" : "DB"),
@@ -664,66 +650,13 @@ static char **lock_helper_args(TALLOC_CTX *mem_ctx, struct lock_context *lock_ct
 }
 
 
-/*
- * Find the lock context of a given type
- */
-static struct lock_context *find_lock_context(struct lock_context *lock_list,
-                                             struct ctdb_db_context *ctdb_db,
-                                             TDB_DATA key,
-                                             uint32_t priority,
-                                             enum lock_type type,
-                                             uint32_t key_hash)
-{
-       struct lock_context *lock_ctx;
-
-       /* Search active locks */
-       for (lock_ctx=lock_list; lock_ctx; lock_ctx=lock_ctx->next) {
-               if (lock_ctx->type != type) {
-                       continue;
-               }
-
-               switch (lock_ctx->type) {
-               case LOCK_RECORD:
-                       if (ctdb_db == lock_ctx->ctdb_db &&
-                           key_hash == lock_ctx->key_hash) {
-                               goto done;
-                       }
-                       break;
-
-               case LOCK_DB:
-                       if (ctdb_db == lock_ctx->ctdb_db) {
-                               goto done;
-                       }
-                       break;
-
-               case LOCK_ALLDB_PRIO:
-                       if (priority == lock_ctx->priority) {
-                               goto done;
-                       }
-                       break;
-
-               case LOCK_ALLDB:
-                       goto done;
-                       break;
-               }
-       }
-
-       /* Did not find the lock context we are searching for */
-       lock_ctx = NULL;
-
-done:
-       return lock_ctx;
-
-}
-
-
 /*
  * Schedule a new lock child process
  * Set up callback handler and timeout handler
  */
 static void ctdb_lock_schedule(struct ctdb_context *ctdb)
 {
-       struct lock_context *lock_ctx, *next_ctx, *active_ctx;
+       struct lock_context *lock_ctx, *next_ctx;
        int ret;
        TALLOC_CTX *tmp_ctx;
        const char *helper = BINDIR "/ctdb_lock_helper";
@@ -750,31 +683,20 @@ static void ctdb_lock_schedule(struct ctdb_context *ctdb)
        lock_ctx = ctdb->lock_pending;
        while (lock_ctx != NULL) {
                next_ctx = lock_ctx->next;
-               if (! lock_ctx->req_queue) {
-                       DEBUG(DEBUG_INFO, ("Removing lock context without lock requests\n"));
+               if (! lock_ctx->request) {
+                       DEBUG(DEBUG_INFO, ("Removing lock context without lock request\n"));
                        DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
-                       ctdb->lock_num_pending--;
                        CTDB_DECREMENT_STAT(ctdb, locks.num_pending);
                        if (lock_ctx->ctdb_db) {
                                CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
                        }
                        talloc_free(lock_ctx);
                } else {
-                       active_ctx = find_lock_context(ctdb->lock_current, lock_ctx->ctdb_db,
-                                                      lock_ctx->key, lock_ctx->priority,
-                                                      lock_ctx->type, lock_ctx->key_hash);
-                       if (active_ctx == NULL) {
-                               if (lock_ctx->ctdb_db == NULL ||
-                                   lock_ctx->ctdb_db->lock_num_current < MAX_LOCK_PROCESSES_PER_DB) {
-                                       /* Found a lock context with lock requests */
-                                       break;
-                               }
+                       if (lock_ctx->ctdb_db == NULL ||
+                           lock_ctx->ctdb_db->lock_num_current < ctdb->tunable.lock_processes_per_db) {
+                               /* Found a lock context with a lock request */
+                               break;
                        }
-
-                       /* There is already a child waiting for the
-                        * same key.  So don't schedule another child
-                        * just yet.
-                        */
                }
                lock_ctx = next_ctx;
        }
@@ -873,11 +795,12 @@ static void ctdb_lock_schedule(struct ctdb_context *ctdb)
 
        /* Move the context from pending to current */
        DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
-       ctdb->lock_num_pending--;
        DLIST_ADD_END(ctdb->lock_current, lock_ctx, NULL);
+       CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
+       CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_current);
        if (lock_ctx->ctdb_db) {
                lock_ctx->ctdb_db->lock_num_current++;
-               CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_current);
+               CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
                CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
        }
 }
@@ -903,65 +826,59 @@ static struct lock_request *ctdb_lock_internal(struct ctdb_context *ctdb,
                return NULL;
        }
 
-#if 0
-       /* Disable this optimization to ensure first-in-first-out fair
-        * scheduling of lock requests */
+       lock_ctx = talloc_zero(ctdb, struct lock_context);
+       if (lock_ctx == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to create a new lock context\n"));
+               return NULL;
+       }
 
-       /* get a context for this key - search only the pending contexts,
-        * current contexts might in the middle of processing callbacks */
-       lock_ctx = find_lock_context(ctdb->lock_pending, ctdb_db, key, priority, type);
-#endif
+       if ((request = talloc_zero(lock_ctx, struct lock_request)) == NULL) {
+               talloc_free(lock_ctx);
+               return NULL;
+       }
 
-       /* No existing context, create one */
-       if (lock_ctx == NULL) {
-               lock_ctx = talloc_zero(ctdb, struct lock_context);
-               if (lock_ctx == NULL) {
-                       DEBUG(DEBUG_ERR, ("Failed to create a new lock context\n"));
+       lock_ctx->type = type;
+       lock_ctx->ctdb = ctdb;
+       lock_ctx->ctdb_db = ctdb_db;
+       lock_ctx->key.dsize = key.dsize;
+       if (key.dsize > 0) {
+               lock_ctx->key.dptr = talloc_memdup(lock_ctx, key.dptr, key.dsize);
+               if (lock_ctx->key.dptr == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ "Memory allocation error\n"));
+                       talloc_free(lock_ctx);
                        return NULL;
                }
+               lock_ctx->key_hash = ctdb_hash(&key);
+       } else {
+               lock_ctx->key.dptr = NULL;
+       }
+       lock_ctx->priority = priority;
+       lock_ctx->auto_mark = auto_mark;
 
-               lock_ctx->type = type;
-               lock_ctx->ctdb = ctdb;
-               lock_ctx->ctdb_db = ctdb_db;
-               lock_ctx->key.dsize = key.dsize;
-               if (key.dsize > 0) {
-                       lock_ctx->key.dptr = talloc_memdup(lock_ctx, key.dptr, key.dsize);
-                       if (lock_ctx->key.dptr == NULL) {
-                               DEBUG(DEBUG_ERR, (__location__ "Memory allocation error\n"));
-                               talloc_free(lock_ctx);
-                               return NULL;
-                       }
-                       lock_ctx->key_hash = ctdb_hash(&key);
-               } else {
-                       lock_ctx->key.dptr = NULL;
-               }
-               lock_ctx->priority = priority;
-               lock_ctx->auto_mark = auto_mark;
-
-               lock_ctx->child = -1;
+       lock_ctx->request = request;
+       lock_ctx->child = -1;
 
+       /* Non-record locks are required by recovery and should be scheduled
+        * immediately, so keep them at the head of the pending queue.
+        */
+       if (lock_ctx->type == LOCK_RECORD) {
                DLIST_ADD_END(ctdb->lock_pending, lock_ctx, NULL);
-               ctdb->lock_num_pending++;
-               CTDB_INCREMENT_STAT(ctdb, locks.num_pending);
-               if (ctdb_db) {
-                       CTDB_INCREMENT_DB_STAT(ctdb_db, locks.num_pending);
-               }
-
-               /* Start the timer when we activate the context */
-               lock_ctx->start_time = timeval_current();
+       } else {
+               DLIST_ADD(ctdb->lock_pending, lock_ctx);
        }
-
-       if ((request = talloc_zero(lock_ctx, struct lock_request)) == NULL) {
-               talloc_free(lock_ctx);
-               return NULL;
+       CTDB_INCREMENT_STAT(ctdb, locks.num_pending);
+       if (ctdb_db) {
+               CTDB_INCREMENT_DB_STAT(ctdb_db, locks.num_pending);
        }
 
+       /* Start the timer when we activate the context */
+       lock_ctx->start_time = timeval_current();
+
        request->lctx = lock_ctx;
        request->callback = callback;
        request->private_data = private_data;
 
        talloc_set_destructor(request, ctdb_lock_request_destructor);
-       DLIST_ADD_END(lock_ctx->req_queue, request, NULL);
 
        ctdb_lock_schedule(ctdb);