ctdb-locking: Update a comment
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_lock.c
index b313ed57358b0e2483f6ac7e981d8b3f8cff51d1..7d34830bc4c092f782dd4527a327de37418ed23f 100644 (file)
@@ -33,7 +33,7 @@
  * 2. Once the locks are obtained, signal parent process via fd.
  * 3. Invoke registered callback routine with locking status.
  * 4. If the child process cannot get locks within certain time,
- *    diagnose using /proc/locks and log warning message
+ *    execute an external script to debug.
  *
  * ctdb_lock_record()      - get a lock on a record
  * ctdb_lock_db()          - get a lock on a DB
@@ -43,9 +43,6 @@
  *  auto_mark              - whether to mark/unmark DBs in before/after callback
  */
 
-/* FIXME: Add a tunable max_lock_processes_per_db */
-#define MAX_LOCK_PROCESSES_PER_DB              (100)
-
 enum lock_type {
        LOCK_RECORD,
        LOCK_DB,
@@ -57,7 +54,7 @@ static const char * const lock_type_str[] = {
        "lock_record",
        "lock_db",
        "lock_alldb_prio",
-       "lock_db",
+       "lock_alldb",
 };
 
 struct lock_request;
@@ -71,19 +68,18 @@ struct lock_context {
        TDB_DATA key;
        uint32_t priority;
        bool auto_mark;
-       struct lock_request *req_queue;
+       struct lock_request *request;
        pid_t child;
        int fd[2];
        struct tevent_fd *tfd;
        struct tevent_timer *ttimer;
-       pid_t block_child;
-       int block_fd[2];
        struct timeval start_time;
+       uint32_t key_hash;
+       bool can_schedule;
 };
 
 /* lock_request is the client specific part for a lock request */
 struct lock_request {
-       struct lock_request *next, *prev;
        struct lock_context *lctx;
        void (*callback)(void *, bool);
        void *private_data;
@@ -241,7 +237,7 @@ static int db_lock_unmark_handler(struct ctdb_db_context *ctdb_db, uint32_t prio
 int ctdb_lockall_unmark_prio(struct ctdb_context *ctdb, uint32_t priority)
 {
        /*
-        * This function is only used by the main dameon during recovery.
+        * This function is only used by the main daemon during recovery.
         * At this stage, the databases have already been locked, by a
         * dedicated child process. The freeze_mode variable is used to track
         * whether the actual locks are held by the child process or not.
@@ -259,7 +255,7 @@ static int ctdb_lockall_unmark(struct ctdb_context *ctdb)
 {
        uint32_t priority;
 
-       for (priority=NUM_DB_PRIORITIES; priority>=0; priority--) {
+       for (priority=NUM_DB_PRIORITIES; priority>0; priority--) {
                if (ctdb_db_iterator(ctdb, priority, db_lock_unmark_handler, NULL) != 0) {
                        return -1;
                }
@@ -279,16 +275,17 @@ static int ctdb_lock_context_destructor(struct lock_context *lock_ctx)
        if (lock_ctx->child > 0) {
                ctdb_kill(lock_ctx->ctdb, lock_ctx->child, SIGKILL);
                DLIST_REMOVE(lock_ctx->ctdb->lock_current, lock_ctx);
-               lock_ctx->ctdb->lock_num_current--;
+               if (lock_ctx->ctdb_db) {
+                       lock_ctx->ctdb_db->lock_num_current--;
+               }
                CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_current);
-               if (lock_ctx->type == LOCK_RECORD || lock_ctx->type == LOCK_DB) {
+               if (lock_ctx->ctdb_db) {
                        CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
                }
        } else {
                DLIST_REMOVE(lock_ctx->ctdb->lock_pending, lock_ctx);
-               lock_ctx->ctdb->lock_num_pending--;
                CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
-               if (lock_ctx->type == LOCK_RECORD || lock_ctx->type == LOCK_DB) {
+               if (lock_ctx->ctdb_db) {
                        CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
                }
        }
@@ -304,11 +301,10 @@ static int ctdb_lock_context_destructor(struct lock_context *lock_ctx)
  */
 static int ctdb_lock_request_destructor(struct lock_request *lock_request)
 {
-       DLIST_REMOVE(lock_request->lctx->req_queue, lock_request);
+       lock_request->lctx->request = NULL;
        return 0;
 }
 
-
 void ctdb_lock_free_request_context(struct lock_request *lock_req)
 {
        struct lock_context *lock_ctx;
@@ -326,7 +322,7 @@ void ctdb_lock_free_request_context(struct lock_request *lock_req)
  */
 static void process_callbacks(struct lock_context *lock_ctx, bool locked)
 {
-       struct lock_request *request, *next;
+       struct lock_request *request;
 
        if (lock_ctx->auto_mark && locked) {
                switch (lock_ctx->type) {
@@ -348,19 +344,12 @@ static void process_callbacks(struct lock_context *lock_ctx, bool locked)
                }
        }
 
-       /* Iterate through all callbacks */
-       request = lock_ctx->req_queue;
-       while (request) {
-               if (lock_ctx->auto_mark) {
-                       /* Reset the destructor, so request is not removed from the list */
-                       talloc_set_destructor(request, NULL);
-               }
-
-               /* In case, callback frees the request, store next */
-               next = request->next;
-               request->callback(request->private_data, locked);
-               request = next;
+       request = lock_ctx->request;
+       if (lock_ctx->auto_mark) {
+               /* Reset the destructor, so freeing the request does not clear lock_ctx->request */
+               talloc_set_destructor(request, NULL);
        }
+       request->callback(request->private_data, locked);
 
        if (lock_ctx->auto_mark && locked) {
                switch (lock_ctx->type) {
@@ -448,26 +437,25 @@ static void ctdb_lock_handler(struct tevent_context *ev,
        }
 
        /* Read the status from the child process */
-       read(lock_ctx->fd[0], &c, 1);
-       locked = (c == 0 ? true : false);
+       if (read(lock_ctx->fd[0], &c, 1) != 1) {
+               locked = false;
+       } else {
+               locked = (c == 0 ? true : false);
+       }
 
        /* Update statistics */
-       CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
        CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_calls);
        if (lock_ctx->ctdb_db) {
-               CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
                CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_calls);
        }
 
        if (locked) {
                if (lock_ctx->ctdb_db) {
-                       CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_current);
                        CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.buckets[id]);
                        CTDB_UPDATE_LATENCY(lock_ctx->ctdb, lock_ctx->ctdb_db,
                                            lock_type_str[lock_ctx->type], locks.latency,
                                            lock_ctx->start_time);
 
-                       CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
                        CTDB_UPDATE_DB_LATENCY(lock_ctx->ctdb_db, lock_type_str[lock_ctx->type], locks.latency, t);
                        CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.buckets[id]);
                }
@@ -495,7 +483,7 @@ static void ctdb_lock_timeout_handler(struct tevent_context *ev,
                                    struct timeval current_time,
                                    void *private_data)
 {
-       const char *cmd = getenv("CTDB_DEBUG_LOCKS");
+       static const char * debug_locks = NULL;
        struct lock_context *lock_ctx;
        struct ctdb_context *ctdb;
        pid_t pid;
@@ -503,7 +491,7 @@ static void ctdb_lock_timeout_handler(struct tevent_context *ev,
        lock_ctx = talloc_get_type_abort(private_data, struct lock_context);
        ctdb = lock_ctx->ctdb;
 
-       if (lock_ctx->type == LOCK_RECORD || lock_ctx->type == LOCK_DB) {
+       if (lock_ctx->ctdb_db) {
                DEBUG(DEBUG_WARNING,
                      ("Unable to get %s lock on database %s for %.0lf seconds\n",
                       (lock_ctx->type == LOCK_RECORD ? "RECORD" : "DB"),
@@ -515,12 +503,26 @@ static void ctdb_lock_timeout_handler(struct tevent_context *ev,
                       timeval_elapsed(&lock_ctx->start_time)));
        }
 
-       /* fire a child process to find the blocking process */
-       if (cmd != NULL) {
-               pid = fork();
+       /* Fire a child process to find the blocking process. */
+       if (debug_locks == NULL) {
+               debug_locks = getenv("CTDB_DEBUG_LOCKS");
+               if (debug_locks == NULL) {
+                       debug_locks = talloc_asprintf(ctdb,
+                                                     "%s/debug_locks.sh",
+                                                     getenv("CTDB_BASE"));
+               }
+       }
+       if (debug_locks != NULL) {
+               pid = vfork();
                if (pid == 0) {
-                       execl(cmd, cmd, NULL);
+                       execl(debug_locks, debug_locks, NULL);
+                       _exit(0);
                }
+               ctdb_track_child(ctdb, pid);
+       } else {
+               DEBUG(DEBUG_WARNING,
+                     (__location__
+                      " Unable to setup lock debugging - no memory?\n"));
        }
 
        /* reset the timeout timer */
@@ -648,59 +650,6 @@ static char **lock_helper_args(TALLOC_CTX *mem_ctx, struct lock_context *lock_ct
 }
 
 
-/*
- * Find the lock context of a given type
- */
-static struct lock_context *find_lock_context(struct lock_context *lock_list,
-                                             struct ctdb_db_context *ctdb_db,
-                                             TDB_DATA key,
-                                             uint32_t priority,
-                                             enum lock_type type)
-{
-       struct lock_context *lock_ctx;
-
-       /* Search active locks */
-       for (lock_ctx=lock_list; lock_ctx; lock_ctx=lock_ctx->next) {
-               if (lock_ctx->type != type) {
-                       continue;
-               }
-
-               switch (lock_ctx->type) {
-               case LOCK_RECORD:
-                       if (ctdb_db == lock_ctx->ctdb_db &&
-                           key.dsize == lock_ctx->key.dsize &&
-                           memcmp(key.dptr, lock_ctx->key.dptr, key.dsize) == 0) {
-                               goto done;
-                       }
-                       break;
-
-               case LOCK_DB:
-                       if (ctdb_db == lock_ctx->ctdb_db) {
-                               goto done;
-                       }
-                       break;
-
-               case LOCK_ALLDB_PRIO:
-                       if (priority == lock_ctx->priority) {
-                               goto done;
-                       }
-                       break;
-
-               case LOCK_ALLDB:
-                       goto done;
-                       break;
-               }
-       }
-
-       /* Did not find the lock context we are searching for */
-       lock_ctx = NULL;
-
-done:
-       return lock_ctx;
-
-}
-
-
 /*
  * Schedule a new lock child process
  * Set up callback handler and timeout handler
@@ -726,10 +675,6 @@ static void ctdb_lock_schedule(struct ctdb_context *ctdb)
                CTDB_NO_MEMORY_VOID(ctdb, prog);
        }
 
-       if (ctdb->lock_num_current >= MAX_LOCK_PROCESSES_PER_DB) {
-               return;
-       }
-
        if (ctdb->lock_pending == NULL) {
                return;
        }
@@ -737,22 +682,23 @@ static void ctdb_lock_schedule(struct ctdb_context *ctdb)
        /* Find a lock context with requests */
        lock_ctx = ctdb->lock_pending;
        while (lock_ctx != NULL) {
-               if (! lock_ctx->req_queue) {
-                       next_ctx = lock_ctx->next;
-                       DEBUG(DEBUG_INFO, ("Removing lock context without lock requests\n"));
+               next_ctx = lock_ctx->next;
+               if (! lock_ctx->request) {
+                       DEBUG(DEBUG_INFO, ("Removing lock context without lock request\n"));
                        DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
-                       ctdb->lock_num_pending--;
                        CTDB_DECREMENT_STAT(ctdb, locks.num_pending);
                        if (lock_ctx->ctdb_db) {
                                CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
                        }
                        talloc_free(lock_ctx);
-                       lock_ctx = next_ctx;
-                       continue;
                } else {
-                       /* Found a lock context with lock requests */
-                       break;
+                       if (lock_ctx->ctdb_db == NULL ||
+                           lock_ctx->ctdb_db->lock_num_current < ctdb->tunable.lock_processes_per_db) {
+                               /* Found a lock context with a request that is within the per-DB limit */
+                               break;
+                       }
                }
+               lock_ctx = next_ctx;
        }
 
        if (lock_ctx == NULL) {
@@ -787,7 +733,7 @@ static void ctdb_lock_schedule(struct ctdb_context *ctdb)
                return;
        }
 
-       lock_ctx->child = ctdb_fork(ctdb);
+       lock_ctx->child = vfork();
 
        if (lock_ctx->child == (pid_t)-1) {
                DEBUG(DEBUG_ERR, ("Failed to create a child in ctdb_lock_schedule\n"));
@@ -809,6 +755,7 @@ static void ctdb_lock_schedule(struct ctdb_context *ctdb)
        }
 
        /* Parent process */
+       ctdb_track_child(ctdb, lock_ctx->child);
        close(lock_ctx->fd[1]);
 
        talloc_set_destructor(lock_ctx, ctdb_lock_context_destructor);
@@ -848,9 +795,14 @@ static void ctdb_lock_schedule(struct ctdb_context *ctdb)
 
        /* Move the context from pending to current */
        DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
-       ctdb->lock_num_pending--;
        DLIST_ADD_END(ctdb->lock_current, lock_ctx, NULL);
-       ctdb->lock_num_current++;
+       CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
+       CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_current);
+       if (lock_ctx->ctdb_db) {
+               lock_ctx->ctdb_db->lock_num_current++;
+               CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
+               CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
+       }
 }
 
 
@@ -866,7 +818,7 @@ static struct lock_request *ctdb_lock_internal(struct ctdb_context *ctdb,
                                               enum lock_type type,
                                               bool auto_mark)
 {
-       struct lock_context *lock_ctx;
+       struct lock_context *lock_ctx = NULL;
        struct lock_request *request;
 
        if (callback == NULL) {
@@ -874,54 +826,59 @@ static struct lock_request *ctdb_lock_internal(struct ctdb_context *ctdb,
                return NULL;
        }
 
-       /* get a context for this key - search only the pending contexts,
-        * current contexts might in the middle of processing callbacks */
-       lock_ctx = find_lock_context(ctdb->lock_pending, ctdb_db, key, priority, type);
-
-       /* No existing context, create one */
+       lock_ctx = talloc_zero(ctdb, struct lock_context);
        if (lock_ctx == NULL) {
-               lock_ctx = talloc_zero(ctdb, struct lock_context);
-               if (lock_ctx == NULL) {
-                       DEBUG(DEBUG_ERR, ("Failed to create a new lock context\n"));
-                       return NULL;
-               }
+               DEBUG(DEBUG_ERR, ("Failed to create a new lock context\n"));
+               return NULL;
+       }
 
-               lock_ctx->type = type;
-               lock_ctx->ctdb = ctdb;
-               lock_ctx->ctdb_db = ctdb_db;
-               lock_ctx->key.dsize = key.dsize;
-               if (key.dsize > 0) {
-                       lock_ctx->key.dptr = talloc_memdup(lock_ctx, key.dptr, key.dsize);
-               } else {
-                       lock_ctx->key.dptr = NULL;
+       if ((request = talloc_zero(lock_ctx, struct lock_request)) == NULL) {
+               talloc_free(lock_ctx);
+               return NULL;
+       }
+
+       lock_ctx->type = type;
+       lock_ctx->ctdb = ctdb;
+       lock_ctx->ctdb_db = ctdb_db;
+       lock_ctx->key.dsize = key.dsize;
+       if (key.dsize > 0) {
+               lock_ctx->key.dptr = talloc_memdup(lock_ctx, key.dptr, key.dsize);
+               if (lock_ctx->key.dptr == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ "Memory allocation error\n"));
+                       talloc_free(lock_ctx);
+                       return NULL;
                }
-               lock_ctx->priority = priority;
-               lock_ctx->auto_mark = auto_mark;
+               lock_ctx->key_hash = ctdb_hash(&key);
+       } else {
+               lock_ctx->key.dptr = NULL;
+       }
+       lock_ctx->priority = priority;
+       lock_ctx->auto_mark = auto_mark;
 
-               lock_ctx->child = -1;
-               lock_ctx->block_child = -1;
+       lock_ctx->request = request;
+       lock_ctx->child = -1;
 
+       /* Non-record locks are required by recovery and should be scheduled
+        * immediately, so keep them at the head of the pending queue.
+        */
+       if (lock_ctx->type == LOCK_RECORD) {
                DLIST_ADD_END(ctdb->lock_pending, lock_ctx, NULL);
-               ctdb->lock_num_pending++;
-               CTDB_INCREMENT_STAT(ctdb, locks.num_pending);
-               if (ctdb_db) {
-                       CTDB_INCREMENT_DB_STAT(ctdb_db, locks.num_pending);
-               }
-
-               /* Start the timer when we activate the context */
-               lock_ctx->start_time = timeval_current();
+       } else {
+               DLIST_ADD(ctdb->lock_pending, lock_ctx);
        }
-
-       if ((request = talloc_zero(lock_ctx, struct lock_request)) == NULL) {
-               return NULL;
+       CTDB_INCREMENT_STAT(ctdb, locks.num_pending);
+       if (ctdb_db) {
+               CTDB_INCREMENT_DB_STAT(ctdb_db, locks.num_pending);
        }
 
+       /* Record the start time as soon as the context is queued as pending */
+       lock_ctx->start_time = timeval_current();
+
        request->lctx = lock_ctx;
        request->callback = callback;
        request->private_data = private_data;
 
        talloc_set_destructor(request, ctdb_lock_request_destructor);
-       DLIST_ADD_END(lock_ctx->req_queue, request, NULL);
 
        ctdb_lock_schedule(ctdb);
 
@@ -977,7 +934,7 @@ struct lock_request *ctdb_lock_alldb_prio(struct ctdb_context *ctdb,
                                          void (*callback)(void *, bool),
                                          void *private_data)
 {
-       if (priority < 0 || priority > NUM_DB_PRIORITIES) {
+       if (priority < 1 || priority > NUM_DB_PRIORITIES) {
                DEBUG(DEBUG_ERR, ("Invalid db priority: %u\n", priority));
                return NULL;
        }