ctdb-locking: Remove unused priority argument from db_handler_t
[samba.git] / ctdb / server / ctdb_lock.c
1 /*
2    ctdb lock handling
3    provide API to do non-blocking locks for single or all databases
4
5    Copyright (C) Amitay Isaacs  2012
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "includes.h"
21 #include "include/ctdb_private.h"
22 #include "include/ctdb_protocol.h"
23 #include "tevent.h"
24 #include "tdb.h"
25 #include "lib/tdb_wrap/tdb_wrap.h"
26 #include "system/filesys.h"
27 #include "lib/util/dlinklist.h"
28
29 /*
30  * Non-blocking Locking API
31  *
32  * 1. Create a child process to do blocking locks.
33  * 2. Once the locks are obtained, signal parent process via fd.
34  * 3. Invoke registered callback routine with locking status.
35  * 4. If the child process cannot get locks within certain time,
36  *    execute an external script to debug.
37  *
38  * ctdb_lock_record()      - get a lock on a record
39  * ctdb_lock_db()          - get a lock on a DB
40  * ctdb_lock_alldb_prio()  - get a lock on all DBs with given priority
41  * ctdb_lock_alldb()       - get a lock on all DBs
42  *
43  *  auto_mark              - whether to mark/unmark DBs in before/after callback
44  *                           = false is used for freezing databases for
45  *                           recovery since the recovery cannot start till
46  *                           databases are locked on all the nodes.
47  *                           = true is used for record locks.
48  */
49
/*
 * Kinds of lock that can be requested.  The numeric value is also used
 * as an index into lock_type_str[].
 */
enum lock_type {
	LOCK_RECORD = 0,
	LOCK_DB,
	LOCK_ALLDB_PRIO,
	LOCK_ALLDB,
};

/* Name of each lock type, indexed by enum lock_type */
static const char * const lock_type_str[] = {
	[LOCK_RECORD]     = "lock_record",
	[LOCK_DB]         = "lock_db",
	[LOCK_ALLDB_PRIO] = "lock_alldb_prio",
	[LOCK_ALLDB]      = "lock_alldb",
};
63
64 struct lock_request;
65
66 /* lock_context is the common part for a lock request */
67 struct lock_context {
68         struct lock_context *next, *prev;
69         enum lock_type type;
70         struct ctdb_context *ctdb;
71         struct ctdb_db_context *ctdb_db;
72         TDB_DATA key;
73         uint32_t priority;
74         bool auto_mark;
75         struct lock_request *request;
76         pid_t child;
77         int fd[2];
78         struct tevent_fd *tfd;
79         struct tevent_timer *ttimer;
80         struct timeval start_time;
81         uint32_t key_hash;
82         bool can_schedule;
83 };
84
/*
 * lock_request is the client specific part for a lock request.
 */
struct lock_request {
	/* Lock context serving this request, or NULL when detached */
	struct lock_context *lctx;
	/* Called with private_data and whether the lock was obtained */
	void (*callback)(void *private_data, bool locked);
	void *private_data;
};
91
92
93 /*
94  * Support samba 3.6.x (and older) versions which do not set db priority.
95  *
96  * By default, all databases are set to priority 1. So only when priority
97  * is set to 1, check for databases that need higher priority.
98  */
99 static bool later_db(struct ctdb_context *ctdb, const char *name)
100 {
101         if (ctdb->tunable.samba3_hack == 0) {
102                 return false;
103         }
104
105         if (strstr(name, "brlock") ||
106             strstr(name, "g_lock") ||
107             strstr(name, "notify_onelevel") ||
108             strstr(name, "serverid") ||
109             strstr(name, "xattr_tdb")) {
110                 return true;
111         }
112
113         return false;
114 }
115
116 typedef int (*db_handler_t)(struct ctdb_db_context *ctdb_db,
117                             void *private_data);
118
119 static int ctdb_db_iterator(struct ctdb_context *ctdb, uint32_t priority,
120                             db_handler_t handler, void *private_data)
121 {
122         struct ctdb_db_context *ctdb_db;
123         int ret;
124
125         for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
126                 if (ctdb_db->priority != priority) {
127                         continue;
128                 }
129                 if (later_db(ctdb, ctdb_db->db_name)) {
130                         continue;
131                 }
132                 ret = handler(ctdb_db, private_data);
133                 if (ret != 0) {
134                         return -1;
135                 }
136         }
137
138         /* If priority != 1, later_db check is not required and can return */
139         if (priority != 1) {
140                 return 0;
141         }
142
143         for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
144                 if (!later_db(ctdb, ctdb_db->db_name)) {
145                         continue;
146                 }
147                 ret = handler(ctdb_db, private_data);
148                 if (ret != 0) {
149                         return -1;
150                 }
151         }
152
153         return 0;
154 }
155
156
157 /*
158  * lock all databases - mark only
159  */
160 static int db_lock_mark_handler(struct ctdb_db_context *ctdb_db,
161                                 void *private_data)
162 {
163         int tdb_transaction_write_lock_mark(struct tdb_context *);
164
165         DEBUG(DEBUG_INFO, ("marking locked database %s\n", ctdb_db->db_name));
166
167         if (tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb) != 0) {
168                 DEBUG(DEBUG_ERR, ("Failed to mark (transaction lock) database %s\n",
169                                   ctdb_db->db_name));
170                 return -1;
171         }
172
173         if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
174                 DEBUG(DEBUG_ERR, ("Failed to mark (all lock) database %s\n",
175                                   ctdb_db->db_name));
176                 return -1;
177         }
178
179         return 0;
180 }
181
182 int ctdb_lockall_mark_prio(struct ctdb_context *ctdb, uint32_t priority)
183 {
184         /*
185          * This function is only used by the main dameon during recovery.
186          * At this stage, the databases have already been locked, by a
187          * dedicated child process. The freeze_mode variable is used to track
188          * whether the actual locks are held by the child process or not.
189          */
190
191         if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
192                 DEBUG(DEBUG_ERR, ("Attempt to mark all databases locked when not frozen\n"));
193                 return -1;
194         }
195
196         return ctdb_db_iterator(ctdb, priority, db_lock_mark_handler, NULL);
197 }
198
199 static int ctdb_lockall_mark(struct ctdb_context *ctdb)
200 {
201         uint32_t priority;
202
203         for (priority=1; priority<=NUM_DB_PRIORITIES; priority++) {
204                 if (ctdb_db_iterator(ctdb, priority, db_lock_mark_handler, NULL) != 0) {
205                         return -1;
206                 }
207         }
208
209         return 0;
210 }
211
212
213 /*
214  * lock all databases - unmark only
215  */
216 static int db_lock_unmark_handler(struct ctdb_db_context *ctdb_db,
217                                   void *private_data)
218 {
219         int tdb_transaction_write_lock_unmark(struct tdb_context *);
220
221         DEBUG(DEBUG_INFO, ("unmarking locked database %s\n", ctdb_db->db_name));
222
223         if (tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb) != 0) {
224                 DEBUG(DEBUG_ERR, ("Failed to unmark (transaction lock) database %s\n",
225                                   ctdb_db->db_name));
226                 return -1;
227         }
228
229         if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) {
230                 DEBUG(DEBUG_ERR, ("Failed to unmark (all lock) database %s\n",
231                                   ctdb_db->db_name));
232                 return -1;
233         }
234
235         return 0;
236 }
237
238 int ctdb_lockall_unmark_prio(struct ctdb_context *ctdb, uint32_t priority)
239 {
240         /*
241          * This function is only used by the main daemon during recovery.
242          * At this stage, the databases have already been locked, by a
243          * dedicated child process. The freeze_mode variable is used to track
244          * whether the actual locks are held by the child process or not.
245          */
246
247         if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
248                 DEBUG(DEBUG_ERR, ("Attempt to unmark all databases locked when not frozen\n"));
249                 return -1;
250         }
251
252         return ctdb_db_iterator(ctdb, priority, db_lock_unmark_handler, NULL);
253 }
254
255 static int ctdb_lockall_unmark(struct ctdb_context *ctdb)
256 {
257         uint32_t priority;
258
259         for (priority=NUM_DB_PRIORITIES; priority>0; priority--) {
260                 if (ctdb_db_iterator(ctdb, priority, db_lock_unmark_handler, NULL) != 0) {
261                         return -1;
262                 }
263         }
264
265         return 0;
266 }
267
268
269 static void ctdb_lock_schedule(struct ctdb_context *ctdb);
270
271 /*
272  * Destructor to kill the child locking process
273  */
274 static int ctdb_lock_context_destructor(struct lock_context *lock_ctx)
275 {
276         if (lock_ctx->request) {
277                 lock_ctx->request->lctx = NULL;
278         }
279         if (lock_ctx->child > 0) {
280                 ctdb_kill(lock_ctx->ctdb, lock_ctx->child, SIGKILL);
281                 if (lock_ctx->type == LOCK_RECORD) {
282                         DLIST_REMOVE(lock_ctx->ctdb_db->lock_current, lock_ctx);
283                 } else {
284                         DLIST_REMOVE(lock_ctx->ctdb->lock_current, lock_ctx);
285                 }
286                 if (lock_ctx->ctdb_db) {
287                         lock_ctx->ctdb_db->lock_num_current--;
288                 }
289                 CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_current);
290                 if (lock_ctx->ctdb_db) {
291                         CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
292                 }
293         } else {
294                 if (lock_ctx->type == LOCK_RECORD) {
295                         DLIST_REMOVE(lock_ctx->ctdb_db->lock_pending, lock_ctx);
296                 } else {
297                         DLIST_REMOVE(lock_ctx->ctdb->lock_pending, lock_ctx);
298                 }
299                 CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
300                 if (lock_ctx->ctdb_db) {
301                         CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
302                 }
303         }
304
305         ctdb_lock_schedule(lock_ctx->ctdb);
306
307         return 0;
308 }
309
310
311 /*
312  * Destructor to remove lock request
313  */
314 static int ctdb_lock_request_destructor(struct lock_request *lock_request)
315 {
316         if (lock_request->lctx == NULL) {
317                 return 0;
318         }
319
320         lock_request->lctx->request = NULL;
321         TALLOC_FREE(lock_request->lctx);
322
323         return 0;
324 }
325
326 /*
327  * Process all the callbacks waiting for lock
328  *
329  * If lock has failed, callback is executed with locked=false
330  */
331 static void process_callbacks(struct lock_context *lock_ctx, bool locked)
332 {
333         struct lock_request *request;
334         bool auto_mark = lock_ctx->auto_mark;
335
336         if (auto_mark && locked) {
337                 switch (lock_ctx->type) {
338                 case LOCK_RECORD:
339                         tdb_chainlock_mark(lock_ctx->ctdb_db->ltdb->tdb, lock_ctx->key);
340                         break;
341
342                 case LOCK_DB:
343                         tdb_lockall_mark(lock_ctx->ctdb_db->ltdb->tdb);
344                         break;
345
346                 case LOCK_ALLDB_PRIO:
347                         ctdb_lockall_mark_prio(lock_ctx->ctdb, lock_ctx->priority);
348                         break;
349
350                 case LOCK_ALLDB:
351                         ctdb_lockall_mark(lock_ctx->ctdb);
352                         break;
353                 }
354         }
355
356         request = lock_ctx->request;
357         if (auto_mark) {
358                 /* Since request may be freed in the callback, unset the lock
359                  * context, so request destructor will not free lock context.
360                  */
361                 request->lctx = NULL;
362         }
363
364         /* Since request may be freed in the callback, unset the request */
365         lock_ctx->request = NULL;
366
367         request->callback(request->private_data, locked);
368
369         if (!auto_mark) {
370                 return;
371         }
372
373         if (locked) {
374                 switch (lock_ctx->type) {
375                 case LOCK_RECORD:
376                         tdb_chainlock_unmark(lock_ctx->ctdb_db->ltdb->tdb, lock_ctx->key);
377                         break;
378
379                 case LOCK_DB:
380                         tdb_lockall_unmark(lock_ctx->ctdb_db->ltdb->tdb);
381                         break;
382
383                 case LOCK_ALLDB_PRIO:
384                         ctdb_lockall_unmark_prio(lock_ctx->ctdb, lock_ctx->priority);
385                         break;
386
387                 case LOCK_ALLDB:
388                         ctdb_lockall_unmark(lock_ctx->ctdb);
389                         break;
390                 }
391         }
392
393         talloc_free(lock_ctx);
394 }
395
396
/*
 * Map a lock latency (in seconds) to a histogram bucket index.
 *
 * Bucket i (0..9) collects times strictly below the i-th upper bound:
 * 1ms, 10ms, 100ms, 1s, 2s, 4s, 8s, 16s, 32s, 64s.  Everything else
 * goes into bucket 10.
 */
static int lock_bucket_id(double t)
{
	const double ms = 1.e-3, s = 1;
	const double upper_bound[] = {
		1*ms, 10*ms, 100*ms,
		1*s, 2*s, 4*s, 8*s, 16*s, 32*s, 64*s,
	};
	const int num_bounds = sizeof(upper_bound) / sizeof(upper_bound[0]);
	int id;

	/* Stop at the first bound that t is below; falls out at 10 */
	for (id = 0; id < num_bounds; id++) {
		if (t < upper_bound[id]) {
			break;
		}
	}

	return id;
}
428
429 /*
430  * Callback routine when the required locks are obtained.
431  * Called from parent context
432  */
433 static void ctdb_lock_handler(struct tevent_context *ev,
434                             struct tevent_fd *tfd,
435                             uint16_t flags,
436                             void *private_data)
437 {
438         struct lock_context *lock_ctx;
439         char c;
440         bool locked;
441         double t;
442         int id;
443
444         lock_ctx = talloc_get_type_abort(private_data, struct lock_context);
445
446         /* cancel the timeout event */
447         TALLOC_FREE(lock_ctx->ttimer);
448
449         t = timeval_elapsed(&lock_ctx->start_time);
450         id = lock_bucket_id(t);
451
452         /* Read the status from the child process */
453         if (sys_read(lock_ctx->fd[0], &c, 1) != 1) {
454                 locked = false;
455         } else {
456                 locked = (c == 0 ? true : false);
457         }
458
459         /* Update statistics */
460         CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_calls);
461         if (lock_ctx->ctdb_db) {
462                 CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_calls);
463         }
464
465         if (locked) {
466                 if (lock_ctx->ctdb_db) {
467                         CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.buckets[id]);
468                         CTDB_UPDATE_LATENCY(lock_ctx->ctdb, lock_ctx->ctdb_db,
469                                             lock_type_str[lock_ctx->type], locks.latency,
470                                             lock_ctx->start_time);
471
472                         CTDB_UPDATE_DB_LATENCY(lock_ctx->ctdb_db, lock_type_str[lock_ctx->type], locks.latency, t);
473                         CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.buckets[id]);
474                 }
475         } else {
476                 CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_failed);
477                 if (lock_ctx->ctdb_db) {
478                         CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_failed);
479                 }
480         }
481
482         process_callbacks(lock_ctx, locked);
483 }
484
485
486 /*
487  * Callback routine when required locks are not obtained within timeout
488  * Called from parent context
489  */
490 static void ctdb_lock_timeout_handler(struct tevent_context *ev,
491                                     struct tevent_timer *ttimer,
492                                     struct timeval current_time,
493                                     void *private_data)
494 {
495         static char debug_locks[PATH_MAX+1] = "";
496         struct lock_context *lock_ctx;
497         struct ctdb_context *ctdb;
498         pid_t pid;
499         double elapsed_time;
500         int new_timer;
501
502         lock_ctx = talloc_get_type_abort(private_data, struct lock_context);
503         ctdb = lock_ctx->ctdb;
504
505         /* If a node stopped/banned, don't spam the logs */
506         if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) {
507                 lock_ctx->ttimer = NULL;
508                 return;
509         }
510
511         elapsed_time = timeval_elapsed(&lock_ctx->start_time);
512         if (lock_ctx->ctdb_db) {
513                 DEBUG(DEBUG_WARNING,
514                       ("Unable to get %s lock on database %s for %.0lf seconds\n",
515                        (lock_ctx->type == LOCK_RECORD ? "RECORD" : "DB"),
516                        lock_ctx->ctdb_db->db_name, elapsed_time));
517         } else {
518                 DEBUG(DEBUG_WARNING,
519                       ("Unable to get ALLDB locks for %.0lf seconds\n",
520                        elapsed_time));
521         }
522
523         if (ctdb_set_helper("lock debugging helper",
524                             debug_locks, sizeof(debug_locks),
525                             "CTDB_DEBUG_LOCKS",
526                             getenv("CTDB_BASE"), "debug_locks.sh")) {
527                 pid = vfork();
528                 if (pid == 0) {
529                         execl(debug_locks, debug_locks, NULL);
530                         _exit(0);
531                 }
532                 ctdb_track_child(ctdb, pid);
533         } else {
534                 DEBUG(DEBUG_WARNING,
535                       (__location__
536                        " Unable to setup lock debugging\n"));
537         }
538
539         /* Back-off logging if lock is not obtained for a long time */
540         if (elapsed_time < 100.0) {
541                 new_timer = 10;
542         } else if (elapsed_time < 1000.0) {
543                 new_timer = 100;
544         } else {
545                 new_timer = 1000;
546         }
547
548         /* reset the timeout timer */
549         // talloc_free(lock_ctx->ttimer);
550         lock_ctx->ttimer = tevent_add_timer(ctdb->ev,
551                                             lock_ctx,
552                                             timeval_current_ofs(new_timer, 0),
553                                             ctdb_lock_timeout_handler,
554                                             (void *)lock_ctx);
555 }
556
557
/*
 * Iterator handler that adds 2 to the int counter passed as
 * private_data for every database visited.  Always returns 0.
 */
static int db_count_handler(struct ctdb_db_context *ctdb_db, void *private_data)
{
	int *counter = private_data;

	*counter = *counter + 2;

	return 0;
}
566
567 static int db_flags(struct ctdb_db_context *ctdb_db)
568 {
569         int tdb_flags = TDB_DEFAULT;
570
571 #ifdef TDB_MUTEX_LOCKING
572         if (!ctdb_db->persistent && ctdb_db->ctdb->tunable.mutex_enabled) {
573                 tdb_flags = (TDB_MUTEX_LOCKING | TDB_CLEAR_IF_FIRST);
574         }
575 #endif
576         return tdb_flags;
577 }
578
/* Cursor into an argument array that db_name_handler() appends to */
struct db_namelist {
	const char **names;	/* array being filled in */
	int n;			/* index of the next free slot */
};
583
584 static int db_name_handler(struct ctdb_db_context *ctdb_db, void *private_data)
585 {
586         struct db_namelist *list = (struct db_namelist *)private_data;
587
588         list->names[list->n] = talloc_strdup(list->names, ctdb_db->db_path);
589         list->names[list->n+1] = talloc_asprintf(list->names, "0x%x",
590                                                  db_flags(ctdb_db));
591         list->n += 2;
592
593         return 0;
594 }
595
596 static bool lock_helper_args(TALLOC_CTX *mem_ctx,
597                              struct lock_context *lock_ctx, int fd,
598                              int *argc, const char ***argv)
599 {
600         struct ctdb_context *ctdb = lock_ctx->ctdb;
601         const char **args = NULL;
602         int nargs, i;
603         int priority;
604         struct db_namelist list;
605
606         switch (lock_ctx->type) {
607         case LOCK_RECORD:
608                 nargs = 6;
609                 break;
610
611         case LOCK_DB:
612                 nargs = 5;
613                 break;
614
615         case LOCK_ALLDB_PRIO:
616                 nargs = 3;
617                 ctdb_db_iterator(ctdb, lock_ctx->priority, db_count_handler, &nargs);
618                 break;
619
620         case LOCK_ALLDB:
621                 nargs = 3;
622                 for (priority=1; priority<NUM_DB_PRIORITIES; priority++) {
623                         ctdb_db_iterator(ctdb, priority, db_count_handler, &nargs);
624                 }
625                 break;
626         }
627
628         /* Add extra argument for null termination */
629         nargs++;
630
631         args = talloc_array(mem_ctx, const char *, nargs);
632         if (args == NULL) {
633                 return false;
634         }
635
636         args[0] = talloc_asprintf(args, "%d", getpid());
637         args[1] = talloc_asprintf(args, "%d", fd);
638
639         switch (lock_ctx->type) {
640         case LOCK_RECORD:
641                 args[2] = talloc_strdup(args, "RECORD");
642                 args[3] = talloc_strdup(args, lock_ctx->ctdb_db->db_path);
643                 args[4] = talloc_asprintf(args, "0x%x",
644                                           db_flags(lock_ctx->ctdb_db));
645                 if (lock_ctx->key.dsize == 0) {
646                         args[5] = talloc_strdup(args, "NULL");
647                 } else {
648                         args[5] = hex_encode_talloc(args, lock_ctx->key.dptr, lock_ctx->key.dsize);
649                 }
650                 break;
651
652         case LOCK_DB:
653                 args[2] = talloc_strdup(args, "DB");
654                 args[3] = talloc_strdup(args, lock_ctx->ctdb_db->db_path);
655                 args[4] = talloc_asprintf(args, "0x%x",
656                                           db_flags(lock_ctx->ctdb_db));
657                 break;
658
659         case LOCK_ALLDB_PRIO:
660                 args[2] = talloc_strdup(args, "DB");
661                 list.names = args;
662                 list.n = 3;
663                 ctdb_db_iterator(ctdb, lock_ctx->priority, db_name_handler, &list);
664                 break;
665
666         case LOCK_ALLDB:
667                 args[2] = talloc_strdup(args, "DB");
668                 list.names = args;
669                 list.n = 3;
670                 for (priority=1; priority<NUM_DB_PRIORITIES; priority++) {
671                         ctdb_db_iterator(ctdb, priority, db_name_handler, &list);
672                 }
673                 break;
674         }
675
676         /* Make sure last argument is NULL */
677         args[nargs-1] = NULL;
678
679         for (i=0; i<nargs-1; i++) {
680                 if (args[i] == NULL) {
681                         talloc_free(args);
682                         return false;
683                 }
684         }
685
686         *argc = nargs;
687         *argv = args;
688         return true;
689 }
690
691 /*
692  * Find a lock request that can be scheduled
693  */
694 static struct lock_context *ctdb_find_lock_context(struct ctdb_context *ctdb)
695 {
696         struct lock_context *lock_ctx, *next_ctx;
697         struct ctdb_db_context *ctdb_db;
698
699         /* First check if there are database lock requests */
700
701         for (lock_ctx = ctdb->lock_pending; lock_ctx != NULL;
702              lock_ctx = next_ctx) {
703
704                 if (lock_ctx->request != NULL) {
705                         /* Found a lock context with a request */
706                         return lock_ctx;
707                 }
708
709                 next_ctx = lock_ctx->next;
710
711                 DEBUG(DEBUG_INFO, ("Removing lock context without lock "
712                                    "request\n"));
713                 DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
714                 CTDB_DECREMENT_STAT(ctdb, locks.num_pending);
715                 if (lock_ctx->ctdb_db) {
716                         CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db,
717                                                locks.num_pending);
718                 }
719                 talloc_free(lock_ctx);
720         }
721
722         /* Next check database queues */
723         for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
724                 if (ctdb_db->lock_num_current ==
725                     ctdb->tunable.lock_processes_per_db) {
726                         continue;
727                 }
728
729                 for (lock_ctx = ctdb_db->lock_pending; lock_ctx != NULL;
730                      lock_ctx = next_ctx) {
731
732                         next_ctx = lock_ctx->next;
733
734                         if (lock_ctx->request != NULL) {
735                                 return lock_ctx;
736                         }
737
738                         DEBUG(DEBUG_INFO, ("Removing lock context without "
739                                            "lock request\n"));
740                         DLIST_REMOVE(ctdb_db->lock_pending, lock_ctx);
741                         CTDB_DECREMENT_STAT(ctdb, locks.num_pending);
742                         CTDB_DECREMENT_DB_STAT(ctdb_db, locks.num_pending);
743                         talloc_free(lock_ctx);
744                 }
745         }
746
747         return NULL;
748 }
749
750 /*
751  * Schedule a new lock child process
752  * Set up callback handler and timeout handler
753  */
754 static void ctdb_lock_schedule(struct ctdb_context *ctdb)
755 {
756         struct lock_context *lock_ctx;
757         int ret, argc;
758         TALLOC_CTX *tmp_ctx;
759         static char prog[PATH_MAX+1] = "";
760         const char **args;
761
762         if (!ctdb_set_helper("lock helper",
763                              prog, sizeof(prog),
764                              "CTDB_LOCK_HELPER",
765                              CTDB_HELPER_BINDIR, "ctdb_lock_helper")) {
766                 ctdb_die(ctdb, __location__
767                          " Unable to set lock helper\n");
768         }
769
770         /* Find a lock context with requests */
771         lock_ctx = ctdb_find_lock_context(ctdb);
772         if (lock_ctx == NULL) {
773                 return;
774         }
775
776         lock_ctx->child = -1;
777         ret = pipe(lock_ctx->fd);
778         if (ret != 0) {
779                 DEBUG(DEBUG_ERR, ("Failed to create pipe in ctdb_lock_schedule\n"));
780                 return;
781         }
782
783         set_close_on_exec(lock_ctx->fd[0]);
784
785         /* Create data for child process */
786         tmp_ctx = talloc_new(lock_ctx);
787         if (tmp_ctx == NULL) {
788                 DEBUG(DEBUG_ERR, ("Failed to allocate memory for helper args\n"));
789                 close(lock_ctx->fd[0]);
790                 close(lock_ctx->fd[1]);
791                 return;
792         }
793
794         /* Create arguments for lock helper */
795         if (!lock_helper_args(tmp_ctx, lock_ctx, lock_ctx->fd[1],
796                               &argc, &args)) {
797                 DEBUG(DEBUG_ERR, ("Failed to create lock helper args\n"));
798                 close(lock_ctx->fd[0]);
799                 close(lock_ctx->fd[1]);
800                 talloc_free(tmp_ctx);
801                 return;
802         }
803
804         if (!ctdb_vfork_with_logging(lock_ctx, ctdb, "lock_helper",
805                                      prog, argc, (const char **)args,
806                                      NULL, NULL, &lock_ctx->child)) {
807                 DEBUG(DEBUG_ERR, ("Failed to create a child in ctdb_lock_schedule\n"));
808                 close(lock_ctx->fd[0]);
809                 close(lock_ctx->fd[1]);
810                 talloc_free(tmp_ctx);
811                 return;
812         }
813
814         /* Parent process */
815         close(lock_ctx->fd[1]);
816
817         talloc_free(tmp_ctx);
818
819         /* Set up timeout handler */
820         lock_ctx->ttimer = tevent_add_timer(ctdb->ev,
821                                             lock_ctx,
822                                             timeval_current_ofs(10, 0),
823                                             ctdb_lock_timeout_handler,
824                                             (void *)lock_ctx);
825         if (lock_ctx->ttimer == NULL) {
826                 ctdb_kill(ctdb, lock_ctx->child, SIGKILL);
827                 lock_ctx->child = -1;
828                 close(lock_ctx->fd[0]);
829                 return;
830         }
831
832         /* Set up callback */
833         lock_ctx->tfd = tevent_add_fd(ctdb->ev,
834                                       lock_ctx,
835                                       lock_ctx->fd[0],
836                                       EVENT_FD_READ,
837                                       ctdb_lock_handler,
838                                       (void *)lock_ctx);
839         if (lock_ctx->tfd == NULL) {
840                 TALLOC_FREE(lock_ctx->ttimer);
841                 ctdb_kill(ctdb, lock_ctx->child, SIGKILL);
842                 lock_ctx->child = -1;
843                 close(lock_ctx->fd[0]);
844                 return;
845         }
846         tevent_fd_set_auto_close(lock_ctx->tfd);
847
848         /* Move the context from pending to current */
849         if (lock_ctx->type == LOCK_RECORD) {
850                 DLIST_REMOVE(lock_ctx->ctdb_db->lock_pending, lock_ctx);
851                 DLIST_ADD_END(lock_ctx->ctdb_db->lock_current, lock_ctx, NULL);
852         } else {
853                 DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
854                 DLIST_ADD_END(ctdb->lock_current, lock_ctx, NULL);
855         }
856         CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
857         CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_current);
858         if (lock_ctx->ctdb_db) {
859                 lock_ctx->ctdb_db->lock_num_current++;
860                 CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
861                 CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
862         }
863 }
864
865
866 /*
867  * Lock record / db depending on type
868  */
869 static struct lock_request *ctdb_lock_internal(TALLOC_CTX *mem_ctx,
870                                                struct ctdb_context *ctdb,
871                                                struct ctdb_db_context *ctdb_db,
872                                                TDB_DATA key,
873                                                uint32_t priority,
874                                                void (*callback)(void *, bool),
875                                                void *private_data,
876                                                enum lock_type type,
877                                                bool auto_mark)
878 {
879         struct lock_context *lock_ctx = NULL;
880         struct lock_request *request;
881
882         if (callback == NULL) {
883                 DEBUG(DEBUG_WARNING, ("No callback function specified, not locking\n"));
884                 return NULL;
885         }
886
887         lock_ctx = talloc_zero(ctdb, struct lock_context);
888         if (lock_ctx == NULL) {
889                 DEBUG(DEBUG_ERR, ("Failed to create a new lock context\n"));
890                 return NULL;
891         }
892
893         if ((request = talloc_zero(mem_ctx, struct lock_request)) == NULL) {
894                 talloc_free(lock_ctx);
895                 return NULL;
896         }
897
898         lock_ctx->type = type;
899         lock_ctx->ctdb = ctdb;
900         lock_ctx->ctdb_db = ctdb_db;
901         lock_ctx->key.dsize = key.dsize;
902         if (key.dsize > 0) {
903                 lock_ctx->key.dptr = talloc_memdup(lock_ctx, key.dptr, key.dsize);
904                 if (lock_ctx->key.dptr == NULL) {
905                         DEBUG(DEBUG_ERR, (__location__ "Memory allocation error\n"));
906                         talloc_free(lock_ctx);
907                         talloc_free(request);
908                         return NULL;
909                 }
910                 lock_ctx->key_hash = ctdb_hash(&key);
911         } else {
912                 lock_ctx->key.dptr = NULL;
913         }
914         lock_ctx->priority = priority;
915         lock_ctx->auto_mark = auto_mark;
916
917         lock_ctx->request = request;
918         lock_ctx->child = -1;
919
920         /* Non-record locks are required by recovery and should be scheduled
921          * immediately, so keep them at the head of the pending queue.
922          */
923         if (lock_ctx->type == LOCK_RECORD) {
924                 DLIST_ADD_END(ctdb_db->lock_pending, lock_ctx, NULL);
925         } else {
926                 DLIST_ADD_END(ctdb->lock_pending, lock_ctx, NULL);
927         }
928         CTDB_INCREMENT_STAT(ctdb, locks.num_pending);
929         if (ctdb_db) {
930                 CTDB_INCREMENT_DB_STAT(ctdb_db, locks.num_pending);
931         }
932
933         /* Start the timer when we activate the context */
934         lock_ctx->start_time = timeval_current();
935
936         request->lctx = lock_ctx;
937         request->callback = callback;
938         request->private_data = private_data;
939
940         talloc_set_destructor(request, ctdb_lock_request_destructor);
941         talloc_set_destructor(lock_ctx, ctdb_lock_context_destructor);
942
943         ctdb_lock_schedule(ctdb);
944
945         return request;
946 }
947
948
949 /*
950  * obtain a lock on a record in a database
951  */
952 struct lock_request *ctdb_lock_record(TALLOC_CTX *mem_ctx,
953                                       struct ctdb_db_context *ctdb_db,
954                                       TDB_DATA key,
955                                       bool auto_mark,
956                                       void (*callback)(void *, bool),
957                                       void *private_data)
958 {
959         return ctdb_lock_internal(mem_ctx,
960                                   ctdb_db->ctdb,
961                                   ctdb_db,
962                                   key,
963                                   0,
964                                   callback,
965                                   private_data,
966                                   LOCK_RECORD,
967                                   auto_mark);
968 }
969
970
971 /*
972  * obtain a lock on a database
973  */
974 struct lock_request *ctdb_lock_db(TALLOC_CTX *mem_ctx,
975                                   struct ctdb_db_context *ctdb_db,
976                                   bool auto_mark,
977                                   void (*callback)(void *, bool),
978                                   void *private_data)
979 {
980         return ctdb_lock_internal(mem_ctx,
981                                   ctdb_db->ctdb,
982                                   ctdb_db,
983                                   tdb_null,
984                                   0,
985                                   callback,
986                                   private_data,
987                                   LOCK_DB,
988                                   auto_mark);
989 }
990
991
992 /*
993  * obtain locks on all databases of specified priority
994  */
995 struct lock_request *ctdb_lock_alldb_prio(TALLOC_CTX *mem_ctx,
996                                           struct ctdb_context *ctdb,
997                                           uint32_t priority,
998                                           bool auto_mark,
999                                           void (*callback)(void *, bool),
1000                                           void *private_data)
1001 {
1002         if (priority < 1 || priority > NUM_DB_PRIORITIES) {
1003                 DEBUG(DEBUG_ERR, ("Invalid db priority: %u\n", priority));
1004                 return NULL;
1005         }
1006
1007         return ctdb_lock_internal(mem_ctx,
1008                                   ctdb,
1009                                   NULL,
1010                                   tdb_null,
1011                                   priority,
1012                                   callback,
1013                                   private_data,
1014                                   LOCK_ALLDB_PRIO,
1015                                   auto_mark);
1016 }
1017
1018
1019 /*
1020  * obtain locks on all databases
1021  */
1022 struct lock_request *ctdb_lock_alldb(TALLOC_CTX *mem_ctx,
1023                                      struct ctdb_context *ctdb,
1024                                      bool auto_mark,
1025                                      void (*callback)(void *, bool),
1026                                      void *private_data)
1027 {
1028         return ctdb_lock_internal(mem_ctx,
1029                                   ctdb,
1030                                   NULL,
1031                                   tdb_null,
1032                                   0,
1033                                   callback,
1034                                   private_data,
1035                                   LOCK_ALLDB,
1036                                   auto_mark);
1037 }
1038