/*
   Unix SMB/CIFS implementation.

   trivial database library

   Copyright (C) Volker Lendecke 2012,2013
   Copyright (C) Stefan Metzmacher 2013,2014
   Copyright (C) Michael Adam 2014

     ** NOTE! The following LGPL license applies to the tdb
     ** library. This does NOT imply that all of Samba is released
     ** under the LGPL

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/

#include "tdb_private.h"
#include "system/threads.h"

#ifdef USE_TDB_MUTEX_LOCKING

/*
 * If we run with mutexes, we store the "struct tdb_mutexes" at the
 * beginning of the file. We store an additional tdb_header right
 * beyond the mutex area, page aligned. All the offsets within the tdb
 * are relative to the area behind the mutex area. tdb->map_ptr points
 * behind the mmap area as well, so the read and write path in the
 * mutex case can remain unchanged.
 *
 * Early in the mutex development the mutexes were placed between the hash
 * chain pointers and the real tdb data. This had two drawbacks: First, it
 * made pointer calculations more complex. Second, we had to mmap the mutex
 * area twice. One was the normal map_ptr in the tdb. This frequently changed
 * from within tdb_oob. At least the Linux glibc robust mutex code assumes
 * constant pointers in memory, so a constantly changing mmap area destroys
 * the mutex list. So we had to mmap the first bytes of the file with a second
 * mmap call. With that scheme, very weird errors happened that went away
 * when the mutex mmap was done on a second file. It seemed that mapping
 * the same memory area twice does not always access the same physical
 * page: looking at the mutexes in gdb, old data showed up after some
 * re-mapping. To avoid a separate mutex file, the code now puts the real
 * content of the tdb file after the mutex area. This way we do not have
 * overlapping mmap areas, the mutex area is mmapped once and not changed,
 * the tdb data area's mmap is constantly changed but does not overlap.
 */
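
/*
 * A worked example of the resulting layout (the numbers are
 * illustrative, not normative): with hash_size = 10000, a 40-byte
 * pthread_mutex_t and a 4096-byte page size, tdb_mutex_size() below
 * yields TDB_ALIGN(sizeof(struct tdb_mutexes) + 10000 * 40, 4096),
 * i.e. the mutex area occupies roughly the first 400 KiB of the file,
 * rounded up to a page boundary. A tdb-internal offset X then lives at
 * file offset tdb_mutex_size(tdb) + X, while tdb->map_ptr already
 * points behind the mutex area, so the read/write path uses X
 * unchanged.
 */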

struct tdb_mutexes {
	struct tdb_header hdr;

	/* protect allrecord_lock */
	pthread_mutex_t allrecord_mutex;

	/*
	 * F_UNLCK: free,
	 * F_RDLCK: shared,
	 * F_WRLCK: exclusive
	 */
	short int allrecord_lock;

	/*
	 * Index 0 is the freelist mutex, followed by
	 * one mutex per hashchain.
	 */
	pthread_mutex_t hashchains[1];
};
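
/*
 * Note that hashchains[1] is the pre-C99 flexible-array idiom: the
 * mmap established by tdb_mutex_mmap() provides room for
 * 1 + tdb->hash_size elements, so the mutex protecting hash chain h
 * is hashchains[h + 1], with hashchains[0] reserved for the freelist.
 */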

bool tdb_have_mutexes(struct tdb_context *tdb)
{
	return ((tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) != 0);
}

size_t tdb_mutex_size(struct tdb_context *tdb)
{
	size_t mutex_size;

	if (!tdb_have_mutexes(tdb)) {
		return 0;
	}

	mutex_size = sizeof(struct tdb_mutexes);
	mutex_size += tdb->hash_size * sizeof(pthread_mutex_t);

	return TDB_ALIGN(mutex_size, tdb->page_size);
}

/*
 * Get the index for a chain mutex
 */
static bool tdb_mutex_index(struct tdb_context *tdb, off_t off, off_t len,
			    unsigned *idx)
{
	/*
	 * Weird but true: We fcntl lock 1 byte at an offset 4 bytes before
	 * the 4 bytes of the freelist start and the hash chain that is about
	 * to be locked. See lock_offset() where the freelist is -1 vs the
	 * "+1" in TDB_HASH_TOP(). Because the mutex array is represented in
	 * the tdb file itself as data, we need to adjust the offset here.
	 */
	const off_t freelist_lock_ofs = FREELIST_TOP - sizeof(tdb_off_t);

	if (!tdb_have_mutexes(tdb)) {
		return false;
	}
	if (len != 1) {
		/* Possibly the allrecord lock */
		return false;
	}
	if (off < freelist_lock_ofs) {
		/* One of the special locks */
		return false;
	}
	if (tdb->hash_size == 0) {
		/* tdb not initialized yet, called from tdb_open_ex() */
		return false;
	}
	if (off >= TDB_DATA_START(tdb->hash_size)) {
		/* Single record lock from traverses */
		return false;
	}

	/*
	 * Now we know it's a freelist or hash chain lock. Those are always 4
	 * byte aligned. Paranoia check.
	 */
	if ((off % sizeof(tdb_off_t)) != 0) {
		abort();
	}

	/*
	 * Re-index the fcntl offset into an offset into the mutex array
	 */
	off -= freelist_lock_ofs; /* rebase to index 0 */
	off /= sizeof(tdb_off_t); /* 0 for freelist 1-n for hashchain */

	*idx = off;
	return true;
}
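
/*
 * Worked example of the re-indexing above, assuming a 4-byte
 * tdb_off_t: the freelist fcntl lock lives at FREELIST_TOP - 4 and
 * maps to idx 0; the lock for hash chain h lives 4 bytes before that
 * chain's pointer, i.e. at FREELIST_TOP + 4 * h, and maps to
 * idx h + 1. This matches the hashchains[] layout where slot 0 is the
 * freelist mutex.
 */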

static bool tdb_have_mutex_chainlocks(struct tdb_context *tdb)
{
	int i;

	for (i=0; i < tdb->num_lockrecs; i++) {
		bool ret;
		unsigned idx;

		ret = tdb_mutex_index(tdb,
				      tdb->lockrecs[i].off,
				      tdb->lockrecs[i].count,
				      &idx);
		if (!ret) {
			continue;
		}

		if (idx == 0) {
			/* this is the freelist mutex */
			continue;
		}

		return true;
	}

	return false;
}

static int chain_mutex_lock(pthread_mutex_t *m, bool waitflag)
{
	int ret;

	if (waitflag) {
		ret = pthread_mutex_lock(m);
	} else {
		ret = pthread_mutex_trylock(m);
	}
	if (ret != EOWNERDEAD) {
		return ret;
	}

	/*
	 * For chainlocks, we don't do any cleanup (yet?)
	 */
	return pthread_mutex_consistent(m);
}

static int allrecord_mutex_lock(struct tdb_mutexes *m, bool waitflag)
{
	int ret;

	if (waitflag) {
		ret = pthread_mutex_lock(&m->allrecord_mutex);
	} else {
		ret = pthread_mutex_trylock(&m->allrecord_mutex);
	}
	if (ret != EOWNERDEAD) {
		return ret;
	}

	/*
	 * The allrecord lock holder died. We need to reset the allrecord_lock
	 * to F_UNLCK. This should also be the indication for
	 * tdb_needs_recovery.
	 */
	m->allrecord_lock = F_UNLCK;

	return pthread_mutex_consistent(&m->allrecord_mutex);
}
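
/*
 * Both helpers above implement the standard robust-mutex recovery
 * pattern: a lock attempt returning EOWNERDEAD means the previous
 * owner died while holding the mutex; the caller now owns it but must
 * call pthread_mutex_consistent() before the mutex behaves normally
 * again. A minimal sketch of the pattern (error handling elided,
 * repair_shared_state() standing in for application-specific repair):
 *
 *	ret = pthread_mutex_lock(mu);
 *	if (ret == EOWNERDEAD) {
 *		repair_shared_state();
 *		ret = pthread_mutex_consistent(mu);
 *	}
 */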

bool tdb_mutex_lock(struct tdb_context *tdb, int rw, off_t off, off_t len,
		    bool waitflag, int *pret)
{
	struct tdb_mutexes *m = tdb->mutexes;
	pthread_mutex_t *chain;
	int ret;
	unsigned idx;
	bool allrecord_ok;

	if (!tdb_mutex_index(tdb, off, len, &idx)) {
		return false;
	}
	chain = &m->hashchains[idx];

again:
	ret = chain_mutex_lock(chain, waitflag);
	if (ret == EBUSY) {
		ret = EAGAIN;
	}
	if (ret != 0) {
		errno = ret;
		goto fail;
	}

	if (idx == 0) {
		/*
		 * This is a freelist lock, which is independent of
		 * the allrecord lock. So we're done once we got the
		 * freelist mutex.
		 */
		*pret = 0;
		return true;
	}

	if (tdb_have_mutex_chainlocks(tdb)) {
		/*
		 * We can only check the allrecord lock once. If we do it with
		 * one chain mutex locked, we will deadlock with the allrecord
		 * locker process in the following way: We lock the first hash
		 * chain, we check for the allrecord lock. We keep the hash
		 * chain locked. Then the allrecord locker locks the
		 * allrecord_mutex. It walks the list of chain mutexes,
		 * locking them all in sequence. Meanwhile, we have the chain
		 * mutex locked, so the allrecord locker blocks trying to lock
		 * our chain mutex. Then we come in and try to lock the second
		 * chain lock, which in most cases will be the freelist. We
		 * see that the allrecord lock is locked and put ourselves on
		 * the allrecord_mutex. This will never be signalled though
		 * because the allrecord locker waits for us to give up the
		 * chain lock.
		 */
		*pret = 0;
		return true;
	}

	/*
	 * Check if someone has the allrecord lock: queue if so.
	 */

	allrecord_ok = false;

	if (m->allrecord_lock == F_UNLCK) {
		/*
		 * allrecord lock not taken
		 */
		allrecord_ok = true;
	}

	if ((m->allrecord_lock == F_RDLCK) && (rw == F_RDLCK)) {
		/*
		 * allrecord shared lock taken, but we only want to read
		 */
		allrecord_ok = true;
	}

	if (allrecord_ok) {
		*pret = 0;
		return true;
	}

	ret = pthread_mutex_unlock(chain);
	if (ret != 0) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
			 "(chain_mutex) failed: %s\n", strerror(ret)));
		errno = ret;
		goto fail;
	}
	ret = allrecord_mutex_lock(m, waitflag);
	if (ret == EBUSY) {
		ret = EAGAIN;
	}
	if (ret != 0) {
		if (waitflag || (ret != EAGAIN)) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_%slock"
				 "(allrecord_mutex) failed: %s\n",
				 waitflag ? "" : "try_", strerror(ret)));
		}
		errno = ret;
		goto fail;
	}
	ret = pthread_mutex_unlock(&m->allrecord_mutex);
	if (ret != 0) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
			 "(allrecord_mutex) failed: %s\n", strerror(ret)));
		errno = ret;
		goto fail;
	}
	goto again;

fail:
	*pret = -1;
	return true;
}

bool tdb_mutex_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len,
		      int *pret)
{
	struct tdb_mutexes *m = tdb->mutexes;
	pthread_mutex_t *chain;
	int ret;
	unsigned idx;

	if (!tdb_mutex_index(tdb, off, len, &idx)) {
		return false;
	}
	chain = &m->hashchains[idx];

	ret = pthread_mutex_unlock(chain);
	if (ret == 0) {
		*pret = 0;
		return true;
	}
	errno = ret;
	*pret = -1;
	return true;
}

int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
			     enum tdb_lock_flags flags)
{
	struct tdb_mutexes *m = tdb->mutexes;
	int ret;
	uint32_t i;
	bool waitflag = (flags & TDB_LOCK_WAIT);

	if (tdb->flags & TDB_NOLOCK) {
		return 0;
	}

	if (flags & TDB_LOCK_MARK_ONLY) {
		return 0;
	}

	ret = allrecord_mutex_lock(m, waitflag);
	if (!waitflag && (ret == EBUSY)) {
		errno = EAGAIN;
		tdb->ecode = TDB_ERR_LOCK;
		return -1;
	}
	if (ret != 0) {
		if (!(flags & TDB_LOCK_PROBE)) {
			TDB_LOG((tdb, TDB_DEBUG_TRACE,
				 "allrecord_mutex_lock() failed: %s\n",
				 strerror(ret)));
		}
		tdb->ecode = TDB_ERR_LOCK;
		return -1;
	}

	if (m->allrecord_lock != F_UNLCK) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
			 (int)m->allrecord_lock));
		goto fail_unlock_allrecord_mutex;
	}
	m->allrecord_lock = (ltype == F_RDLCK) ? F_RDLCK : F_WRLCK;

	for (i=0; i<tdb->hash_size; i++) {

		/* ignore hashchains[0], the freelist */
		pthread_mutex_t *chain = &m->hashchains[i+1];

		ret = chain_mutex_lock(chain, waitflag);
		if (!waitflag && (ret == EBUSY)) {
			errno = EAGAIN;
			goto fail_unroll_allrecord_lock;
		}
		if (ret != 0) {
			if (!(flags & TDB_LOCK_PROBE)) {
				TDB_LOG((tdb, TDB_DEBUG_TRACE,
					 "chain_mutex_lock() failed: %s\n",
					 strerror(ret)));
			}
			errno = ret;
			goto fail_unroll_allrecord_lock;
		}

		ret = pthread_mutex_unlock(chain);
		if (ret != 0) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
				 "(chainlock) failed: %s\n", strerror(ret)));
			errno = ret;
			goto fail_unroll_allrecord_lock;
		}
	}
	/*
	 * We leave this routine with m->allrecord_mutex locked
	 */
	return 0;

fail_unroll_allrecord_lock:
	m->allrecord_lock = F_UNLCK;

fail_unlock_allrecord_mutex:
	ret = pthread_mutex_unlock(&m->allrecord_mutex);
	if (ret != 0) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
			 "(allrecord_mutex) failed: %s\n", strerror(ret)));
	}

	tdb->ecode = TDB_ERR_LOCK;
	return -1;
}
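
/*
 * One way to read the protocol above: the allrecord locker first
 * publishes its intent in m->allrecord_lock, then lock/unlock-cycles
 * every chain mutex exactly once. Any process already inside a chain
 * keeps the allrecord locker waiting on that chain's mutex; any
 * process arriving later sees m->allrecord_lock != F_UNLCK in
 * tdb_mutex_lock() and queues on m->allrecord_mutex instead. Once the
 * cycle completes, no chain holder from before the allrecord lock can
 * still be active.
 */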

int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb)
{
	struct tdb_mutexes *m = tdb->mutexes;
	int ret;
	uint32_t i;

	if (tdb->flags & TDB_NOLOCK) {
		return 0;
	}

	/*
	 * Our only caller tdb_allrecord_upgrade()
	 * guarantees that we already own the allrecord lock.
	 *
	 * Which means m->allrecord_mutex is still locked by us.
	 */

	if (m->allrecord_lock != F_RDLCK) {
		tdb->ecode = TDB_ERR_LOCK;
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
			 (int)m->allrecord_lock));
		return -1;
	}

	m->allrecord_lock = F_WRLCK;

	for (i=0; i<tdb->hash_size; i++) {

		/* ignore hashchains[0], the freelist */
		pthread_mutex_t *chain = &m->hashchains[i+1];

		ret = chain_mutex_lock(chain, true);
		if (ret != 0) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_lock"
				 "(chainlock) failed: %s\n", strerror(ret)));
			goto fail_unroll_allrecord_lock;
		}

		ret = pthread_mutex_unlock(chain);
		if (ret != 0) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
				 "(chainlock) failed: %s\n", strerror(ret)));
			goto fail_unroll_allrecord_lock;
		}
	}

	return 0;

fail_unroll_allrecord_lock:
	m->allrecord_lock = F_RDLCK;
	tdb->ecode = TDB_ERR_LOCK;
	return -1;
}

void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb)
{
	struct tdb_mutexes *m = tdb->mutexes;

	/*
	 * Our only caller tdb_allrecord_upgrade() (in the error case)
	 * guarantees that we already own the allrecord lock.
	 *
	 * Which means m->allrecord_mutex is still locked by us.
	 */

	if (m->allrecord_lock != F_WRLCK) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
			 (int)m->allrecord_lock));
		return;
	}

	m->allrecord_lock = F_RDLCK;
	return;
}

int tdb_mutex_allrecord_unlock(struct tdb_context *tdb)
{
	struct tdb_mutexes *m = tdb->mutexes;
	short old;
	int ret;

	if (tdb->flags & TDB_NOLOCK) {
		return 0;
	}

	/*
	 * Our only callers tdb_allrecord_unlock() and
	 * tdb_allrecord_lock() (in the error path)
	 * guarantee that we already own the allrecord lock.
	 *
	 * Which means m->allrecord_mutex is still locked by us.
	 */

	if ((m->allrecord_lock != F_RDLCK) && (m->allrecord_lock != F_WRLCK)) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
			 (int)m->allrecord_lock));
		return -1;
	}

	old = m->allrecord_lock;
	m->allrecord_lock = F_UNLCK;

	ret = pthread_mutex_unlock(&m->allrecord_mutex);
	if (ret != 0) {
		m->allrecord_lock = old;
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
			 "(allrecord_mutex) failed: %s\n", strerror(ret)));
		return -1;
	}

	return 0;
}

int tdb_mutex_init(struct tdb_context *tdb)
{
	struct tdb_mutexes *m;
	pthread_mutexattr_t ma;
	uint32_t i;
	int ret;

	ret = tdb_mutex_mmap(tdb);
	if (ret == -1) {
		return -1;
	}
	m = tdb->mutexes;

	ret = pthread_mutexattr_init(&ma);
	if (ret != 0) {
		goto fail_munmap;
	}
	ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
	if (ret != 0) {
		goto fail;
	}
	ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
	if (ret != 0) {
		goto fail;
	}
	ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
	if (ret != 0) {
		goto fail;
	}

	for (i=0; i<tdb->hash_size+1; i++) {
		pthread_mutex_t *chain = &m->hashchains[i];

		ret = pthread_mutex_init(chain, &ma);
		if (ret != 0) {
			goto fail;
		}
	}

	m->allrecord_lock = F_UNLCK;

	ret = pthread_mutex_init(&m->allrecord_mutex, &ma);
	if (ret != 0) {
		goto fail;
	}
	ret = 0;
fail:
	pthread_mutexattr_destroy(&ma);
fail_munmap:
	if (ret == 0) {
		return 0;
	}

	tdb_mutex_munmap(tdb);

	errno = ret;
	return -1;
}
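
/*
 * Design note on the attribute combination used above:
 * PTHREAD_PROCESS_SHARED is what allows the mutexes to live in a
 * file-backed mmap shared between processes, PTHREAD_MUTEX_ROBUST
 * makes a lock recoverable when its holder dies (the EOWNERDEAD
 * handling in chain_mutex_lock()), and PTHREAD_MUTEX_ERRORCHECK turns
 * relock/unlock misuse into error returns instead of undefined
 * behavior.
 */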

int tdb_mutex_mmap(struct tdb_context *tdb)
{
	size_t len;
	void *ptr;

	len = tdb_mutex_size(tdb);
	if (len == 0) {
		return 0;
	}

	ptr = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FILE,
		   tdb->fd, 0);
	if (ptr == MAP_FAILED) {
		return -1;
	}
	tdb->mutexes = (struct tdb_mutexes *)ptr;

	return 0;
}

int tdb_mutex_munmap(struct tdb_context *tdb)
{
	size_t len;

	len = tdb_mutex_size(tdb);
	if (len == 0) {
		return 0;
	}

	return munmap(tdb->mutexes, len);
}

static bool tdb_mutex_locking_cached;

static bool tdb_mutex_locking_supported(void)
{
	pthread_mutexattr_t ma;
	pthread_mutex_t m;
	int ret;
	static bool initialized;

	if (initialized) {
		return tdb_mutex_locking_cached;
	}

	initialized = true;

	ret = pthread_mutexattr_init(&ma);
	if (ret != 0) {
		return false;
	}
	ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
	if (ret != 0) {
		goto cleanup_ma;
	}
	ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
	if (ret != 0) {
		goto cleanup_ma;
	}
	ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
	if (ret != 0) {
		goto cleanup_ma;
	}
	ret = pthread_mutex_init(&m, &ma);
	if (ret != 0) {
		goto cleanup_ma;
	}
	ret = pthread_mutex_lock(&m);
	if (ret != 0) {
		goto cleanup_m;
	}
	/*
	 * This makes sure we have real mutexes
	 * from a threading library instead of just
	 * stubs from libc.
	 */
	ret = pthread_mutex_lock(&m);
	if (ret != EDEADLK) {
		goto cleanup_lock;
	}
	ret = pthread_mutex_unlock(&m);
	if (ret != 0) {
		goto cleanup_m;
	}

	tdb_mutex_locking_cached = true;
	goto cleanup_m;

cleanup_lock:
	pthread_mutex_unlock(&m);
cleanup_m:
	pthread_mutex_destroy(&m);
cleanup_ma:
	pthread_mutexattr_destroy(&ma);
	return tdb_mutex_locking_cached;
}

static void (*tdb_robust_mutext_old_handler)(int) = SIG_ERR;
static pid_t tdb_robust_mutex_pid = -1;

static bool tdb_robust_mutex_setup_sigchild(void (*handler)(int),
			void (**p_old_handler)(int))
{
#ifdef HAVE_SIGACTION
	struct sigaction act;
	struct sigaction oldact;

	memset(&act, '\0', sizeof(act));

	act.sa_handler = handler;
#ifdef SA_RESTART
	act.sa_flags = SA_RESTART;
#endif
	sigemptyset(&act.sa_mask);
	sigaddset(&act.sa_mask, SIGCHLD);
	sigaction(SIGCHLD, &act, &oldact);
	if (p_old_handler) {
		*p_old_handler = oldact.sa_handler;
	}
	return true;
#else /* !HAVE_SIGACTION */
	return false;
#endif
}

static void tdb_robust_mutex_handler(int sig)
{
	if (tdb_robust_mutex_pid != -1) {
		pid_t pid;
		int status;

		pid = waitpid(tdb_robust_mutex_pid, &status, WNOHANG);
		if (pid == tdb_robust_mutex_pid) {
			tdb_robust_mutex_pid = -1;
			return;
		}
	}

	if (tdb_robust_mutext_old_handler == SIG_DFL) {
		return;
	}
	if (tdb_robust_mutext_old_handler == SIG_IGN) {
		return;
	}
	if (tdb_robust_mutext_old_handler == SIG_ERR) {
		return;
	}

	tdb_robust_mutext_old_handler(sig);
}

_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void)
{
	void *ptr = NULL;
	pthread_mutex_t *m = NULL;
	pthread_mutexattr_t ma;
	int ret;
	int pipe_down[2] = { -1, -1 };
	int pipe_up[2] = { -1, -1 };
	ssize_t nread;
	char c = 0;
	bool ok;
	static bool initialized;
	sigset_t mask, old_mask, suspend_mask;
	bool cleanup_ma = false;
	bool cleanup_sigmask = false;

	if (initialized) {
		return tdb_mutex_locking_cached;
	}

	initialized = true;

	ok = tdb_mutex_locking_supported();
	if (!ok) {
		return false;
	}

	tdb_mutex_locking_cached = false;

	ptr = mmap(NULL, sizeof(pthread_mutex_t), PROT_READ|PROT_WRITE,
		   MAP_SHARED|MAP_ANON, -1 /* fd */, 0);
	if (ptr == MAP_FAILED) {
		return false;
	}

	ret = pipe(pipe_down);
	if (ret != 0) {
		goto cleanup;
	}
	ret = pipe(pipe_up);
	if (ret != 0) {
		goto cleanup;
	}

	ret = pthread_mutexattr_init(&ma);
	if (ret != 0) {
		goto cleanup;
	}
	cleanup_ma = true;
	ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
	if (ret != 0) {
		goto cleanup;
	}
	ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
	if (ret != 0) {
		goto cleanup;
	}
	ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
	if (ret != 0) {
		goto cleanup;
	}
	ret = pthread_mutex_init(ptr, &ma);
	if (ret != 0) {
		goto cleanup;
	}
	m = (pthread_mutex_t *)ptr;

	/*
	 * Block SIGCHLD so we can atomically wait for it later with
	 * sigsuspend()
	 */
	sigemptyset(&mask);
	sigaddset(&mask, SIGCHLD);
	ret = pthread_sigmask(SIG_BLOCK, &mask, &old_mask);
	if (ret != 0) {
		goto cleanup;
	}
	cleanup_sigmask = true;
	suspend_mask = old_mask;
	sigdelset(&suspend_mask, SIGCHLD);

	if (tdb_robust_mutex_setup_sigchild(tdb_robust_mutex_handler,
			&tdb_robust_mutext_old_handler) == false) {
		goto cleanup;
	}

	tdb_robust_mutex_pid = fork();
	if (tdb_robust_mutex_pid == 0) {
		size_t nwritten;
		close(pipe_down[1]);
		close(pipe_up[0]);
		ret = pthread_mutex_lock(m);
		nwritten = write(pipe_up[1], &ret, sizeof(ret));
		if (nwritten != sizeof(ret)) {
			_exit(1);
		}
		if (ret != 0) {
			_exit(1);
		}
		nread = read(pipe_down[0], &c, 1);
		if (nread != 1) {
			_exit(1);
		}
		/* leave locked */
		_exit(0);
	}
	if (tdb_robust_mutex_pid == -1) {
		goto cleanup;
	}
	close(pipe_down[0]);
	pipe_down[0] = -1;
	close(pipe_up[1]);
	pipe_up[1] = -1;

	nread = read(pipe_up[0], &ret, sizeof(ret));
	if (nread != sizeof(ret)) {
		goto cleanup;
	}

	ret = pthread_mutex_trylock(m);
	if (ret != EBUSY) {
		if (ret == 0) {
			pthread_mutex_unlock(m);
		}
		goto cleanup;
	}

	if (write(pipe_down[1], &c, 1) != 1) {
		goto cleanup;
	}

	nread = read(pipe_up[0], &c, 1);
	if (nread != 0) {
		goto cleanup;
	}

	while (tdb_robust_mutex_pid > 0) {
		ret = sigsuspend(&suspend_mask);
		if (ret != -1 || errno != EINTR) {
			abort();
		}
	}
	tdb_robust_mutex_setup_sigchild(tdb_robust_mutext_old_handler, NULL);
	tdb_robust_mutext_old_handler = SIG_ERR;

	ret = pthread_mutex_trylock(m);
	if (ret != EOWNERDEAD) {
		if (ret == 0) {
			pthread_mutex_unlock(m);
		}
		goto cleanup;
	}

	ret = pthread_mutex_consistent(m);
	if (ret != 0) {
		goto cleanup;
	}

	ret = pthread_mutex_trylock(m);
	if (ret != EDEADLK) {
		pthread_mutex_unlock(m);
		goto cleanup;
	}

	ret = pthread_mutex_unlock(m);
	if (ret != 0) {
		goto cleanup;
	}

	tdb_mutex_locking_cached = true;

cleanup:
	while (tdb_robust_mutex_pid > 0) {
		kill(tdb_robust_mutex_pid, SIGKILL);
		ret = sigsuspend(&suspend_mask);
		if (ret != -1 || errno != EINTR) {
			abort();
		}
	}

	if (tdb_robust_mutext_old_handler != SIG_ERR) {
		tdb_robust_mutex_setup_sigchild(tdb_robust_mutext_old_handler,
						NULL);
	}

	if (cleanup_sigmask) {
		ret = pthread_sigmask(SIG_SETMASK, &old_mask, NULL);
		if (ret != 0) {
			abort();
		}
	}

	if (m != NULL) {
		pthread_mutex_destroy(m);
	}
	if (cleanup_ma) {
		pthread_mutexattr_destroy(&ma);
	}
	if (pipe_down[0] != -1) {
		close(pipe_down[0]);
	}
	if (pipe_down[1] != -1) {
		close(pipe_down[1]);
	}
	if (pipe_up[0] != -1) {
		close(pipe_up[0]);
	}
	if (pipe_up[1] != -1) {
		close(pipe_up[1]);
	}
	if (ptr != NULL) {
		munmap(ptr, sizeof(pthread_mutex_t));
	}

	return tdb_mutex_locking_cached;
}
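
/*
 * Typical caller pattern (a sketch, not code from this file; log_ctx
 * is assumed to be set up elsewhere): mutex locking is only enabled
 * when this runtime check passes, e.g.
 *
 *	int tdb_flags = TDB_CLEAR_IF_FIRST;
 *
 *	if (tdb_runtime_check_for_robust_mutexes()) {
 *		tdb_flags |= TDB_MUTEX_LOCKING;
 *	}
 *	tdb = tdb_open_ex(name, hash_size, tdb_flags,
 *			  O_RDWR|O_CREAT, 0644, &log_ctx, NULL);
 *
 * TDB_MUTEX_LOCKING is only valid together with TDB_CLEAR_IF_FIRST,
 * since the mutex area has to be reinitialized by the first opener.
 */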

#else

size_t tdb_mutex_size(struct tdb_context *tdb)
{
	return 0;
}

bool tdb_have_mutexes(struct tdb_context *tdb)
{
	return false;
}

int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
			     enum tdb_lock_flags flags)
{
	tdb->ecode = TDB_ERR_LOCK;
	return -1;
}

int tdb_mutex_allrecord_unlock(struct tdb_context *tdb)
{
	return -1;
}

int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb)
{
	tdb->ecode = TDB_ERR_LOCK;
	return -1;
}

void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb)
{
	return;
}

int tdb_mutex_mmap(struct tdb_context *tdb)
{
	errno = ENOSYS;
	return -1;
}

int tdb_mutex_munmap(struct tdb_context *tdb)
{
	errno = ENOSYS;
	return -1;
}

int tdb_mutex_init(struct tdb_context *tdb)
{
	errno = ENOSYS;
	return -1;
}

_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void)
{
	return false;
}

#endif