lib/tdb/common/mutex.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Volker Lendecke 2012,2013
   7    Copyright (C) Stefan Metzmacher 2013,2014
   8    Copyright (C) Michael Adam 2014
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27 #include "tdb_private.h"
  28 #include "system/threads.h"
  29
  30 #ifdef USE_TDB_MUTEX_LOCKING
  31
  32 /*
  33  * If we run with mutexes, we store the "struct tdb_mutexes" at the
  34  * beginning of the file. We store an additional tdb_header right
  35  * beyond the mutex area, page aligned. All the offsets within the tdb
  36  * are relative to the area behind the mutex area. tdb->map_ptr points
  37  * behind the mmap area as well, so the read and write path in the
  38  * mutex case can remain unchanged.
  39  *
  40  * Early in the mutex development the mutexes were placed between the hash
  41  * chain pointers and the real tdb data. This had two drawbacks: First, it
  42  * made pointer calculations more complex. Second, we had to mmap the mutex
  43  * area twice. One was the normal map_ptr in the tdb. This frequently changed
  44  * from within tdb_oob. At least the Linux glibc robust mutex code assumes
  45  * constant pointers in memory, so a constantly changing mmap area destroys
  46  * the mutex list. So we had to mmap the first bytes of the file with a second
  47  * mmap call. With that scheme, very weird errors happened that could be
  48  * easily fixed by doing the mutex mmap in a second file. It seemed that
  49  * mapping the same memory area twice does not end up in accessing the same
  50  * physical page, looking at the mutexes in gdb it seemed that old data showed
  51  * up after some re-mapping. To avoid a separate mutex file, the code now puts
  52  * the real content of the tdb file after the mutex area. This way we do not
  53  * have overlapping mmap areas, the mutex area is mmapped once and not
  54  * changed, the tdb data area's mmap is constantly changed but does not
  55  * overlap.
  56  */
  57
  58 struct tdb_mutexes {
  59         struct tdb_header hdr;
  60
  61         /* protect allrecord_lock */
  62         pthread_mutex_t allrecord_mutex;
  63
  64         /*
  65          * F_UNLCK: free,
  66          * F_RDLCK: shared,
  67          * F_WRLCK: exclusive
  68          */
  69         short int allrecord_lock;
  70
  71         /*
  72          * Index 0 is the freelist mutex, followed by
  73          * one mutex per hashchain.
  74          */
  75         pthread_mutex_t hashchains[1];
  76 };
  77
  78 bool tdb_have_mutexes(struct tdb_context *tdb)
  79 {
  80         return ((tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) != 0);
  81 }
  82
  83 size_t tdb_mutex_size(struct tdb_context *tdb)
  84 {
  85         size_t mutex_size;
  86
  87         if (!tdb_have_mutexes(tdb)) {
  88                 return 0;
  89         }
  90
  91         mutex_size = sizeof(struct tdb_mutexes);
  92         mutex_size += tdb->hash_size * sizeof(pthread_mutex_t);
  93
  94         return TDB_ALIGN(mutex_size, tdb->page_size);
  95 }
  96
  97 /*
  98  * Get the index for a chain mutex
  99  */
 100 static bool tdb_mutex_index(struct tdb_context *tdb, off_t off, off_t len,
 101                             unsigned *idx)
 102 {
 103         /*
 104          * Weird but true: We fcntl lock 1 byte at an offset 4 bytes before
 105          * the 4 bytes of the freelist start and the hash chain that is about
 106          * to be locked. See lock_offset() where the freelist is -1 vs the
 107          * "+1" in TDB_HASH_TOP(). Because the mutex array is represented in
 108          * the tdb file itself as data, we need to adjust the offset here.
 109          */
 110         const off_t freelist_lock_ofs = FREELIST_TOP - sizeof(tdb_off_t);
 111
 112         if (!tdb_have_mutexes(tdb)) {
 113                 return false;
 114         }
 115         if (len != 1) {
 116                 /* Possibly the allrecord lock */
 117                 return false;
 118         }
 119         if (off < freelist_lock_ofs) {
 120                 /* One of the special locks */
 121                 return false;
 122         }
 123         if (tdb->hash_size == 0) {
 124                 /* tdb not initialized yet, called from tdb_open_ex() */
 125                 return false;
 126         }
 127         if (off >= TDB_DATA_START(tdb->hash_size)) {
 128                 /* Single record lock from traverses */
 129                 return false;
 130         }
 131
 132         /*
 133          * Now we know it's a freelist or hash chain lock. Those are always 4
 134          * byte aligned. Paranoia check.
 135          */
 136         if ((off % sizeof(tdb_off_t)) != 0) {
 137                 abort();
 138         }
 139
 140         /*
 141          * Re-index the fcntl offset into an offset into the mutex array
 142          */
 143         off -= freelist_lock_ofs; /* rebase to index 0 */
 144         off /= sizeof(tdb_off_t); /* 0 for freelist 1-n for hashchain */
 145
 146         *idx = off;
 147         return true;
 148 }
 149
 150 static bool tdb_have_mutex_chainlocks(struct tdb_context *tdb)
 151 {
 152         size_t i;
 153
 154         for (i=0; i < tdb->num_lockrecs; i++) {
 155                 bool ret;
 156                 unsigned idx;
 157
 158                 ret = tdb_mutex_index(tdb,
 159                                       tdb->lockrecs[i].off,
 160                                       tdb->lockrecs[i].count,
 161                                       &idx);
 162                 if (!ret) {
 163                         continue;
 164                 }
 165
 166                 if (idx == 0) {
 167                         /* this is the freelist mutex */
 168                         continue;
 169                 }
 170
 171                 return true;
 172         }
 173
 174         return false;
 175 }
 176
 177 static int chain_mutex_lock(pthread_mutex_t *m, bool waitflag)
 178 {
 179         int ret;
 180
 181         if (waitflag) {
 182                 ret = pthread_mutex_lock(m);
 183         } else {
 184                 ret = pthread_mutex_trylock(m);
 185         }
 186         if (ret != EOWNERDEAD) {
 187                 return ret;
 188         }
 189
 190         /*
 191          * For chainlocks, we don't do any cleanup (yet?)
 192          */
 193         return pthread_mutex_consistent(m);
 194 }
 195
 196 static int allrecord_mutex_lock(struct tdb_mutexes *m, bool waitflag)
 197 {
 198         int ret;
 199
 200         if (waitflag) {
 201                 ret = pthread_mutex_lock(&m->allrecord_mutex);
 202         } else {
 203                 ret = pthread_mutex_trylock(&m->allrecord_mutex);
 204         }
 205         if (ret != EOWNERDEAD) {
 206                 return ret;
 207         }
 208
 209         /*
 210          * The allrecord lock holder died. We need to reset the allrecord_lock
 211          * to F_UNLCK. This should also be the indication for
 212          * tdb_needs_recovery.
 213          */
 214         m->allrecord_lock = F_UNLCK;
 215
 216         return pthread_mutex_consistent(&m->allrecord_mutex);
 217 }
 218
 219 bool tdb_mutex_lock(struct tdb_context *tdb, int rw, off_t off, off_t len,
 220                     bool waitflag, int *pret)
 221 {
 222         struct tdb_mutexes *m = tdb->mutexes;
 223         pthread_mutex_t *chain;
 224         int ret;
 225         unsigned idx;
 226         bool allrecord_ok;
 227
 228         if (!tdb_mutex_index(tdb, off, len, &idx)) {
 229                 return false;
 230         }
 231         chain = &m->hashchains[idx];
 232
 233 again:
 234         ret = chain_mutex_lock(chain, waitflag);
 235         if (ret == EBUSY) {
 236                 ret = EAGAIN;
 237         }
 238         if (ret != 0) {
 239                 errno = ret;
 240                 goto fail;
 241         }
 242
 243         if (idx == 0) {
 244                 /*
 245                  * This is a freelist lock, which is independent to
 246                  * the allrecord lock. So we're done once we got the
 247                  * freelist mutex.
 248                  */
 249                 *pret = 0;
 250                 return true;
 251         }
 252
 253         if (tdb_have_mutex_chainlocks(tdb)) {
 254                 /*
 255                  * We can only check the allrecord lock once. If we do it with
 256                  * one chain mutex locked, we will deadlock with the allrecord
 257                  * locker process in the following way: We lock the first hash
 258                  * chain, we check for the allrecord lock. We keep the hash
 259                  * chain locked. Then the allrecord locker locks the
 260                  * allrecord_mutex. It walks the list of chain mutexes,
 261                  * locking them all in sequence. Meanwhile, we have the chain
 262                  * mutex locked, so the allrecord locker blocks trying to lock
 263                  * our chain mutex. Then we come in and try to lock the second
 264                  * chain lock, which in most cases will be the freelist. We
 265                  * see that the allrecord lock is locked and put ourselves on
 266                  * the allrecord_mutex. This will never be signalled though
 267                  * because the allrecord locker waits for us to give up the
 268                  * chain lock.
 269                  */
 270
 271                 *pret = 0;
 272                 return true;
 273         }
 274
 275         /*
 276          * Check if someone is has the allrecord lock: queue if so.
 277          */
 278
 279         allrecord_ok = false;
 280
 281         if (m->allrecord_lock == F_UNLCK) {
 282                 /*
 283                  * allrecord lock not taken
 284                  */
 285                 allrecord_ok = true;
 286         }
 287
 288         if ((m->allrecord_lock == F_RDLCK) && (rw == F_RDLCK)) {
 289                 /*
 290                  * allrecord shared lock taken, but we only want to read
 291                  */
 292                 allrecord_ok = true;
 293         }
 294
 295         if (allrecord_ok) {
 296                 *pret = 0;
 297                 return true;
 298         }
 299
 300         ret = pthread_mutex_unlock(chain);
 301         if (ret != 0) {
 302                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
 303                          "(chain_mutex) failed: %s\n", strerror(ret)));
 304                 errno = ret;
 305                 goto fail;
 306         }
 307         ret = allrecord_mutex_lock(m, waitflag);
 308         if (ret == EBUSY) {
 309                 ret = EAGAIN;
 310         }
 311         if (ret != 0) {
 312                 if (waitflag || (ret != EAGAIN)) {
 313                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_%slock"
 314                                  "(allrecord_mutex) failed: %s\n",
 315                                  waitflag ? "" : "try_",  strerror(ret)));
 316                 }
 317                 errno = ret;
 318                 goto fail;
 319         }
 320         ret = pthread_mutex_unlock(&m->allrecord_mutex);
 321         if (ret != 0) {
 322                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
 323                          "(allrecord_mutex) failed: %s\n", strerror(ret)));
 324                 errno = ret;
 325                 goto fail;
 326         }
 327         goto again;
 328
 329 fail:
 330         *pret = -1;
 331         return true;
 332 }
 333
 334 bool tdb_mutex_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len,
 335                       int *pret)
 336 {
 337         struct tdb_mutexes *m = tdb->mutexes;
 338         pthread_mutex_t *chain;
 339         int ret;
 340         unsigned idx;
 341
 342         if (!tdb_mutex_index(tdb, off, len, &idx)) {
 343                 return false;
 344         }
 345         chain = &m->hashchains[idx];
 346
 347         ret = pthread_mutex_unlock(chain);
 348         if (ret == 0) {
 349                 *pret = 0;
 350                 return true;
 351         }
 352         errno = ret;
 353         *pret = -1;
 354         return true;
 355 }
 356
 357 int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
 358                              enum tdb_lock_flags flags)
 359 {
 360         struct tdb_mutexes *m = tdb->mutexes;
 361         int ret;
 362         uint32_t i;
 363         bool waitflag = (flags & TDB_LOCK_WAIT);
 364         int saved_errno;
 365
 366         if (tdb->flags & TDB_NOLOCK) {
 367                 return 0;
 368         }
 369
 370         if (flags & TDB_LOCK_MARK_ONLY) {
 371                 return 0;
 372         }
 373
 374         ret = allrecord_mutex_lock(m, waitflag);
 375         if (!waitflag && (ret == EBUSY)) {
 376                 errno = EAGAIN;
 377                 tdb->ecode = TDB_ERR_LOCK;
 378                 return -1;
 379         }
 380         if (ret != 0) {
 381                 if (!(flags & TDB_LOCK_PROBE)) {
 382                         TDB_LOG((tdb, TDB_DEBUG_TRACE,
 383                                  "allrecord_mutex_lock() failed: %s\n",
 384                                  strerror(ret)));
 385                 }
 386                 tdb->ecode = TDB_ERR_LOCK;
 387                 return -1;
 388         }
 389
 390         if (m->allrecord_lock != F_UNLCK) {
 391                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
 392                          (int)m->allrecord_lock));
 393                 goto fail_unlock_allrecord_mutex;
 394         }
 395         m->allrecord_lock = (ltype == F_RDLCK) ? F_RDLCK : F_WRLCK;
 396
 397         for (i=0; i<tdb->hash_size; i++) {
 398
 399                 /* ignore hashchains[0], the freelist */
 400                 pthread_mutex_t *chain = &m->hashchains[i+1];
 401
 402                 ret = chain_mutex_lock(chain, waitflag);
 403                 if (!waitflag && (ret == EBUSY)) {
 404                         errno = EAGAIN;
 405                         goto fail_unroll_allrecord_lock;
 406                 }
 407                 if (ret != 0) {
 408                         if (!(flags & TDB_LOCK_PROBE)) {
 409                                 TDB_LOG((tdb, TDB_DEBUG_TRACE,
 410                                          "chain_mutex_lock() failed: %s\n",
 411                                          strerror(ret)));
 412                         }
 413                         errno = ret;
 414                         goto fail_unroll_allrecord_lock;
 415                 }
 416
 417                 ret = pthread_mutex_unlock(chain);
 418                 if (ret != 0) {
 419                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
 420                                  "(chainlock) failed: %s\n", strerror(ret)));
 421                         errno = ret;
 422                         goto fail_unroll_allrecord_lock;
 423                 }
 424         }
 425         /*
 426          * We leave this routine with m->allrecord_mutex locked
 427          */
 428         return 0;
 429
 430 fail_unroll_allrecord_lock:
 431         m->allrecord_lock = F_UNLCK;
 432
 433 fail_unlock_allrecord_mutex:
 434         saved_errno = errno;
 435         ret = pthread_mutex_unlock(&m->allrecord_mutex);
 436         if (ret != 0) {
 437                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
 438                          "(allrecord_mutex) failed: %s\n", strerror(ret)));
 439         }
 440         errno = saved_errno;
 441         tdb->ecode = TDB_ERR_LOCK;
 442         return -1;
 443 }
 444
 445 int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb)
 446 {
 447         struct tdb_mutexes *m = tdb->mutexes;
 448         int ret;
 449         uint32_t i;
 450
 451         if (tdb->flags & TDB_NOLOCK) {
 452                 return 0;
 453         }
 454
 455         /*
 456          * Our only caller tdb_allrecord_upgrade()
 457          * garantees that we already own the allrecord lock.
 458          *
 459          * Which means m->allrecord_mutex is still locked by us.
 460          */
 461
 462         if (m->allrecord_lock != F_RDLCK) {
 463                 tdb->ecode = TDB_ERR_LOCK;
 464                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
 465                          (int)m->allrecord_lock));
 466                 return -1;
 467         }
 468
 469         m->allrecord_lock = F_WRLCK;
 470
 471         for (i=0; i<tdb->hash_size; i++) {
 472
 473                 /* ignore hashchains[0], the freelist */
 474                 pthread_mutex_t *chain = &m->hashchains[i+1];
 475
 476                 ret = chain_mutex_lock(chain, true);
 477                 if (ret != 0) {
 478                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_lock"
 479                                  "(chainlock) failed: %s\n", strerror(ret)));
 480                         goto fail_unroll_allrecord_lock;
 481                 }
 482
 483                 ret = pthread_mutex_unlock(chain);
 484                 if (ret != 0) {
 485                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
 486                                  "(chainlock) failed: %s\n", strerror(ret)));
 487                         goto fail_unroll_allrecord_lock;
 488                 }
 489         }
 490
 491         return 0;
 492
 493 fail_unroll_allrecord_lock:
 494         m->allrecord_lock = F_RDLCK;
 495         tdb->ecode = TDB_ERR_LOCK;
 496         return -1;
 497 }
 498
 499 void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb)
 500 {
 501         struct tdb_mutexes *m = tdb->mutexes;
 502
 503         /*
 504          * Our only caller tdb_allrecord_upgrade() (in the error case)
 505          * garantees that we already own the allrecord lock.
 506          *
 507          * Which means m->allrecord_mutex is still locked by us.
 508          */
 509
 510         if (m->allrecord_lock != F_WRLCK) {
 511                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
 512                          (int)m->allrecord_lock));
 513                 return;
 514         }
 515
 516         m->allrecord_lock = F_RDLCK;
 517         return;
 518 }
 519
 520
 521 int tdb_mutex_allrecord_unlock(struct tdb_context *tdb)
 522 {
 523         struct tdb_mutexes *m = tdb->mutexes;
 524         short old;
 525         int ret;
 526
 527         if (tdb->flags & TDB_NOLOCK) {
 528                 return 0;
 529         }
 530
 531         /*
 532          * Our only callers tdb_allrecord_unlock() and
 533          * tdb_allrecord_lock() (in the error path)
 534          * garantee that we already own the allrecord lock.
 535          *
 536          * Which means m->allrecord_mutex is still locked by us.
 537          */
 538
 539         if ((m->allrecord_lock != F_RDLCK) && (m->allrecord_lock != F_WRLCK)) {
 540                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
 541                          (int)m->allrecord_lock));
 542                 return -1;
 543         }
 544
 545         old = m->allrecord_lock;
 546         m->allrecord_lock = F_UNLCK;
 547
 548         ret = pthread_mutex_unlock(&m->allrecord_mutex);
 549         if (ret != 0) {
 550                 m->allrecord_lock = old;
 551                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
 552                          "(allrecord_mutex) failed: %s\n", strerror(ret)));
 553                 return -1;
 554         }
 555         return 0;
 556 }
 557
 558 int tdb_mutex_init(struct tdb_context *tdb)
 559 {
 560         struct tdb_mutexes *m;
 561         pthread_mutexattr_t ma;
 562         int i, ret;
 563
 564         ret = tdb_mutex_mmap(tdb);
 565         if (ret == -1) {
 566                 return -1;
 567         }
 568         m = tdb->mutexes;
 569
 570         ret = pthread_mutexattr_init(&ma);
 571         if (ret != 0) {
 572                 goto fail_munmap;
 573         }
 574         ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
 575         if (ret != 0) {
 576                 goto fail;
 577         }
 578         ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
 579         if (ret != 0) {
 580                 goto fail;
 581         }
 582         ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
 583         if (ret != 0) {
 584                 goto fail;
 585         }
 586
 587         for (i=0; i<tdb->hash_size+1; i++) {
 588                 pthread_mutex_t *chain = &m->hashchains[i];
 589
 590                 ret = pthread_mutex_init(chain, &ma);
 591                 if (ret != 0) {
 592                         goto fail;
 593                 }
 594         }
 595
 596         m->allrecord_lock = F_UNLCK;
 597
 598         ret = pthread_mutex_init(&m->allrecord_mutex, &ma);
 599         if (ret != 0) {
 600                 goto fail;
 601         }
 602         ret = 0;
 603 fail:
 604         pthread_mutexattr_destroy(&ma);
 605 fail_munmap:
 606
 607         if (ret == 0) {
 608                 return 0;
 609         }
 610
 611         tdb_mutex_munmap(tdb);
 612
 613         errno = ret;
 614         return -1;
 615 }
 616
 617 int tdb_mutex_mmap(struct tdb_context *tdb)
 618 {
 619         size_t len;
 620         void *ptr;
 621
 622         len = tdb_mutex_size(tdb);
 623         if (len == 0) {
 624                 return 0;
 625         }
 626
 627         if (tdb->mutexes != NULL) {
 628                 return 0;
 629         }
 630
 631         ptr = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FILE,
 632                    tdb->fd, 0);
 633         if (ptr == MAP_FAILED) {
 634                 return -1;
 635         }
 636         tdb->mutexes = (struct tdb_mutexes *)ptr;
 637
 638         return 0;
 639 }
 640
 641 int tdb_mutex_munmap(struct tdb_context *tdb)
 642 {
 643         size_t len;
 644         int ret;
 645
 646         len = tdb_mutex_size(tdb);
 647         if (len == 0) {
 648                 return 0;
 649         }
 650
 651         ret = munmap(tdb->mutexes, len);
 652         if (ret == -1) {
 653                 return -1;
 654         }
 655         tdb->mutexes = NULL;
 656
 657         return 0;
 658 }
 659
 660 static bool tdb_mutex_locking_cached;
 661
 662 static bool tdb_mutex_locking_supported(void)
 663 {
 664         pthread_mutexattr_t ma;
 665         pthread_mutex_t m;
 666         int ret;
 667         static bool initialized;
 668
 669         if (initialized) {
 670                 return tdb_mutex_locking_cached;
 671         }
 672
 673         initialized = true;
 674
 675         ret = pthread_mutexattr_init(&ma);
 676         if (ret != 0) {
 677                 return false;
 678         }
 679         ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
 680         if (ret != 0) {
 681                 goto cleanup_ma;
 682         }
 683         ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
 684         if (ret != 0) {
 685                 goto cleanup_ma;
 686         }
 687         ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
 688         if (ret != 0) {
 689                 goto cleanup_ma;
 690         }
 691         ret = pthread_mutex_init(&m, &ma);
 692         if (ret != 0) {
 693                 goto cleanup_ma;
 694         }
 695         ret = pthread_mutex_lock(&m);
 696         if (ret != 0) {
 697                 goto cleanup_m;
 698         }
 699         /*
 700          * This makes sure we have real mutexes
 701          * from a threading library instead of just
 702          * stubs from libc.
 703          */
 704         ret = pthread_mutex_lock(&m);
 705         if (ret != EDEADLK) {
 706                 goto cleanup_lock;
 707         }
 708         ret = pthread_mutex_unlock(&m);
 709         if (ret != 0) {
 710                 goto cleanup_m;
 711         }
 712
 713         tdb_mutex_locking_cached = true;
 714         goto cleanup_m;
 715
 716 cleanup_lock:
 717         pthread_mutex_unlock(&m);
 718 cleanup_m:
 719         pthread_mutex_destroy(&m);
 720 cleanup_ma:
 721         pthread_mutexattr_destroy(&ma);
 722         return tdb_mutex_locking_cached;
 723 }
 724
 725 static void (*tdb_robust_mutext_old_handler)(int) = SIG_ERR;
 726 static pid_t tdb_robust_mutex_pid = -1;
 727
 728 static bool tdb_robust_mutex_setup_sigchild(void (*handler)(int),
 729                         void (**p_old_handler)(int))
 730 {
 731 #ifdef HAVE_SIGACTION
 732         struct sigaction act;
 733         struct sigaction oldact;
 734
 735         memset(&act, '\0', sizeof(act));
 736
 737         act.sa_handler = handler;
 738 #ifdef SA_RESTART
 739         act.sa_flags = SA_RESTART;
 740 #endif
 741         sigemptyset(&act.sa_mask);
 742         sigaddset(&act.sa_mask, SIGCHLD);
 743         sigaction(SIGCHLD, &act, &oldact);
 744         if (p_old_handler) {
 745                 *p_old_handler = oldact.sa_handler;
 746         }
 747         return true;
 748 #else /* !HAVE_SIGACTION */
 749         return false;
 750 #endif
 751 }
 752
 753 static void tdb_robust_mutex_handler(int sig)
 754 {
 755         pid_t child_pid = tdb_robust_mutex_pid;
 756
 757         if (child_pid != -1) {
 758                 pid_t pid;
 759
 760                 pid = waitpid(child_pid, NULL, WNOHANG);
 761                 if (pid == -1) {
 762                         switch (errno) {
 763                         case ECHILD:
 764                                 tdb_robust_mutex_pid = -1;
 765                                 return;
 766
 767                         default:
 768                                 return;
 769                         }
 770                 }
 771                 if (pid == child_pid) {
 772                         tdb_robust_mutex_pid = -1;
 773                         return;
 774                 }
 775         }
 776
 777         if (tdb_robust_mutext_old_handler == SIG_DFL) {
 778                 return;
 779         }
 780         if (tdb_robust_mutext_old_handler == SIG_IGN) {
 781                 return;
 782         }
 783         if (tdb_robust_mutext_old_handler == SIG_ERR) {
 784                 return;
 785         }
 786
 787         tdb_robust_mutext_old_handler(sig);
 788 }
 789
 790 static void tdb_robust_mutex_wait_for_child(pid_t *child_pid)
 791 {
 792         int options = WNOHANG;
 793
 794         if (*child_pid == -1) {
 795                 return;
 796         }
 797
 798         while (tdb_robust_mutex_pid > 0) {
 799                 pid_t pid;
 800
 801                 /*
 802                  * First we try with WNOHANG, as the process might not exist
 803                  * anymore. Once we've sent SIGKILL we block waiting for the
 804                  * exit.
 805                  */
 806                 pid = waitpid(*child_pid, NULL, options);
 807                 if (pid == -1) {
 808                         if (errno == EINTR) {
 809                                 continue;
 810                         } else if (errno == ECHILD) {
 811                                 break;
 812                         } else {
 813                                 abort();
 814                         }
 815                 }
 816                 if (pid == *child_pid) {
 817                         break;
 818                 }
 819
 820                 kill(*child_pid, SIGKILL);
 821                 options = 0;
 822         }
 823
 824         tdb_robust_mutex_pid = -1;
 825         *child_pid = -1;
 826 }
 827
 828 _PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void)
 829 {
 830         void *ptr = NULL;
 831         pthread_mutex_t *m = NULL;
 832         pthread_mutexattr_t ma;
 833         int ret = 1;
 834         int pipe_down[2] = { -1, -1 };
 835         int pipe_up[2] = { -1, -1 };
 836         ssize_t nread;
 837         char c = 0;
 838         bool ok;
 839         static bool initialized;
 840         pid_t saved_child_pid = -1;
 841         bool cleanup_ma = false;
 842
 843         if (initialized) {
 844                 return tdb_mutex_locking_cached;
 845         }
 846
 847         initialized = true;
 848
 849         ok = tdb_mutex_locking_supported();
 850         if (!ok) {
 851                 return false;
 852         }
 853
 854         tdb_mutex_locking_cached = false;
 855
 856         ptr = mmap(NULL, sizeof(pthread_mutex_t), PROT_READ|PROT_WRITE,
 857                    MAP_SHARED|MAP_ANON, -1 /* fd */, 0);
 858         if (ptr == MAP_FAILED) {
 859                 return false;
 860         }
 861
 862         ret = pipe(pipe_down);
 863         if (ret != 0) {
 864                 goto cleanup;
 865         }
 866         ret = pipe(pipe_up);
 867         if (ret != 0) {
 868                 goto cleanup;
 869         }
 870
 871         ret = pthread_mutexattr_init(&ma);
 872         if (ret != 0) {
 873                 goto cleanup;
 874         }
 875         cleanup_ma = true;
 876         ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
 877         if (ret != 0) {
 878                 goto cleanup;
 879         }
 880         ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
 881         if (ret != 0) {
 882                 goto cleanup;
 883         }
 884         ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
 885         if (ret != 0) {
 886                 goto cleanup;
 887         }
 888         ret = pthread_mutex_init(ptr, &ma);
 889         if (ret != 0) {
 890                 goto cleanup;
 891         }
 892         m = (pthread_mutex_t *)ptr;
 893
 894         if (tdb_robust_mutex_setup_sigchild(tdb_robust_mutex_handler,
 895                         &tdb_robust_mutext_old_handler) == false) {
 896                 goto cleanup;
 897         }
 898
 899         tdb_robust_mutex_pid = fork();
 900         saved_child_pid = tdb_robust_mutex_pid;
 901         if (tdb_robust_mutex_pid == 0) {
 902                 size_t nwritten;
 903                 close(pipe_down[1]);
 904                 close(pipe_up[0]);
 905                 ret = pthread_mutex_lock(m);
 906                 nwritten = write(pipe_up[1], &ret, sizeof(ret));
 907                 if (nwritten != sizeof(ret)) {
 908                         _exit(1);
 909                 }
 910                 if (ret != 0) {
 911                         _exit(1);
 912                 }
 913                 nread = read(pipe_down[0], &c, 1);
 914                 if (nread != 1) {
 915                         _exit(1);
 916                 }
 917                 /* leave locked */
 918                 _exit(0);
 919         }
 920         if (tdb_robust_mutex_pid == -1) {
 921                 goto cleanup;
 922         }
 923         close(pipe_down[0]);
 924         pipe_down[0] = -1;
 925         close(pipe_up[1]);
 926         pipe_up[1] = -1;
 927
 928         nread = read(pipe_up[0], &ret, sizeof(ret));
 929         if (nread != sizeof(ret)) {
 930                 goto cleanup;
 931         }
 932
 933         ret = pthread_mutex_trylock(m);
 934         if (ret != EBUSY) {
 935                 if (ret == 0) {
 936                         pthread_mutex_unlock(m);
 937                 }
 938                 goto cleanup;
 939         }
 940
 941         if (write(pipe_down[1], &c, 1) != 1) {
 942                 goto cleanup;
 943         }
 944
 945         nread = read(pipe_up[0], &c, 1);
 946         if (nread != 0) {
 947                 goto cleanup;
 948         }
 949
 950         tdb_robust_mutex_wait_for_child(&saved_child_pid);
 951
 952         ret = pthread_mutex_trylock(m);
 953         if (ret != EOWNERDEAD) {
 954                 if (ret == 0) {
 955                         pthread_mutex_unlock(m);
 956                 }
 957                 goto cleanup;
 958         }
 959
 960         ret = pthread_mutex_consistent(m);
 961         if (ret != 0) {
 962                 goto cleanup;
 963         }
 964
 965         ret = pthread_mutex_trylock(m);
 966         if (ret != EDEADLK && ret != EBUSY) {
 967                 pthread_mutex_unlock(m);
 968                 goto cleanup;
 969         }
 970
 971         ret = pthread_mutex_unlock(m);
 972         if (ret != 0) {
 973                 goto cleanup;
 974         }
 975
 976         tdb_mutex_locking_cached = true;
 977
 978 cleanup:
 979         /*
 980          * Note that we don't reset the signal handler; we just reset
 981          * tdb_robust_mutex_pid to -1. This is ok as this code path is only
 982          * called once per process.
 983          *
 984          * Leaving our signal handler avoids races with other threads potentially
 985          * setting up their SIGCHLD handlers.
 986          *
 987          * The worst thing that can happen is that the other newer signal
 988          * handler will get the SIGCHLD signal for our child and/or reap the
 989          * child with a wait() function. tdb_robust_mutex_wait_for_child()
 990          * handles the case where waitpid returns ECHILD.
 991          */
 992         tdb_robust_mutex_wait_for_child(&saved_child_pid);
 993
 994         if (m != NULL) {
 995                 pthread_mutex_destroy(m);
 996         }
 997         if (cleanup_ma) {
 998                 pthread_mutexattr_destroy(&ma);
 999         }
1000         if (pipe_down[0] != -1) {
1001                 close(pipe_down[0]);
1002         }
1003         if (pipe_down[1] != -1) {
1004                 close(pipe_down[1]);
1005         }
1006         if (pipe_up[0] != -1) {
1007                 close(pipe_up[0]);
1008         }
1009         if (pipe_up[1] != -1) {
1010                 close(pipe_up[1]);
1011         }
1012         if (ptr != NULL) {
1013                 munmap(ptr, sizeof(pthread_mutex_t));
1014         }
1015
1016         return tdb_mutex_locking_cached;
1017 }
1018
1019 #else
1020
/*
 * Fallback stub for builds without mutex locking support.
 *
 * The argument is ignored; the reported size of the mutex area is
 * always zero.
 */
size_t tdb_mutex_size(struct tdb_context *tdb)
{
        (void)tdb; /* not consulted by the stub */

        return (size_t)0;
}
1025
/*
 * Fallback stub for builds without mutex locking support.
 *
 * The argument is ignored; the answer is unconditionally "false".
 */
bool tdb_have_mutexes(struct tdb_context *tdb)
{
        (void)tdb; /* not consulted by the stub */

        return false;
}
1030
1031 int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
1032                              enum tdb_lock_flags flags)
1033 {
1034         tdb->ecode = TDB_ERR_LOCK;
1035         return -1;
1036 }
1037
/*
 * Fallback stub for builds without mutex locking support.
 *
 * Always returns -1. The tdb context is not touched, so tdb->ecode is
 * left as it was.
 */
int tdb_mutex_allrecord_unlock(struct tdb_context *tdb)
{
        (void)tdb; /* not consulted by the stub */

        return -1;
}
1042
1043 int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb)
1044 {
1045         tdb->ecode = TDB_ERR_LOCK;
1046         return -1;
1047 }
1048
/*
 * Fallback stub for builds without mutex locking support.
 *
 * Intentionally a no-op: the argument is ignored and nothing is modified.
 */
void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb)
{
        (void)tdb; /* nothing to downgrade */
}
1053
/*
 * Fallback stub for builds without mutex locking support.
 *
 * Always fails: sets errno to ENOSYS and returns -1. The argument is
 * ignored.
 */
int tdb_mutex_mmap(struct tdb_context *tdb)
{
        (void)tdb; /* not consulted by the stub */

        errno = ENOSYS;

        return -1;
}
1059
/*
 * Fallback stub for builds without mutex locking support.
 *
 * Always fails: sets errno to ENOSYS and returns -1. The argument is
 * ignored.
 */
int tdb_mutex_munmap(struct tdb_context *tdb)
{
        (void)tdb; /* not consulted by the stub */

        errno = ENOSYS;

        return -1;
}
1065
/*
 * Fallback stub for builds without mutex locking support.
 *
 * Always fails: sets errno to ENOSYS and returns -1. The argument is
 * ignored.
 */
int tdb_mutex_init(struct tdb_context *tdb)
{
        (void)tdb; /* not consulted by the stub */

        errno = ENOSYS;

        return -1;
}
1071
1072 _PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void)
1073 {
1074         return false;
1075 }
1076
1077 #endif