lib/tdb/common/transaction.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              2005
   7
   8      ** NOTE! The following LGPL license applies to the tdb
   9      ** library. This does NOT imply that all of Samba is released
  10      ** under the LGPL
  11
  12    This library is free software; you can redistribute it and/or
  13    modify it under the terms of the GNU Lesser General Public
  14    License as published by the Free Software Foundation; either
  15    version 3 of the License, or (at your option) any later version.
  16
  17    This library is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20    Lesser General Public License for more details.
  21
  22    You should have received a copy of the GNU Lesser General Public
  23    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  24 */
  25
  26 #include "tdb_private.h"
  27
  28 /*
  29   transaction design:
  30
  31   - only allow a single transaction at a time per database. This makes
  32     using the transaction API simpler, as otherwise the caller would
  33     have to cope with temporary failures in transactions that conflict
  34     with other current transactions
  35
  36   - keep the transaction recovery information in the same file as the
  37     database, using a special 'transaction recovery' record pointed at
  38     by the header. This removes the need for extra journal files as
  39     used by some other databases
  40
   41   - dynamically allocate the transaction recovery record, re-using it
  42     for subsequent transactions. If a larger record is needed then
  43     tdb_free() the old record to place it on the normal tdb freelist
  44     before allocating the new record
  45
  46   - during transactions, keep a linked list of all writes that have
  47     been performed by intercepting all tdb_write() calls. The hooked
  48     transaction versions of tdb_read() and tdb_write() check this
  49     linked list and try to use the elements of the list in preference
  50     to the real database.
  51
  52   - don't allow any locks to be held when a transaction starts,
  53     otherwise we can end up with deadlock (plus lack of lock nesting
  54     in posix locks would mean the lock is lost)
  55
  56   - if the caller gains a lock during the transaction but doesn't
  57     release it then fail the commit
  58
  59   - allow for nested calls to tdb_transaction_start(), re-using the
  60     existing transaction record. If the inner transaction is cancelled
  61     then a subsequent commit will fail
  62
  63   - keep a mirrored copy of the tdb hash chain heads to allow for the
  64     fast hash heads scan on traverse, updating the mirrored copy in
  65     the transaction version of tdb_write
  66
  67   - allow callers to mix transaction and non-transaction use of tdb,
  68     although once a transaction is started then an exclusive lock is
  69     gained until the transaction is committed or cancelled
  70
   71   - the commit strategy involves first saving away all modified data
  72     into a linearised buffer in the transaction recovery area, then
  73     marking the transaction recovery area with a magic value to
  74     indicate a valid recovery record. In total 4 fsync/msync calls are
  75     needed per commit to prevent race conditions. It might be possible
  76     to reduce this to 3 or even 2 with some more work.
  77
  78   - check for a valid recovery record on open of the tdb, while the
  79     open lock is held. Automatically recover from the transaction
  80     recovery area if needed, then continue with the open as
  81     usual. This allows for smooth crash recovery with no administrator
  82     intervention.
  83
  84   - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
  85     still available, but no fsync/msync calls are made.  This means we
  86     are still proof against a process dying during transaction commit,
  87     but not against machine reboot.
  88
  89   - if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using
  90     tdb_add_flags() transaction nesting is enabled.
  91     It resets the TDB_DISALLOW_NESTING flag, as both cannot be used together.
  92     The default is that transaction nesting is allowed.
  93     Note: this default may change in future versions of tdb.
  94
   95     Beware: when transactions are nested a transaction successfully
  96     completed with tdb_transaction_commit() can be silently unrolled later.
  97
  98   - if TDB_DISALLOW_NESTING is passed to flags in tdb open, or added using
  99     tdb_add_flags() transaction nesting is disabled.
 100     It resets the TDB_ALLOW_NESTING flag, as both cannot be used together.
  101     An attempt to create a nested transaction will fail with TDB_ERR_NESTING.
 102     The default is that transaction nesting is allowed.
 103     Note: this default may change in future versions of tdb.
 104 */
 105
 106
 107 /*
 108   hold the context of any current transaction
 109 */
 110 struct tdb_transaction {
 111         /* we keep a mirrored copy of the tdb hash heads here so
 112            tdb_next_hash_chain() can operate efficiently */
 113         uint32_t *hash_heads;
 114
 115         /* the original io methods - used to do IOs to the real db */
 116         const struct tdb_methods *io_methods;
 117
 118         /* the list of transaction blocks. When a block is first
 119            written to, it gets created in this list */
 120         uint8_t **blocks;
 121         uint32_t num_blocks;
 122         uint32_t block_size;      /* bytes in each block */
 123         uint32_t last_block_size; /* number of valid bytes in the last block */
 124
 125         /* non-zero when an internal transaction error has
 126            occurred. All write operations will then fail until the
 127            transaction is ended */
 128         int transaction_error;
 129
 130         /* when inside a transaction we need to keep track of any
 131            nested tdb_transaction_start() calls, as these are allowed,
 132            but don't create a new transaction */
 133         int nesting;
 134
 135         /* set when a prepare has already occurred */
 136         bool prepared;
 137         tdb_off_t magic_offset;
 138
 139         /* old file size before transaction */
 140         tdb_len_t old_map_size;
 141
 142         /* did we expand in this transaction */
 143         bool expanded;
 144 };
 145
 146
 147 /*
 148   read while in a transaction. We need to check first if the data is in our list
 149   of transaction elements, then if not do a real read
 150 */
 151 static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
 152                             tdb_len_t len, int cv)
 153 {
 154         uint32_t blk;
 155
 156         /* break it down into block sized ops */
 157         while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
 158                 tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
 159                 if (transaction_read(tdb, off, buf, len2, cv) != 0) {
 160                         return -1;
 161                 }
 162                 len -= len2;
 163                 off += len2;
 164                 buf = (void *)(len2 + (char *)buf);
 165         }
 166
 167         if (len == 0) {
 168                 return 0;
 169         }
 170
 171         blk = off / tdb->transaction->block_size;
 172
 173         /* see if we have it in the block list */
 174         if (tdb->transaction->num_blocks <= blk ||
 175             tdb->transaction->blocks[blk] == NULL) {
 176                 /* nope, do a real read */
 177                 if (tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv) != 0) {
 178                         goto fail;
 179                 }
 180                 return 0;
 181         }
 182
 183         /* it is in the block list. Now check for the last block */
 184         if (blk == tdb->transaction->num_blocks-1) {
 185                 if (len > tdb->transaction->last_block_size) {
 186                         goto fail;
 187                 }
 188         }
 189
 190         /* now copy it out of this block */
 191         memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len);
 192         if (cv) {
 193                 tdb_convert(buf, len);
 194         }
 195         return 0;
 196
 197 fail:
 198         TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%u len=%u\n", off, len));
 199         tdb->ecode = TDB_ERR_IO;
 200         tdb->transaction->transaction_error = 1;
 201         return -1;
 202 }
 203
 204
 205 /*
 206   write while in a transaction
 207 */
 208 static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 209                              const void *buf, tdb_len_t len)
 210 {
 211         uint32_t blk;
 212
 213         if (buf == NULL) {
 214                 return -1;
 215         }
 216
 217         /* Only a commit is allowed on a prepared transaction */
 218         if (tdb->transaction->prepared) {
 219                 tdb->ecode = TDB_ERR_EINVAL;
 220                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: transaction already prepared, write not allowed\n"));
 221                 tdb->transaction->transaction_error = 1;
 222                 return -1;
 223         }
 224
 225         /* if the write is to a hash head, then update the transaction
 226            hash heads */
 227         if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
 228             off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
 229                 uint32_t chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
 230                 memcpy(&tdb->transaction->hash_heads[chain], buf, len);
 231         }
 232
 233         /* break it up into block sized chunks */
 234         while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
 235                 tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
 236                 if (transaction_write(tdb, off, buf, len2) != 0) {
 237                         return -1;
 238                 }
 239                 len -= len2;
 240                 off += len2;
 241                 buf = (const void *)(len2 + (const char *)buf);
 242         }
 243
 244         if (len == 0) {
 245                 return 0;
 246         }
 247
 248         blk = off / tdb->transaction->block_size;
 249         off = off % tdb->transaction->block_size;
 250
 251         if (tdb->transaction->num_blocks <= blk) {
 252                 uint8_t **new_blocks;
 253                 /* expand the blocks array */
 254                 new_blocks = (uint8_t **)realloc(tdb->transaction->blocks,
 255                                                  (blk+1)*sizeof(uint8_t *));
 256                 if (new_blocks == NULL) {
 257                         tdb->ecode = TDB_ERR_OOM;
 258                         goto fail;
 259                 }
 260                 memset(&new_blocks[tdb->transaction->num_blocks], 0,
 261                        (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
 262                 tdb->transaction->blocks = new_blocks;
 263                 tdb->transaction->num_blocks = blk+1;
 264                 tdb->transaction->last_block_size = 0;
 265         }
 266
 267         /* allocate and fill a block? */
 268         if (tdb->transaction->blocks[blk] == NULL) {
 269                 tdb->transaction->blocks[blk] = (uint8_t *)calloc(tdb->transaction->block_size, 1);
 270                 if (tdb->transaction->blocks[blk] == NULL) {
 271                         tdb->ecode = TDB_ERR_OOM;
 272                         tdb->transaction->transaction_error = 1;
 273                         return -1;
 274                 }
 275                 if (tdb->transaction->old_map_size > blk * tdb->transaction->block_size) {
 276                         tdb_len_t len2 = tdb->transaction->block_size;
 277                         if (len2 + (blk * tdb->transaction->block_size) > tdb->transaction->old_map_size) {
 278                                 len2 = tdb->transaction->old_map_size - (blk * tdb->transaction->block_size);
 279                         }
 280                         if (tdb->transaction->io_methods->tdb_read(tdb, blk * tdb->transaction->block_size,
 281                                                                    tdb->transaction->blocks[blk],
 282                                                                    len2, 0) != 0) {
 283                                 SAFE_FREE(tdb->transaction->blocks[blk]);
 284                                 tdb->ecode = TDB_ERR_IO;
 285                                 goto fail;
 286                         }
 287                         if (blk == tdb->transaction->num_blocks-1) {
 288                                 tdb->transaction->last_block_size = len2;
 289                         }
 290                 }
 291         }
 292
 293         /* overwrite part of an existing block */
 294         memcpy(tdb->transaction->blocks[blk] + off, buf, len);
 295         if (blk == tdb->transaction->num_blocks-1) {
 296                 if (len + off > tdb->transaction->last_block_size) {
 297                         tdb->transaction->last_block_size = len + off;
 298                 }
 299         }
 300
 301         return 0;
 302
 303 fail:
 304         TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%u len=%u\n",
 305                  (blk*tdb->transaction->block_size) + off, len));
 306         tdb->transaction->transaction_error = 1;
 307         return -1;
 308 }
 309
 310
 311 /*
 312   write while in a transaction - this variant never expands the transaction blocks, it only
 313   updates existing blocks. This means it cannot change the recovery size
 314 */
 315 static int transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
 316                                       const void *buf, tdb_len_t len)
 317 {
 318         uint32_t blk;
 319
 320         /* break it up into block sized chunks */
 321         while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
 322                 tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
 323                 if (transaction_write_existing(tdb, off, buf, len2) != 0) {
 324                         return -1;
 325                 }
 326                 len -= len2;
 327                 off += len2;
 328                 if (buf != NULL) {
 329                         buf = (const void *)(len2 + (const char *)buf);
 330                 }
 331         }
 332
 333         if (len == 0) {
 334                 return 0;
 335         }
 336
 337         blk = off / tdb->transaction->block_size;
 338         off = off % tdb->transaction->block_size;
 339
 340         if (tdb->transaction->num_blocks <= blk ||
 341             tdb->transaction->blocks[blk] == NULL) {
 342                 return 0;
 343         }
 344
 345         if (blk == tdb->transaction->num_blocks-1 &&
 346             off + len > tdb->transaction->last_block_size) {
 347                 if (off >= tdb->transaction->last_block_size) {
 348                         return 0;
 349                 }
 350                 len = tdb->transaction->last_block_size - off;
 351         }
 352
 353         /* overwrite part of an existing block */
 354         memcpy(tdb->transaction->blocks[blk] + off, buf, len);
 355
 356         return 0;
 357 }
 358
 359
 360 /*
 361   accelerated hash chain head search, using the cached hash heads
 362 */
 363 static void transaction_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
 364 {
 365         uint32_t h = *chain;
 366         for (;h < tdb->hash_size;h++) {
 367                 /* the +1 takes account of the freelist */
 368                 if (0 != tdb->transaction->hash_heads[h+1]) {
 369                         break;
 370                 }
 371         }
 372         (*chain) = h;
 373 }
 374
 375 /*
 376   out of bounds check during a transaction
 377 */
 378 static int transaction_oob(struct tdb_context *tdb, tdb_off_t off,
 379                            tdb_len_t len, int probe)
 380 {
 381         if (off + len >= off && off + len <= tdb->map_size) {
 382                 return 0;
 383         }
 384         tdb->ecode = TDB_ERR_IO;
 385         return -1;
 386 }
 387
 388 /*
 389   transaction version of tdb_expand().
 390 */
 391 static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
 392                                    tdb_off_t addition)
 393 {
 394         const char buf_zero[8192] = {0};
 395         size_t buf_len = sizeof(buf_zero);
 396
 397         while (addition > 0) {
 398                 size_t n = MIN(addition, buf_len);
 399                 int ret;
 400
 401                 ret = transaction_write(tdb, size, buf_zero, n);
 402                 if (ret != 0) {
 403                         return ret;
 404                 }
 405
 406                 addition -= n;
 407                 size += n;
 408         }
 409
 410         tdb->transaction->expanded = true;
 411
 412         return 0;
 413 }
 414
 415 static const struct tdb_methods transaction_methods = {
 416         transaction_read,
 417         transaction_write,
 418         transaction_next_hash_chain,
 419         transaction_oob,
 420         transaction_expand_file,
 421 };
 422
 423 /*
 424  * Is a transaction currently active on this context?
 425  *
 426  */
 427 _PUBLIC_ bool tdb_transaction_active(struct tdb_context *tdb)
 428 {
 429         return (tdb->transaction != NULL);
 430 }
 431
 432 /*
 433   start a tdb transaction. No token is returned, as only a single
 434   transaction is allowed to be pending per tdb_context
 435 */
 436 static int _tdb_transaction_start(struct tdb_context *tdb,
 437                                   enum tdb_lock_flags lockflags)
 438 {
 439         /* some sanity checks */
 440         if (tdb->read_only || (tdb->flags & TDB_INTERNAL)
 441             || tdb->traverse_read) {
 442                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
 443                 tdb->ecode = TDB_ERR_EINVAL;
 444                 return -1;
 445         }
 446
 447         /* cope with nested tdb_transaction_start() calls */
 448         if (tdb->transaction != NULL) {
 449                 if (!(tdb->flags & TDB_ALLOW_NESTING)) {
 450                         tdb->ecode = TDB_ERR_NESTING;
 451                         return -1;
 452                 }
 453                 tdb->transaction->nesting++;
 454                 TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n",
 455                          tdb->transaction->nesting));
 456                 return 0;
 457         }
 458
 459         if (tdb_have_extra_locks(tdb)) {
 460                 /* the caller must not have any locks when starting a
 461                    transaction as otherwise we'll be screwed by lack
 462                    of nested locks in posix */
 463                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
 464                 tdb->ecode = TDB_ERR_LOCK;
 465                 return -1;
 466         }
 467
 468         if (tdb->travlocks.next != NULL) {
 469                 /* you cannot use transactions inside a traverse (although you can use
 470                    traverse inside a transaction) as otherwise you can end up with
 471                    deadlock */
 472                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
 473                 tdb->ecode = TDB_ERR_LOCK;
 474                 return -1;
 475         }
 476
 477         tdb->transaction = (struct tdb_transaction *)
 478                 calloc(sizeof(struct tdb_transaction), 1);
 479         if (tdb->transaction == NULL) {
 480                 tdb->ecode = TDB_ERR_OOM;
 481                 return -1;
 482         }
 483
 484         /* a page at a time seems like a reasonable compromise between compactness and efficiency */
 485         tdb->transaction->block_size = tdb->page_size;
 486
 487         /* get the transaction write lock. This is a blocking lock. As
 488            discussed with Volker, there are a number of ways we could
 489            make this async, which we will probably do in the future */
 490         if (tdb_transaction_lock(tdb, F_WRLCK, lockflags) == -1) {
 491                 SAFE_FREE(tdb->transaction->blocks);
 492                 SAFE_FREE(tdb->transaction);
 493                 if ((lockflags & TDB_LOCK_WAIT) == 0) {
 494                         tdb->ecode = TDB_ERR_NOLOCK;
 495                 } else {
 496                         TDB_LOG((tdb, TDB_DEBUG_ERROR,
 497                                  "tdb_transaction_start: "
 498                                  "failed to get transaction lock\n"));
 499                 }
 500                 return -1;
 501         }
 502
 503         /* get a read lock from the freelist to the end of file. This
 504            is upgraded to a write lock during the commit */
 505         if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true) == -1) {
 506                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
 507                 goto fail_allrecord_lock;
 508         }
 509
 510         /* setup a copy of the hash table heads so the hash scan in
 511            traverse can be fast */
 512         tdb->transaction->hash_heads = (uint32_t *)
 513                 calloc(tdb->hash_size+1, sizeof(uint32_t));
 514         if (tdb->transaction->hash_heads == NULL) {
 515                 tdb->ecode = TDB_ERR_OOM;
 516                 goto fail;
 517         }
 518         if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
 519                                    TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
 520                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
 521                 tdb->ecode = TDB_ERR_IO;
 522                 goto fail;
 523         }
 524
 525         /* make sure we know about any file expansions already done by
 526            anyone else */
 527         tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
 528         tdb->transaction->old_map_size = tdb->map_size;
 529
 530         /* finally hook the io methods, replacing them with
 531            transaction specific methods */
 532         tdb->transaction->io_methods = tdb->methods;
 533         tdb->methods = &transaction_methods;
 534
 535         /* Trace at the end, so we get sequence number correct. */
 536         tdb_trace(tdb, "tdb_transaction_start");
 537         return 0;
 538
 539 fail:
 540         tdb_allrecord_unlock(tdb, F_RDLCK, false);
 541 fail_allrecord_lock:
 542         tdb_transaction_unlock(tdb, F_WRLCK);
 543         SAFE_FREE(tdb->transaction->blocks);
 544         SAFE_FREE(tdb->transaction->hash_heads);
 545         SAFE_FREE(tdb->transaction);
 546         return -1;
 547 }
 548
 549 _PUBLIC_ int tdb_transaction_start(struct tdb_context *tdb)
 550 {
 551         return _tdb_transaction_start(tdb, TDB_LOCK_WAIT);
 552 }
 553
 554 _PUBLIC_ int tdb_transaction_start_nonblock(struct tdb_context *tdb)
 555 {
 556         return _tdb_transaction_start(tdb, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
 557 }
 558
 559 /*
 560   sync to disk
 561 */
 562 static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
 563 {
 564         if (tdb->flags & TDB_NOSYNC) {
 565                 return 0;
 566         }
 567
 568 #ifdef HAVE_FDATASYNC
 569         if (fdatasync(tdb->fd) != 0) {
 570 #else
 571         if (fsync(tdb->fd) != 0) {
 572 #endif
 573                 tdb->ecode = TDB_ERR_IO;
 574                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
 575                 return -1;
 576         }
 577 #ifdef HAVE_MMAP
 578         if (tdb->map_ptr) {
 579                 tdb_off_t moffset = offset & ~(tdb->page_size-1);
 580                 if (msync(moffset + (char *)tdb->map_ptr,
 581                           length + (offset - moffset), MS_SYNC) != 0) {
 582                         tdb->ecode = TDB_ERR_IO;
 583                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
 584                                  strerror(errno)));
 585                         return -1;
 586                 }
 587         }
 588 #endif
 589         return 0;
 590 }
 591
 592
 593 static int _tdb_transaction_cancel(struct tdb_context *tdb)
 594 {
 595         uint32_t i;
 596         int ret = 0;
 597
 598         if (tdb->transaction == NULL) {
 599                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
 600                 return -1;
 601         }
 602
 603         if (tdb->transaction->nesting != 0) {
 604                 tdb->transaction->transaction_error = 1;
 605                 tdb->transaction->nesting--;
 606                 return 0;
 607         }
 608
 609         tdb->map_size = tdb->transaction->old_map_size;
 610
 611         /* free all the transaction blocks */
 612         for (i=0;i<tdb->transaction->num_blocks;i++) {
 613                 if (tdb->transaction->blocks[i] != NULL) {
 614                         free(tdb->transaction->blocks[i]);
 615                 }
 616         }
 617         SAFE_FREE(tdb->transaction->blocks);
 618
 619         if (tdb->transaction->magic_offset) {
 620                 const struct tdb_methods *methods = tdb->transaction->io_methods;
 621                 const uint32_t invalid = TDB_RECOVERY_INVALID_MAGIC;
 622
 623                 /* remove the recovery marker */
 624                 if (methods->tdb_write(tdb, tdb->transaction->magic_offset, &invalid, 4) == -1 ||
 625                 transaction_sync(tdb, tdb->transaction->magic_offset, 4) == -1) {
 626                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_cancel: failed to remove recovery magic\n"));
 627                         ret = -1;
 628                 }
 629         }
 630
 631         /* This also removes the OPEN_LOCK, if we have it. */
 632         tdb_release_transaction_locks(tdb);
 633
 634         /* restore the normal io methods */
 635         tdb->methods = tdb->transaction->io_methods;
 636
 637         SAFE_FREE(tdb->transaction->hash_heads);
 638         SAFE_FREE(tdb->transaction);
 639
 640         return ret;
 641 }
 642
 643 /*
 644   cancel the current transaction
 645 */
 646 _PUBLIC_ int tdb_transaction_cancel(struct tdb_context *tdb)
 647 {
 648         tdb_trace(tdb, "tdb_transaction_cancel");
 649         return _tdb_transaction_cancel(tdb);
 650 }
 651
 652 /*
 653   work out how much space the linearised recovery data will consume
 654 */
 655 static bool tdb_recovery_size(struct tdb_context *tdb, tdb_len_t *result)
 656 {
 657         tdb_len_t recovery_size = 0;
 658         uint32_t i;
 659
 660         recovery_size = sizeof(uint32_t);
 661         for (i=0;i<tdb->transaction->num_blocks;i++) {
 662                 tdb_len_t block_size;
 663                 if (i * tdb->transaction->block_size >= tdb->transaction->old_map_size) {
 664                         break;
 665                 }
 666                 if (tdb->transaction->blocks[i] == NULL) {
 667                         continue;
 668                 }
 669                 if (!tdb_add_len_t(recovery_size, 2*sizeof(tdb_off_t),
 670                                    &recovery_size)) {
 671                         return false;
 672                 }
 673                 if (i == tdb->transaction->num_blocks-1) {
 674                         block_size = tdb->transaction->last_block_size;
 675                 } else {
 676                         block_size =  tdb->transaction->block_size;
 677                 }
 678                 if (!tdb_add_len_t(recovery_size, block_size,
 679                                    &recovery_size)) {
 680                         return false;
 681                 }
 682         }
 683
 684         *result = recovery_size;
 685         return true;
 686 }
 687
 688 int tdb_recovery_area(struct tdb_context *tdb,
 689                       const struct tdb_methods *methods,
 690                       tdb_off_t *recovery_offset,
 691                       struct tdb_record *rec)
 692 {
 693         int ret;
 694
 695         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, recovery_offset) == -1) {
 696                 return -1;
 697         }
 698
 699         if (*recovery_offset == 0) {
 700                 rec->rec_len = 0;
 701                 return 0;
 702         }
 703
 704         if (methods->tdb_read(tdb, *recovery_offset, rec, sizeof(*rec),
 705                               DOCONV()) == -1) {
 706                 return -1;
 707         }
 708
 709         /* ignore invalid recovery regions: can happen in crash */
 710         if (rec->magic != TDB_RECOVERY_MAGIC &&
 711             rec->magic != TDB_RECOVERY_INVALID_MAGIC) {
 712                 *recovery_offset = 0;
 713                 rec->rec_len = 0;
 714         }
 715
 716         ret = methods->tdb_oob(tdb, *recovery_offset, rec->rec_len, 1);
 717         if (ret == -1) {
 718                 *recovery_offset = 0;
 719                 rec->rec_len = 0;
 720         }
 721
 722         return 0;
 723 }
 724
 725 /*
 726   allocate the recovery area, or use an existing recovery area if it is
 727   large enough
 728 */
 729 static int tdb_recovery_allocate(struct tdb_context *tdb,
 730                                  tdb_len_t *recovery_size,
 731                                  tdb_off_t *recovery_offset,
 732                                  tdb_len_t *recovery_max_size)
 733 {
 734         struct tdb_record rec;
 735         const struct tdb_methods *methods = tdb->transaction->io_methods;
 736         tdb_off_t recovery_head, new_end;
 737
 738         if (tdb_recovery_area(tdb, methods, &recovery_head, &rec) == -1) {
 739                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
 740                 return -1;
 741         }
 742
 743         if (!tdb_recovery_size(tdb, recovery_size)) {
 744                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: "
 745                          "overflow recovery size\n"));
 746                 return -1;
 747         }
 748
 749         /* Existing recovery area? */
 750         if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
 751                 /* it fits in the existing area */
 752                 *recovery_max_size = rec.rec_len;
 753                 *recovery_offset = recovery_head;
 754                 return 0;
 755         }
 756
 757         /* If recovery area in middle of file, we need a new one. */
 758         if (recovery_head == 0
 759             || recovery_head + sizeof(rec) + rec.rec_len != tdb->map_size) {
 760                 /* we need to free up the old recovery area, then allocate a
 761                    new one at the end of the file. Note that we cannot use
 762                    tdb_allocate() to allocate the new one as that might return
 763                    us an area that is being currently used (as of the start of
 764                    the transaction) */
 765                 if (recovery_head) {
 766                         if (tdb_free(tdb, recovery_head, &rec) == -1) {
 767                                 TDB_LOG((tdb, TDB_DEBUG_FATAL,
 768                                          "tdb_recovery_allocate: failed to"
 769                                          " free previous recovery area\n"));
 770                                 return -1;
 771                         }
 772
 773                         /* the tdb_free() call might have increased
 774                          * the recovery size */
 775                         if (!tdb_recovery_size(tdb, recovery_size)) {
 776                                 TDB_LOG((tdb, TDB_DEBUG_FATAL,
 777                                          "tdb_recovery_allocate: "
 778                                          "overflow recovery size\n"));
 779                                 return -1;
 780                         }
 781                 }
 782
 783                 /* New head will be at end of file. */
 784                 recovery_head = tdb->map_size;
 785         }
 786
 787         /* Now we know where it will be. */
 788         *recovery_offset = recovery_head;
 789
 790         /* Expand by more than we need, so we don't do it often. */
 791         *recovery_max_size = tdb_expand_adjust(tdb->map_size,
 792                                                *recovery_size,
 793                                                tdb->page_size)
 794                 - sizeof(rec);
 795
 796         if (!tdb_add_off_t(recovery_head, sizeof(rec), &new_end) ||
 797             !tdb_add_off_t(new_end, *recovery_max_size, &new_end)) {
 798                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: "
 799                          "overflow recovery area\n"));
 800                 return -1;
 801         }
 802
 803         if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
 804                                      new_end - tdb->transaction->old_map_size)
 805             == -1) {
 806                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
 807                 return -1;
 808         }
 809
 810         /* remap the file (if using mmap) */
 811         methods->tdb_oob(tdb, tdb->map_size, 1, 1);
 812
 813         /* we have to reset the old map size so that we don't try to expand the file
 814            again in the transaction commit, which would destroy the recovery area */
 815         tdb->transaction->old_map_size = tdb->map_size;
 816
 817         /* write the recovery header offset and sync - we can sync without a race here
 818            as the magic ptr in the recovery record has not been set */
 819         CONVERT(recovery_head);
 820         if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
 821                                &recovery_head, sizeof(tdb_off_t)) == -1) {
 822                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
 823                 return -1;
 824         }
 825         if (transaction_write_existing(tdb, TDB_RECOVERY_HEAD, &recovery_head, sizeof(tdb_off_t)) == -1) {
 826                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
 827                 return -1;
 828         }
 829
 830         return 0;
 831 }
 832
 833
 834 /*
 835   setup the recovery data that will be used on a crash during commit
 836 */
 837 static int transaction_setup_recovery(struct tdb_context *tdb,
 838                                       tdb_off_t *magic_offset)
 839 {
 840         tdb_len_t recovery_size;
 841         unsigned char *data, *p;
 842         const struct tdb_methods *methods = tdb->transaction->io_methods;
 843         struct tdb_record *rec;
 844         tdb_off_t recovery_offset, recovery_max_size;
 845         tdb_off_t old_map_size = tdb->transaction->old_map_size;
 846         uint32_t magic, tailer;
 847         uint32_t i;
 848
 849         /*
 850           check that the recovery area has enough space
 851         */
 852         if (tdb_recovery_allocate(tdb, &recovery_size,
 853                                   &recovery_offset, &recovery_max_size) == -1) {
 854                 return -1;
 855         }
 856
 857         data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
 858         if (data == NULL) {
 859                 tdb->ecode = TDB_ERR_OOM;
 860                 return -1;
 861         }
 862
 863         rec = (struct tdb_record *)data;
 864         memset(rec, 0, sizeof(*rec));
 865
 866         rec->magic    = TDB_RECOVERY_INVALID_MAGIC;
 867         rec->data_len = recovery_size;
 868         rec->rec_len  = recovery_max_size;
 869         rec->key_len  = old_map_size;
 870         CONVERT(*rec);
 871
 872         /* build the recovery data into a single blob to allow us to do a single
 873            large write, which should be more efficient */
 874         p = data + sizeof(*rec);
 875         for (i=0;i<tdb->transaction->num_blocks;i++) {
 876                 tdb_off_t offset;
 877                 tdb_len_t length;
 878
 879                 if (tdb->transaction->blocks[i] == NULL) {
 880                         continue;
 881                 }
 882
 883                 offset = i * tdb->transaction->block_size;
 884                 length = tdb->transaction->block_size;
 885                 if (i == tdb->transaction->num_blocks-1) {
 886                         length = tdb->transaction->last_block_size;
 887                 }
 888
 889                 if (offset >= old_map_size) {
 890                         continue;
 891                 }
 892                 if (offset + length > tdb->transaction->old_map_size) {
 893                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
 894                         free(data);
 895                         tdb->ecode = TDB_ERR_CORRUPT;
 896                         return -1;
 897                 }
 898                 memcpy(p, &offset, 4);
 899                 memcpy(p+4, &length, 4);
 900                 if (DOCONV()) {
 901                         tdb_convert(p, 8);
 902                 }
 903                 /* the recovery area contains the old data, not the
 904                    new data, so we have to call the original tdb_read
 905                    method to get it */
 906                 if (methods->tdb_read(tdb, offset, p + 8, length, 0) != 0) {
 907                         free(data);
 908                         tdb->ecode = TDB_ERR_IO;
 909                         return -1;
 910                 }
 911                 p += 8 + length;
 912         }
 913
 914         /* and the tailer */
 915         tailer = sizeof(*rec) + recovery_max_size;
 916         memcpy(p, &tailer, 4);
 917         if (DOCONV()) {
 918                 tdb_convert(p, 4);
 919         }
 920
 921         /* write the recovery data to the recovery area */
 922         if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
 923                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
 924                 free(data);
 925                 tdb->ecode = TDB_ERR_IO;
 926                 return -1;
 927         }
 928         if (transaction_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
 929                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery data\n"));
 930                 free(data);
 931                 tdb->ecode = TDB_ERR_IO;
 932                 return -1;
 933         }
 934
 935         /* as we don't have ordered writes, we have to sync the recovery
 936            data before we update the magic to indicate that the recovery
 937            data is present */
 938         if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
 939                 free(data);
 940                 return -1;
 941         }
 942
 943         free(data);
 944
 945         magic = TDB_RECOVERY_MAGIC;
 946         CONVERT(magic);
 947
 948         *magic_offset = recovery_offset + offsetof(struct tdb_record, magic);
 949
 950         if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
 951                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
 952                 tdb->ecode = TDB_ERR_IO;
 953                 return -1;
 954         }
 955         if (transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
 956                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery magic\n"));
 957                 tdb->ecode = TDB_ERR_IO;
 958                 return -1;
 959         }
 960
 961         /* ensure the recovery magic marker is on disk */
 962         if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
 963                 return -1;
 964         }
 965
 966         return 0;
 967 }
 968
 969 static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 970 {
 971         const struct tdb_methods *methods;
 972
 973         if (tdb->transaction == NULL) {
 974                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: no transaction\n"));
 975                 return -1;
 976         }
 977
 978         if (tdb->transaction->prepared) {
 979                 tdb->ecode = TDB_ERR_EINVAL;
 980                 _tdb_transaction_cancel(tdb);
 981                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction already prepared\n"));
 982                 return -1;
 983         }
 984
 985         if (tdb->transaction->transaction_error) {
 986                 tdb->ecode = TDB_ERR_IO;
 987                 _tdb_transaction_cancel(tdb);
 988                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction error pending\n"));
 989                 return -1;
 990         }
 991
 992
 993         if (tdb->transaction->nesting != 0) {
 994                 return 0;
 995         }
 996
 997         /* check for a null transaction */
 998         if (tdb->transaction->blocks == NULL) {
 999                 return 0;
1000         }
1001
1002         methods = tdb->transaction->io_methods;
1003
1004         /* if there are any locks pending then the caller has not
1005            nested their locks properly, so fail the transaction */
1006         if (tdb_have_extra_locks(tdb)) {
1007                 tdb->ecode = TDB_ERR_LOCK;
1008                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: locks pending on commit\n"));
1009                 _tdb_transaction_cancel(tdb);
1010                 return -1;
1011         }
1012
1013         /* upgrade the main transaction lock region to a write lock */
1014         if (tdb_allrecord_upgrade(tdb) == -1) {
1015                 if (tdb->ecode == TDB_ERR_RDONLY && tdb->read_only) {
1016                         TDB_LOG((tdb, TDB_DEBUG_ERROR,
1017                                  "tdb_transaction_prepare_commit: "
1018                                  "failed to upgrade hash locks: "
1019                                  "database is read only\n"));
1020                 } else if (tdb->ecode == TDB_ERR_RDONLY
1021                            && tdb->traverse_read) {
1022                         TDB_LOG((tdb, TDB_DEBUG_ERROR,
1023                                  "tdb_transaction_prepare_commit: "
1024                                  "failed to upgrade hash locks: "
1025                                  "a database traverse is in progress\n"));
1026                 } else {
1027                         TDB_LOG((tdb, TDB_DEBUG_ERROR,
1028                                  "tdb_transaction_prepare_commit: "
1029                                  "failed to upgrade hash locks: %s\n",
1030                                  tdb_errorstr(tdb)));
1031                 }
1032                 _tdb_transaction_cancel(tdb);
1033                 return -1;
1034         }
1035
1036         /* get the open lock - this prevents new users attaching to the database
1037            during the commit */
1038         if (tdb_nest_lock(tdb, OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) {
1039                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get open lock\n"));
1040                 _tdb_transaction_cancel(tdb);
1041                 return -1;
1042         }
1043
1044         /* write the recovery data to the end of the file */
1045         if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
1046                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: failed to setup recovery data\n"));
1047                 _tdb_transaction_cancel(tdb);
1048                 return -1;
1049         }
1050
1051         tdb->transaction->prepared = true;
1052
1053         /* expand the file to the new size if needed */
1054         if (tdb->map_size != tdb->transaction->old_map_size) {
1055                 if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
1056                                              tdb->map_size -
1057                                              tdb->transaction->old_map_size) == -1) {
1058                         tdb->ecode = TDB_ERR_IO;
1059                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: expansion failed\n"));
1060                         _tdb_transaction_cancel(tdb);
1061                         return -1;
1062                 }
1063                 tdb->map_size = tdb->transaction->old_map_size;
1064                 methods->tdb_oob(tdb, tdb->map_size, 1, 1);
1065         }
1066
1067         /* Keep the open lock until the actual commit */
1068
1069         return 0;
1070 }
1071
1072 /*
1073    prepare to commit the current transaction
1074 */
1075 _PUBLIC_ int tdb_transaction_prepare_commit(struct tdb_context *tdb)
1076 {
1077         tdb_trace(tdb, "tdb_transaction_prepare_commit");
1078         return _tdb_transaction_prepare_commit(tdb);
1079 }
1080
1081 /* A repack is worthwhile if the largest is less than half total free. */
1082 static bool repack_worthwhile(struct tdb_context *tdb)
1083 {
1084         tdb_off_t ptr;
1085         struct tdb_record rec;
1086         tdb_len_t total = 0, largest = 0;
1087
1088         if (tdb_ofs_read(tdb, FREELIST_TOP, &ptr) == -1) {
1089                 return false;
1090         }
1091
1092         while (ptr != 0 && tdb_rec_free_read(tdb, ptr, &rec) == 0) {
1093                 total += rec.rec_len;
1094                 if (rec.rec_len > largest) {
1095                         largest = rec.rec_len;
1096                 }
1097                 ptr = rec.next;
1098         }
1099
1100         return total > largest * 2;
1101 }
1102
1103 /*
1104   commit the current transaction
1105 */
1106 _PUBLIC_ int tdb_transaction_commit(struct tdb_context *tdb)
1107 {
1108         const struct tdb_methods *methods;
1109         uint32_t i;
1110         bool need_repack = false;
1111
1112         if (tdb->transaction == NULL) {
1113                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
1114                 return -1;
1115         }
1116
1117         tdb_trace(tdb, "tdb_transaction_commit");
1118
1119         if (tdb->transaction->transaction_error) {
1120                 tdb->ecode = TDB_ERR_IO;
1121                 _tdb_transaction_cancel(tdb);
1122                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
1123                 return -1;
1124         }
1125
1126
1127         if (tdb->transaction->nesting != 0) {
1128                 tdb->transaction->nesting--;
1129                 return 0;
1130         }
1131
1132         /* check for a null transaction */
1133         if (tdb->transaction->blocks == NULL) {
1134                 _tdb_transaction_cancel(tdb);
1135                 return 0;
1136         }
1137
1138         if (!tdb->transaction->prepared) {
1139                 int ret = _tdb_transaction_prepare_commit(tdb);
1140                 if (ret)
1141                         return ret;
1142         }
1143
1144         methods = tdb->transaction->io_methods;
1145
1146         /* perform all the writes */
1147         for (i=0;i<tdb->transaction->num_blocks;i++) {
1148                 tdb_off_t offset;
1149                 tdb_len_t length;
1150
1151                 if (tdb->transaction->blocks[i] == NULL) {
1152                         continue;
1153                 }
1154
1155                 offset = i * tdb->transaction->block_size;
1156                 length = tdb->transaction->block_size;
1157                 if (i == tdb->transaction->num_blocks-1) {
1158                         length = tdb->transaction->last_block_size;
1159                 }
1160
1161                 if (methods->tdb_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) {
1162                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
1163
1164                         /* we've overwritten part of the data and
1165                            possibly expanded the file, so we need to
1166                            run the crash recovery code */
1167                         tdb->methods = methods;
1168                         tdb_transaction_recover(tdb);
1169
1170                         _tdb_transaction_cancel(tdb);
1171
1172                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
1173                         return -1;
1174                 }
1175                 SAFE_FREE(tdb->transaction->blocks[i]);
1176         }
1177
1178         /* Do this before we drop lock or blocks. */
1179         if (tdb->transaction->expanded) {
1180                 need_repack = repack_worthwhile(tdb);
1181         }
1182
1183         SAFE_FREE(tdb->transaction->blocks);
1184         tdb->transaction->num_blocks = 0;
1185
1186         /* ensure the new data is on disk */
1187         if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
1188                 return -1;
1189         }
1190
1191         /*
1192           TODO: maybe write to some dummy hdr field, or write to magic
1193           offset without mmap, before the last sync, instead of the
1194           utime() call
1195         */
1196
1197         /* on some systems (like Linux 2.6.x) changes via mmap/msync
1198            don't change the mtime of the file, this means the file may
1199            not be backed up (as tdb rounding to block sizes means that
1200            file size changes are quite rare too). The following forces
1201            mtime changes when a transaction completes */
1202 #ifdef HAVE_UTIME
1203         utime(tdb->name, NULL);
1204 #endif
1205
1206         /* use a transaction cancel to free memory and remove the
1207            transaction locks */
1208         _tdb_transaction_cancel(tdb);
1209
1210         if (need_repack) {
1211                 return tdb_repack(tdb);
1212         }
1213
1214         return 0;
1215 }
1216
1217
1218 /*
1219   recover from an aborted transaction. Must be called with exclusive
1220   database write access already established (including the open
1221   lock to prevent new processes attaching)
1222 */
1223 int tdb_transaction_recover(struct tdb_context *tdb)
1224 {
1225         tdb_off_t recovery_head, recovery_eof;
1226         unsigned char *data, *p;
1227         uint32_t zero = 0;
1228         struct tdb_record rec;
1229
1230         /* find the recovery area */
1231         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
1232                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
1233                 tdb->ecode = TDB_ERR_IO;
1234                 return -1;
1235         }
1236
1237         if (recovery_head == 0) {
1238                 /* we have never allocated a recovery record */
1239                 return 0;
1240         }
1241
1242         /* read the recovery record */
1243         if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
1244                                    sizeof(rec), DOCONV()) == -1) {
1245                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));
1246                 tdb->ecode = TDB_ERR_IO;
1247                 return -1;
1248         }
1249
1250         if (rec.magic != TDB_RECOVERY_MAGIC) {
1251                 /* there is no valid recovery data */
1252                 return 0;
1253         }
1254
1255         if (tdb->read_only) {
1256                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
1257                 tdb->ecode = TDB_ERR_CORRUPT;
1258                 return -1;
1259         }
1260
1261         recovery_eof = rec.key_len;
1262
1263         data = (unsigned char *)malloc(rec.data_len);
1264         if (data == NULL) {
1265                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));
1266                 tdb->ecode = TDB_ERR_OOM;
1267                 return -1;
1268         }
1269
1270         /* read the full recovery data */
1271         if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
1272                                    rec.data_len, 0) == -1) {
1273                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));
1274                 tdb->ecode = TDB_ERR_IO;
1275                 return -1;
1276         }
1277
1278         /* recover the file data */
1279         p = data;
1280         while (p+8 < data + rec.data_len) {
1281                 uint32_t ofs, len;
1282                 if (DOCONV()) {
1283                         tdb_convert(p, 8);
1284                 }
1285                 memcpy(&ofs, p, 4);
1286                 memcpy(&len, p+4, 4);
1287
1288                 if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
1289                         free(data);
1290                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %u bytes at offset %u\n", len, ofs));
1291                         tdb->ecode = TDB_ERR_IO;
1292                         return -1;
1293                 }
1294                 p += 8 + len;
1295         }
1296
1297         free(data);
1298
1299         if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
1300                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
1301                 tdb->ecode = TDB_ERR_IO;
1302                 return -1;
1303         }
1304
1305         /* if the recovery area is after the recovered eof then remove it */
1306         if (recovery_eof <= recovery_head) {
1307                 if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
1308                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
1309                         tdb->ecode = TDB_ERR_IO;
1310                         return -1;
1311                 }
1312         }
1313
1314         /* remove the recovery magic */
1315         if (tdb_ofs_write(tdb, recovery_head + offsetof(struct tdb_record, magic),
1316                           &zero) == -1) {
1317                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
1318                 tdb->ecode = TDB_ERR_IO;
1319                 return -1;
1320         }
1321
1322         if (transaction_sync(tdb, 0, recovery_eof) == -1) {
1323                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
1324                 tdb->ecode = TDB_ERR_IO;
1325                 return -1;
1326         }
1327
1328         TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %u byte database\n",
1329                  recovery_eof));
1330
1331         /* all done */
1332         return 0;
1333 }
1334
1335 /* Any I/O failures we say "needs recovery". */
1336 bool tdb_needs_recovery(struct tdb_context *tdb)
1337 {
1338         tdb_off_t recovery_head;
1339         struct tdb_record rec;
1340
1341         /* find the recovery area */
1342         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
1343                 return true;
1344         }
1345
1346         if (recovery_head == 0) {
1347                 /* we have never allocated a recovery record */
1348                 return false;
1349         }
1350
1351         /* read the recovery record */
1352         if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
1353                                    sizeof(rec), DOCONV()) == -1) {
1354                 return true;
1355         }
1356
1357         return (rec.magic == TDB_RECOVERY_MAGIC);
1358 }