lib/tdb/common/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
  30 _PUBLIC_ TDB_DATA tdb_null;
  31
  32 /*
  33   non-blocking increment of the tdb sequence number if the tdb has been opened using
  34   the TDB_SEQNUM flag
  35 */
  36 _PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
  37 {
  38         tdb_off_t seqnum=0;
  39
  40         if (!(tdb->flags & TDB_SEQNUM)) {
  41                 return;
  42         }
  43
  44         /* we ignore errors from this, as we have no sane way of
  45            dealing with them.
  46         */
  47         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
  48         seqnum++;
  49         tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
  50 }
  51
  52 /*
  53   increment the tdb sequence number if the tdb has been opened using
  54   the TDB_SEQNUM flag
  55 */
  56 static void tdb_increment_seqnum(struct tdb_context *tdb)
  57 {
  58         if (!(tdb->flags & TDB_SEQNUM)) {
  59                 return;
  60         }
  61
  62         if (tdb_nest_lock(tdb, TDB_SEQNUM_OFS, F_WRLCK,
  63                           TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
  64                 return;
  65         }
  66
  67         tdb_increment_seqnum_nonblock(tdb);
  68
  69         tdb_nest_unlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, false);
  70 }
  71
  72 static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
  73 {
  74         return memcmp(data.dptr, key.dptr, data.dsize);
  75 }
  76
  77 /* Returns 0 on fail.  On success, return offset of record, and fills
  78    in rec */
  79 static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
  80                         struct tdb_record *r)
  81 {
  82         tdb_off_t rec_ptr;
  83
  84         /* read in the hash top */
  85         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
  86                 return 0;
  87
  88         /* keep looking until we find the right record */
  89         while (rec_ptr) {
  90                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
  91                         return 0;
  92
  93                 if (!TDB_DEAD(r) && hash==r->full_hash
  94                     && key.dsize==r->key_len
  95                     && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
  96                                       r->key_len, tdb_key_compare,
  97                                       NULL) == 0) {
  98                         return rec_ptr;
  99                 }
 100                 /* detect tight infinite loop */
 101                 if (rec_ptr == r->next) {
 102                         tdb->ecode = TDB_ERR_CORRUPT;
 103                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_find: loop detected.\n"));
 104                         return 0;
 105                 }
 106                 rec_ptr = r->next;
 107         }
 108         tdb->ecode = TDB_ERR_NOEXIST;
 109         return 0;
 110 }
 111
 112 /* As tdb_find, but if you succeed, keep the lock */
 113 tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
 114                            struct tdb_record *rec)
 115 {
 116         uint32_t rec_ptr;
 117
 118         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 119                 return 0;
 120         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 121                 tdb_unlock(tdb, BUCKET(hash), locktype);
 122         return rec_ptr;
 123 }
 124
 125 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
 126
 127 static int tdb_update_hash_cmp(TDB_DATA key, TDB_DATA data, void *private_data)
 128 {
 129         TDB_DATA *dbuf = (TDB_DATA *)private_data;
 130
 131         if (dbuf->dsize != data.dsize) {
 132                 return -1;
 133         }
 134         if (memcmp(dbuf->dptr, data.dptr, data.dsize) != 0) {
 135                 return -1;
 136         }
 137         return 0;
 138 }
 139
 140 /* update an entry in place - this only works if the new data size
 141    is <= the old data size and the key exists.
 142    on failure return -1.
 143 */
 144 static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
 145 {
 146         struct tdb_record rec;
 147         tdb_off_t rec_ptr;
 148
 149         /* find entry */
 150         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 151                 return -1;
 152
 153         /* it could be an exact duplicate of what is there - this is
 154          * surprisingly common (eg. with a ldb re-index). */
 155         if (rec.key_len == key.dsize &&
 156             rec.data_len == dbuf.dsize &&
 157             rec.full_hash == hash &&
 158             tdb_parse_record(tdb, key, tdb_update_hash_cmp, &dbuf) == 0) {
 159                 return 0;
 160         }
 161
 162         /* must be long enough key, data and tailer */
 163         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
 164                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
 165                 return -1;
 166         }
 167
 168         if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 169                       dbuf.dptr, dbuf.dsize) == -1)
 170                 return -1;
 171
 172         if (dbuf.dsize != rec.data_len) {
 173                 /* update size */
 174                 rec.data_len = dbuf.dsize;
 175                 return tdb_rec_write(tdb, rec_ptr, &rec);
 176         }
 177
 178         return 0;
 179 }
 180
 181 /* find an entry in the database given a key */
 182 /* If an entry doesn't exist tdb_err will be set to
 183  * TDB_ERR_NOEXIST. If a key has no data attached
 184  * then the TDB_DATA will have zero length but
 185  * a non-zero pointer
 186  */
 187 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 188 {
 189         tdb_off_t rec_ptr;
 190         struct tdb_record rec;
 191         TDB_DATA ret;
 192         uint32_t hash;
 193
 194         /* find which hash bucket it is in */
 195         hash = tdb->hash_fn(&key);
 196         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 197                 return tdb_null;
 198
 199         ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 200                                   rec.data_len);
 201         ret.dsize = rec.data_len;
 202         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 203         return ret;
 204 }
 205
 206 _PUBLIC_ TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 207 {
 208         TDB_DATA ret = _tdb_fetch(tdb, key);
 209
 210         tdb_trace_1rec_retrec(tdb, "tdb_fetch", key, ret);
 211         return ret;
 212 }
 213
 214 /*
 215  * Find an entry in the database and hand the record's data to a parsing
 216  * function. The parsing function is executed under the chain read lock, so it
 217  * should be fast and should not block on other syscalls.
 218  *
 219  * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
 220  *
 221  * For mmapped tdb's that do not have a transaction open it points the parsing
 222  * function directly at the mmap area, it avoids the malloc/memcpy in this
 223  * case. If a transaction is open or no mmap is available, it has to do
 224  * malloc/read/parse/free.
 225  *
 226  * This is interesting for all readers of potentially large data structures in
 227  * the tdb records, ldb indexes being one example.
 228  *
 229  * Return -1 if the record was not found.
 230  */
 231
 232 _PUBLIC_ int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
 233                      int (*parser)(TDB_DATA key, TDB_DATA data,
 234                                    void *private_data),
 235                      void *private_data)
 236 {
 237         tdb_off_t rec_ptr;
 238         struct tdb_record rec;
 239         int ret;
 240         uint32_t hash;
 241
 242         /* find which hash bucket it is in */
 243         hash = tdb->hash_fn(&key);
 244
 245         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
 246                 /* record not found */
 247                 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, -1);
 248                 tdb->ecode = TDB_ERR_NOEXIST;
 249                 return -1;
 250         }
 251         tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, 0);
 252
 253         ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
 254                              rec.data_len, parser, private_data);
 255
 256         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 257
 258         return ret;
 259 }
 260
 261 /* check if an entry in the database exists
 262
 263    note that 1 is returned if the key is found and 0 is returned if not found
 264    this doesn't match the conventions in the rest of this module, but is
 265    compatible with gdbm
 266 */
 267 static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 268 {
 269         struct tdb_record rec;
 270
 271         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 272                 return 0;
 273         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 274         return 1;
 275 }
 276
 277 _PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
 278 {
 279         uint32_t hash = tdb->hash_fn(&key);
 280         int ret;
 281
 282         ret = tdb_exists_hash(tdb, key, hash);
 283         tdb_trace_1rec_ret(tdb, "tdb_exists", key, ret);
 284         return ret;
 285 }
 286
 287 /* actually delete an entry in the database given the offset */
 288 int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec)
 289 {
 290         tdb_off_t last_ptr, i;
 291         struct tdb_record lastrec;
 292
 293         if (tdb->read_only || tdb->traverse_read) return -1;
 294
 295         if (((tdb->traverse_write != 0) && (!TDB_DEAD(rec))) ||
 296             tdb_write_lock_record(tdb, rec_ptr) == -1) {
 297                 /* Someone traversing here: mark it as dead */
 298                 rec->magic = TDB_DEAD_MAGIC;
 299                 return tdb_rec_write(tdb, rec_ptr, rec);
 300         }
 301         if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
 302                 return -1;
 303
 304         /* find previous record in hash chain */
 305         if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
 306                 return -1;
 307         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
 308                 if (tdb_rec_read(tdb, i, &lastrec) == -1)
 309                         return -1;
 310
 311         /* unlink it: next ptr is at start of record. */
 312         if (last_ptr == 0)
 313                 last_ptr = TDB_HASH_TOP(rec->full_hash);
 314         if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
 315                 return -1;
 316
 317         /* recover the space */
 318         if (tdb_free(tdb, rec_ptr, rec) == -1)
 319                 return -1;
 320         return 0;
 321 }
 322
 323 static int tdb_count_dead(struct tdb_context *tdb, uint32_t hash)
 324 {
 325         int res = 0;
 326         tdb_off_t rec_ptr;
 327         struct tdb_record rec;
 328
 329         /* read in the hash top */
 330         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 331                 return 0;
 332
 333         while (rec_ptr) {
 334                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1)
 335                         return 0;
 336
 337                 if (rec.magic == TDB_DEAD_MAGIC) {
 338                         res += 1;
 339                 }
 340                 rec_ptr = rec.next;
 341         }
 342         return res;
 343 }
 344
 345 /*
 346  * Purge all DEAD records from a hash chain
 347  */
 348 static int tdb_purge_dead(struct tdb_context *tdb, uint32_t hash)
 349 {
 350         int res = -1;
 351         struct tdb_record rec;
 352         tdb_off_t rec_ptr;
 353
 354         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 355                 return -1;
 356         }
 357
 358         /* read in the hash top */
 359         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 360                 goto fail;
 361
 362         while (rec_ptr) {
 363                 tdb_off_t next;
 364
 365                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
 366                         goto fail;
 367                 }
 368
 369                 next = rec.next;
 370
 371                 if (rec.magic == TDB_DEAD_MAGIC
 372                     && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
 373                         goto fail;
 374                 }
 375                 rec_ptr = next;
 376         }
 377         res = 0;
 378  fail:
 379         tdb_unlock(tdb, -1, F_WRLCK);
 380         return res;
 381 }
 382
 383 /* delete an entry in the database given a key */
 384 static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 385 {
 386         tdb_off_t rec_ptr;
 387         struct tdb_record rec;
 388         int ret;
 389
 390         if (tdb->max_dead_records != 0) {
 391
 392                 /*
 393                  * Allow for some dead records per hash chain, mainly for
 394                  * tdb's with a very high create/delete rate like locking.tdb.
 395                  */
 396
 397                 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 398                         return -1;
 399
 400                 if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
 401                         /*
 402                          * Don't let the per-chain freelist grow too large,
 403                          * delete all existing dead records
 404                          */
 405                         tdb_purge_dead(tdb, hash);
 406                 }
 407
 408                 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
 409                         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 410                         return -1;
 411                 }
 412
 413                 /*
 414                  * Just mark the record as dead.
 415                  */
 416                 rec.magic = TDB_DEAD_MAGIC;
 417                 ret = tdb_rec_write(tdb, rec_ptr, &rec);
 418         }
 419         else {
 420                 if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK,
 421                                                    &rec)))
 422                         return -1;
 423
 424                 ret = tdb_do_delete(tdb, rec_ptr, &rec);
 425         }
 426
 427         if (ret == 0) {
 428                 tdb_increment_seqnum(tdb);
 429         }
 430
 431         if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
 432                 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
 433         return ret;
 434 }
 435
 436 _PUBLIC_ int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
 437 {
 438         uint32_t hash = tdb->hash_fn(&key);
 439         int ret;
 440
 441         ret = tdb_delete_hash(tdb, key, hash);
 442         tdb_trace_1rec_ret(tdb, "tdb_delete", key, ret);
 443         return ret;
 444 }
 445
 446 /*
 447  * See if we have a dead record around with enough space
 448  */
 449 static tdb_off_t tdb_find_dead(struct tdb_context *tdb, uint32_t hash,
 450                                struct tdb_record *r, tdb_len_t length)
 451 {
 452         tdb_off_t rec_ptr;
 453
 454         /* read in the hash top */
 455         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 456                 return 0;
 457
 458         /* keep looking until we find the right record */
 459         while (rec_ptr) {
 460                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
 461                         return 0;
 462
 463                 if (TDB_DEAD(r) && r->rec_len >= length) {
 464                         /*
 465                          * First fit for simple coding, TODO: change to best
 466                          * fit
 467                          */
 468                         return rec_ptr;
 469                 }
 470                 rec_ptr = r->next;
 471         }
 472         return 0;
 473 }
 474
 475 static int _tdb_store(struct tdb_context *tdb, TDB_DATA key,
 476                        TDB_DATA dbuf, int flag, uint32_t hash)
 477 {
 478         struct tdb_record rec;
 479         tdb_off_t rec_ptr;
 480         int ret = -1;
 481
 482         /* check for it existing, on insert. */
 483         if (flag == TDB_INSERT) {
 484                 if (tdb_exists_hash(tdb, key, hash)) {
 485                         tdb->ecode = TDB_ERR_EXISTS;
 486                         goto fail;
 487                 }
 488         } else {
 489                 /* first try in-place update, on modify or replace. */
 490                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
 491                         goto done;
 492                 }
 493                 if (tdb->ecode == TDB_ERR_NOEXIST &&
 494                     flag == TDB_MODIFY) {
 495                         /* if the record doesn't exist and we are in TDB_MODIFY mode then
 496                          we should fail the store */
 497                         goto fail;
 498                 }
 499         }
 500         /* reset the error code potentially set by the tdb_update() */
 501         tdb->ecode = TDB_SUCCESS;
 502
 503         /* delete any existing record - if it doesn't exist we don't
 504            care.  Doing this first reduces fragmentation, and avoids
 505            coalescing with `allocated' block before it's updated. */
 506         if (flag != TDB_INSERT)
 507                 tdb_delete_hash(tdb, key, hash);
 508
 509         if (tdb->max_dead_records != 0) {
 510                 /*
 511                  * Allow for some dead records per hash chain, look if we can
 512                  * find one that can hold the new record. We need enough space
 513                  * for key, data and tailer. If we find one, we don't have to
 514                  * consult the central freelist.
 515                  */
 516                 rec_ptr = tdb_find_dead(
 517                         tdb, hash, &rec,
 518                         key.dsize + dbuf.dsize + sizeof(tdb_off_t));
 519
 520                 if (rec_ptr != 0) {
 521                         rec.key_len = key.dsize;
 522                         rec.data_len = dbuf.dsize;
 523                         rec.full_hash = hash;
 524                         rec.magic = TDB_MAGIC;
 525                         if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
 526                             || tdb->methods->tdb_write(
 527                                     tdb, rec_ptr + sizeof(rec),
 528                                     key.dptr, key.dsize) == -1
 529                             || tdb->methods->tdb_write(
 530                                     tdb, rec_ptr + sizeof(rec) + key.dsize,
 531                                     dbuf.dptr, dbuf.dsize) == -1) {
 532                                 goto fail;
 533                         }
 534                         goto done;
 535                 }
 536         }
 537
 538         /*
 539          * We have to allocate some space from the freelist, so this means we
 540          * have to lock it. Use the chance to purge all the DEAD records from
 541          * the hash chain under the freelist lock.
 542          */
 543
 544         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 545                 goto fail;
 546         }
 547
 548         if ((tdb->max_dead_records != 0)
 549             && (tdb_purge_dead(tdb, hash) == -1)) {
 550                 tdb_unlock(tdb, -1, F_WRLCK);
 551                 goto fail;
 552         }
 553
 554         /* we have to allocate some space */
 555         rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec);
 556
 557         tdb_unlock(tdb, -1, F_WRLCK);
 558
 559         if (rec_ptr == 0) {
 560                 goto fail;
 561         }
 562
 563         /* Read hash top into next ptr */
 564         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 565                 goto fail;
 566
 567         rec.key_len = key.dsize;
 568         rec.data_len = dbuf.dsize;
 569         rec.full_hash = hash;
 570         rec.magic = TDB_MAGIC;
 571
 572         /* write out and point the top of the hash chain at it */
 573         if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
 574             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec),
 575                                        key.dptr, key.dsize) == -1
 576             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec)+key.dsize,
 577                                        dbuf.dptr, dbuf.dsize) == -1
 578             || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 579                 /* Need to tdb_unallocate() here */
 580                 goto fail;
 581         }
 582
 583  done:
 584         ret = 0;
 585  fail:
 586         if (ret == 0) {
 587                 tdb_increment_seqnum(tdb);
 588         }
 589         return ret;
 590 }
 591
 592 /* store an element in the database, replacing any existing element
 593    with the same key
 594
 595    return 0 on success, -1 on failure
 596 */
 597 _PUBLIC_ int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 598 {
 599         uint32_t hash;
 600         int ret;
 601
 602         if (tdb->read_only || tdb->traverse_read) {
 603                 tdb->ecode = TDB_ERR_RDONLY;
 604                 tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, -1);
 605                 return -1;
 606         }
 607
 608         /* find which hash bucket it is in */
 609         hash = tdb->hash_fn(&key);
 610         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 611                 return -1;
 612
 613         ret = _tdb_store(tdb, key, dbuf, flag, hash);
 614         tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, ret);
 615         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 616         return ret;
 617 }
 618
 619 /* Append to an entry. Create if not exist. */
 620 _PUBLIC_ int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 621 {
 622         uint32_t hash;
 623         TDB_DATA dbuf;
 624         int ret = -1;
 625
 626         /* find which hash bucket it is in */
 627         hash = tdb->hash_fn(&key);
 628         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 629                 return -1;
 630
 631         dbuf = _tdb_fetch(tdb, key);
 632
 633         if (dbuf.dptr == NULL) {
 634                 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
 635         } else {
 636                 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
 637                 unsigned char *new_dptr;
 638
 639                 /* realloc '0' is special: don't do that. */
 640                 if (new_len == 0)
 641                         new_len = 1;
 642                 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
 643                 if (new_dptr == NULL) {
 644                         free(dbuf.dptr);
 645                 }
 646                 dbuf.dptr = new_dptr;
 647         }
 648
 649         if (dbuf.dptr == NULL) {
 650                 tdb->ecode = TDB_ERR_OOM;
 651                 goto failed;
 652         }
 653
 654         memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
 655         dbuf.dsize += new_dbuf.dsize;
 656
 657         ret = _tdb_store(tdb, key, dbuf, 0, hash);
 658         tdb_trace_2rec_retrec(tdb, "tdb_append", key, new_dbuf, dbuf);
 659
 660 failed:
 661         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 662         SAFE_FREE(dbuf.dptr);
 663         return ret;
 664 }
 665
 666
 667 /*
 668   return the name of the current tdb file
 669   useful for external logging functions
 670 */
 671 _PUBLIC_ const char *tdb_name(struct tdb_context *tdb)
 672 {
 673         return tdb->name;
 674 }
 675
 676 /*
 677   return the underlying file descriptor being used by tdb, or -1
 678   useful for external routines that want to check the device/inode
 679   of the fd
 680 */
 681 _PUBLIC_ int tdb_fd(struct tdb_context *tdb)
 682 {
 683         return tdb->fd;
 684 }
 685
 686 /*
 687   return the current logging function
 688   useful for external tdb routines that wish to log tdb errors
 689 */
 690 _PUBLIC_ tdb_log_func tdb_log_fn(struct tdb_context *tdb)
 691 {
 692         return tdb->log.log_fn;
 693 }
 694
 695
 696 /*
 697   get the tdb sequence number. Only makes sense if the writers opened
 698   with TDB_SEQNUM set. Note that this sequence number will wrap quite
 699   quickly, so it should only be used for a 'has something changed'
 700   test, not for code that relies on the count of the number of changes
 701   made. If you want a counter then use a tdb record.
 702
 703   The aim of this sequence number is to allow for a very lightweight
 704   test of a possible tdb change.
 705 */
 706 _PUBLIC_ int tdb_get_seqnum(struct tdb_context *tdb)
 707 {
 708         tdb_off_t seqnum=0;
 709
 710         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
 711         return seqnum;
 712 }
 713
 714 _PUBLIC_ int tdb_hash_size(struct tdb_context *tdb)
 715 {
 716         return tdb->hash_size;
 717 }
 718
 719 _PUBLIC_ size_t tdb_map_size(struct tdb_context *tdb)
 720 {
 721         return tdb->map_size;
 722 }
 723
 724 _PUBLIC_ int tdb_get_flags(struct tdb_context *tdb)
 725 {
 726         return tdb->flags;
 727 }
 728
 729 _PUBLIC_ void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
 730 {
 731         if ((flags & TDB_ALLOW_NESTING) &&
 732             (flags & TDB_DISALLOW_NESTING)) {
 733                 tdb->ecode = TDB_ERR_NESTING;
 734                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_add_flags: "
 735                         "allow_nesting and disallow_nesting are not allowed together!"));
 736                 return;
 737         }
 738
 739         if (flags & TDB_ALLOW_NESTING) {
 740                 tdb->flags &= ~TDB_DISALLOW_NESTING;
 741         }
 742         if (flags & TDB_DISALLOW_NESTING) {
 743                 tdb->flags &= ~TDB_ALLOW_NESTING;
 744         }
 745
 746         tdb->flags |= flags;
 747 }
 748
 749 _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
 750 {
 751         if ((flags & TDB_ALLOW_NESTING) &&
 752             (flags & TDB_DISALLOW_NESTING)) {
 753                 tdb->ecode = TDB_ERR_NESTING;
 754                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
 755                         "allow_nesting and disallow_nesting are not allowed together!"));
 756                 return;
 757         }
 758
 759         if (flags & TDB_ALLOW_NESTING) {
 760                 tdb->flags |= TDB_DISALLOW_NESTING;
 761         }
 762         if (flags & TDB_DISALLOW_NESTING) {
 763                 tdb->flags |= TDB_ALLOW_NESTING;
 764         }
 765
 766         tdb->flags &= ~flags;
 767 }
 768
 769
 770 /*
 771   enable sequence number handling on an open tdb
 772 */
 773 _PUBLIC_ void tdb_enable_seqnum(struct tdb_context *tdb)
 774 {
 775         tdb->flags |= TDB_SEQNUM;
 776 }
 777
 778
 779 /*
 780   add a region of the file to the freelist. Length is the size of the region in bytes,
 781   which includes the free list header that needs to be added
 782  */
 783 static int tdb_free_region(struct tdb_context *tdb, tdb_off_t offset, ssize_t length)
 784 {
 785         struct tdb_record rec;
 786         if (length <= sizeof(rec)) {
 787                 /* the region is not worth adding */
 788                 return 0;
 789         }
 790         if (length + offset > tdb->map_size) {
 791                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: adding region beyond end of file\n"));
 792                 return -1;
 793         }
 794         memset(&rec,'\0',sizeof(rec));
 795         rec.rec_len = length - sizeof(rec);
 796         if (tdb_free(tdb, offset, &rec) == -1) {
 797                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: failed to add free record\n"));
 798                 return -1;
 799         }
 800         return 0;
 801 }
 802
 803 /*
 804   wipe the entire database, deleting all records. This can be done
 805   very fast by using a allrecord lock. The entire data portion of the
 806   file becomes a single entry in the freelist.
 807
 808   This code carefully steps around the recovery area, leaving it alone
 809  */
 810 _PUBLIC_ int tdb_wipe_all(struct tdb_context *tdb)
 811 {
 812         int i;
 813         tdb_off_t offset = 0;
 814         ssize_t data_len;
 815         tdb_off_t recovery_head;
 816         tdb_len_t recovery_size = 0;
 817
 818         if (tdb_lockall(tdb) != 0) {
 819                 return -1;
 820         }
 821
 822         tdb_trace(tdb, "tdb_wipe_all");
 823
 824         /* see if the tdb has a recovery area, and remember its size
 825            if so. We don't want to lose this as otherwise each
 826            tdb_wipe_all() in a transaction will increase the size of
 827            the tdb by the size of the recovery area */
 828         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
 829                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery head\n"));
 830                 goto failed;
 831         }
 832
 833         if (recovery_head != 0) {
 834                 struct tdb_record rec;
 835                 if (tdb->methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
 836                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery record\n"));
 837                         return -1;
 838                 }
 839                 recovery_size = rec.rec_len + sizeof(rec);
 840         }
 841
 842         /* wipe the hashes */
 843         for (i=0;i<tdb->hash_size;i++) {
 844                 if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) {
 845                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i));
 846                         goto failed;
 847                 }
 848         }
 849
 850         /* wipe the freelist */
 851         if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 852                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n"));
 853                 goto failed;
 854         }
 855
 856         /* add all the rest of the file to the freelist, possibly leaving a gap
 857            for the recovery area */
 858         if (recovery_size == 0) {
 859                 /* the simple case - the whole file can be used as a freelist */
 860                 data_len = (tdb->map_size - TDB_DATA_START(tdb->hash_size));
 861                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
 862                         goto failed;
 863                 }
 864         } else {
 865                 /* we need to add two freelist entries - one on either
 866                    side of the recovery area
 867
 868                    Note that we cannot shift the recovery area during
 869                    this operation. Only the transaction.c code may
 870                    move the recovery area or we risk subtle data
 871                    corruption
 872                 */
 873                 data_len = (recovery_head - TDB_DATA_START(tdb->hash_size));
 874                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
 875                         goto failed;
 876                 }
 877                 /* and the 2nd free list entry after the recovery area - if any */
 878                 data_len = tdb->map_size - (recovery_head+recovery_size);
 879                 if (tdb_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
 880                         goto failed;
 881                 }
 882         }
 883
 884         tdb_increment_seqnum_nonblock(tdb);
 885
 886         if (tdb_unlockall(tdb) != 0) {
 887                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n"));
 888                 goto failed;
 889         }
 890
 891         return 0;
 892
 893 failed:
 894         tdb_unlockall(tdb);
 895         return -1;
 896 }
 897
/* State handed to repack_traverse() through its private_data pointer. */
struct traverse_state {
        /* set to true by repack_traverse() when a tdb_store() call fails */
        bool error;
        /* database that repack_traverse() stores each record into */
        struct tdb_context *dest_db;
};
 902
 903 /*
 904   traverse function for repacking
 905  */
 906 static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
 907 {
 908         struct traverse_state *state = (struct traverse_state *)private_data;
 909         if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) {
 910                 state->error = true;
 911                 return -1;
 912         }
 913         return 0;
 914 }
 915
 916 /*
 917   repack a tdb
 918  */
 919 _PUBLIC_ int tdb_repack(struct tdb_context *tdb)
 920 {
 921         struct tdb_context *tmp_db;
 922         struct traverse_state state;
 923
 924         tdb_trace(tdb, "tdb_repack");
 925
 926         if (tdb_transaction_start(tdb) != 0) {
 927                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to start transaction\n"));
 928                 return -1;
 929         }
 930
 931         tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0);
 932         if (tmp_db == NULL) {
 933                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to create tmp_db\n"));
 934                 tdb_transaction_cancel(tdb);
 935                 return -1;
 936         }
 937
 938         state.error = false;
 939         state.dest_db = tmp_db;
 940
 941         if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) {
 942                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying out\n"));
 943                 tdb_transaction_cancel(tdb);
 944                 tdb_close(tmp_db);
 945                 return -1;
 946         }
 947
 948         if (state.error) {
 949                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during traversal\n"));
 950                 tdb_transaction_cancel(tdb);
 951                 tdb_close(tmp_db);
 952                 return -1;
 953         }
 954
 955         if (tdb_wipe_all(tdb) != 0) {
 956                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to wipe database\n"));
 957                 tdb_transaction_cancel(tdb);
 958                 tdb_close(tmp_db);
 959                 return -1;
 960         }
 961
 962         state.error = false;
 963         state.dest_db = tdb;
 964
 965         if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) {
 966                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying back\n"));
 967                 tdb_transaction_cancel(tdb);
 968                 tdb_close(tmp_db);
 969                 return -1;
 970         }
 971
 972         if (state.error) {
 973                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during second traversal\n"));
 974                 tdb_transaction_cancel(tdb);
 975                 tdb_close(tmp_db);
 976                 return -1;
 977         }
 978
 979         tdb_close(tmp_db);
 980
 981         if (tdb_transaction_commit(tdb) != 0) {
 982                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to commit\n"));
 983                 return -1;
 984         }
 985
 986         return 0;
 987 }
 988
 989 /* Even on files, we can get partial writes due to signals. */
 990 bool tdb_write_all(int fd, const void *buf, size_t count)
 991 {
 992         while (count) {
 993                 ssize_t ret;
 994                 ret = write(fd, buf, count);
 995                 if (ret < 0)
 996                         return false;
 997                 buf = (const char *)buf + ret;
 998                 count -= ret;
 999         }
1000         return true;
1001 }
1002
1003 bool tdb_add_off_t(tdb_off_t a, tdb_off_t b, tdb_off_t *pret)
1004 {
1005         tdb_off_t ret = a + b;
1006
1007         if ((ret < a) || (ret < b)) {
1008                 return false;
1009         }
1010         *pret = ret;
1011         return true;
1012 }
1013
1014 #ifdef TDB_TRACE
1015 static void tdb_trace_write(struct tdb_context *tdb, const char *str)
1016 {
1017         if (!tdb_write_all(tdb->tracefd, str, strlen(str))) {
1018                 close(tdb->tracefd);
1019                 tdb->tracefd = -1;
1020         }
1021 }
1022
1023 static void tdb_trace_start(struct tdb_context *tdb)
1024 {
1025         tdb_off_t seqnum=0;
1026         char msg[sizeof(tdb_off_t) * 4 + 1];
1027
1028         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
1029         snprintf(msg, sizeof(msg), "%u ", seqnum);
1030         tdb_trace_write(tdb, msg);
1031 }
1032
/* Finish the current trace line by writing a newline. */
static void tdb_trace_end(struct tdb_context *tdb)
{
        tdb_trace_write(tdb, "\n");
}
1037
/*
  Finish the current trace line with the integer result of the traced
  operation, written as " = <ret>" followed by a newline.
 */
static void tdb_trace_end_ret(struct tdb_context *tdb, int ret)
{
        char suffix[sizeof(ret) * 4 + 4];

        snprintf(suffix, sizeof(suffix), " = %i\n", ret);

        tdb_trace_write(tdb, suffix);
}
1044
1045 static void tdb_trace_record(struct tdb_context *tdb, TDB_DATA rec)
1046 {
1047         char msg[20 + rec.dsize*2], *p;
1048         unsigned int i;
1049
1050         /* We differentiate zero-length records from non-existent ones. */
1051         if (rec.dptr == NULL) {
1052                 tdb_trace_write(tdb, " NULL");
1053                 return;
1054         }
1055
1056         /* snprintf here is purely cargo-cult programming. */
1057         p = msg;
1058         p += snprintf(p, sizeof(msg), " %zu:", rec.dsize);
1059         for (i = 0; i < rec.dsize; i++)
1060                 p += snprintf(p, 2, "%02x", rec.dptr[i]);
1061
1062         tdb_trace_write(tdb, msg);
1063 }
1064
/*
  Write one complete trace line for an operation: the prefix emitted by
  tdb_trace_start(), the text in op, then the line end.
 */
void tdb_trace(struct tdb_context *tdb, const char *op)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_end(tdb);
}
1071
1072 void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op)
1073 {
1074         char msg[sizeof(tdb_off_t) * 4 + 1];
1075
1076         snprintf(msg, sizeof(msg), "%u ", seqnum);
1077         tdb_trace_write(tdb, msg);
1078         tdb_trace_write(tdb, op);
1079         tdb_trace_end(tdb);
1080 }
1081
/*
  Write one complete trace line describing an open call:
  "<op> <hash_size> 0x<tdb_flags> 0x<open_flags>" after the usual
  line prefix.
 */
void tdb_trace_open(struct tdb_context *tdb, const char *op,
                    unsigned hash_size, unsigned tdb_flags, unsigned open_flags)
{
        char details[128];

        tdb_trace_start(tdb);

        snprintf(details, sizeof(details),
                 "%s %u 0x%x 0x%x", op, hash_size, tdb_flags, open_flags);
        tdb_trace_write(tdb, details);

        tdb_trace_end(tdb);
}
1093
/*
  Write one complete trace line for an operation together with its
  integer result: line prefix, op, then the " = <ret>" ending.
 */
void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_end_ret(tdb, ret);
}
1100
/*
  Write one complete trace line for an operation whose result is a
  record: line prefix, op, " =", the record as formatted by
  tdb_trace_record(), then the line end.
 */
void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_write(tdb, " =");
        tdb_trace_record(tdb, ret);
        tdb_trace_end(tdb);
}
1109
/*
  Write one complete trace line for an operation taking one record
  argument: line prefix, op, the record as formatted by
  tdb_trace_record(), then the line end.
 */
void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
                    TDB_DATA rec)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec);
        tdb_trace_end(tdb);
}
1118
/*
  Write one complete trace line for an operation taking one record
  argument and returning an integer: line prefix, op, the record, then
  the " = <ret>" ending.
 */
void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
                        TDB_DATA rec, int ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec);
        tdb_trace_end_ret(tdb, ret);
}
1127
/*
  Write one complete trace line for an operation taking one record
  argument and returning a record: line prefix, op, the argument record,
  " =", the result record, then the line end.
 */
void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
                           TDB_DATA rec, TDB_DATA ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec);
        tdb_trace_write(tdb, " =");
        tdb_trace_record(tdb, ret);
        tdb_trace_end(tdb);
}
1138
1139 void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
1140                              TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
1141                              int ret)
1142 {
1143         char msg[1 + sizeof(ret) * 4];
1144
1145         snprintf(msg, sizeof(msg), " %#x", flag);
1146         tdb_trace_start(tdb);
1147         tdb_trace_write(tdb, op);
1148         tdb_trace_record(tdb, rec1);
1149         tdb_trace_record(tdb, rec2);
1150         tdb_trace_write(tdb, msg);
1151         tdb_trace_end_ret(tdb, ret);
1152 }
1153
/*
  Write one complete trace line for an operation taking two record
  arguments and returning a record: line prefix, op, both argument
  records, " =", the result record, then the line end.
 */
void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
                           TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec1);
        tdb_trace_record(tdb, rec2);
        tdb_trace_write(tdb, " =");
        tdb_trace_record(tdb, ret);
        tdb_trace_end(tdb);
}
1165 #endif