lib/tdb/common/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
/* File-scope TDB_DATA with no initializer (so zero-initialized); it is what
   _tdb_fetch() returns when the lookup fails. */
_PUBLIC_ TDB_DATA tdb_null;
  31
  32 /*
  33   non-blocking increment of the tdb sequence number if the tdb has been opened using
  34   the TDB_SEQNUM flag
  35 */
  36 _PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
  37 {
  38         tdb_off_t seqnum=0;
  39
  40         if (!(tdb->flags & TDB_SEQNUM)) {
  41                 return;
  42         }
  43
  44         /* we ignore errors from this, as we have no sane way of
  45            dealing with them.
  46         */
  47         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
  48         seqnum++;
  49         tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
  50 }
  51
  52 /*
  53   increment the tdb sequence number if the tdb has been opened using
  54   the TDB_SEQNUM flag
  55 */
  56 static void tdb_increment_seqnum(struct tdb_context *tdb)
  57 {
  58         if (!(tdb->flags & TDB_SEQNUM)) {
  59                 return;
  60         }
  61
  62         if (tdb->transaction != NULL) {
  63                 tdb_increment_seqnum_nonblock(tdb);
  64                 return;
  65         }
  66
  67         if (tdb_nest_lock(tdb, TDB_SEQNUM_OFS, F_WRLCK,
  68                           TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
  69                 return;
  70         }
  71
  72         tdb_increment_seqnum_nonblock(tdb);
  73
  74         tdb_nest_unlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, false);
  75 }
  76
  77 static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
  78 {
  79         return memcmp(data.dptr, key.dptr, data.dsize);
  80 }
  81
  82 /* Returns 0 on fail.  On success, return offset of record, and fills
  83    in rec */
  84 static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
  85                         struct tdb_record *r)
  86 {
  87         tdb_off_t rec_ptr;
  88
  89         /* read in the hash top */
  90         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
  91                 return 0;
  92
  93         /* keep looking until we find the right record */
  94         while (rec_ptr) {
  95                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
  96                         return 0;
  97
  98                 if (!TDB_DEAD(r) && hash==r->full_hash
  99                     && key.dsize==r->key_len
 100                     && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
 101                                       r->key_len, tdb_key_compare,
 102                                       NULL) == 0) {
 103                         return rec_ptr;
 104                 }
 105                 /* detect tight infinite loop */
 106                 if (rec_ptr == r->next) {
 107                         tdb->ecode = TDB_ERR_CORRUPT;
 108                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_find: loop detected.\n"));
 109                         return 0;
 110                 }
 111                 rec_ptr = r->next;
 112         }
 113         tdb->ecode = TDB_ERR_NOEXIST;
 114         return 0;
 115 }
 116
 117 /* As tdb_find, but if you succeed, keep the lock */
 118 tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
 119                            struct tdb_record *rec)
 120 {
 121         uint32_t rec_ptr;
 122
 123         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 124                 return 0;
 125         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 126                 tdb_unlock(tdb, BUCKET(hash), locktype);
 127         return rec_ptr;
 128 }
 129
 130 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
 131
 132 static int tdb_update_hash_cmp(TDB_DATA key, TDB_DATA data, void *private_data)
 133 {
 134         TDB_DATA *dbuf = (TDB_DATA *)private_data;
 135
 136         if (dbuf->dsize != data.dsize) {
 137                 return -1;
 138         }
 139         if (memcmp(dbuf->dptr, data.dptr, data.dsize) != 0) {
 140                 return -1;
 141         }
 142         return 0;
 143 }
 144
 145 /* update an entry in place - this only works if the new data size
 146    is <= the old data size and the key exists.
 147    on failure return -1.
 148 */
 149 static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
 150 {
 151         struct tdb_record rec;
 152         tdb_off_t rec_ptr;
 153
 154         /* find entry */
 155         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 156                 return -1;
 157
 158         /* it could be an exact duplicate of what is there - this is
 159          * surprisingly common (eg. with a ldb re-index). */
 160         if (rec.data_len == dbuf.dsize &&
 161             tdb_parse_record(tdb, key, tdb_update_hash_cmp, &dbuf) == 0) {
 162                 return 0;
 163         }
 164
 165         /* must be long enough key, data and tailer */
 166         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
 167                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
 168                 return -1;
 169         }
 170
 171         if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 172                       dbuf.dptr, dbuf.dsize) == -1)
 173                 return -1;
 174
 175         if (dbuf.dsize != rec.data_len) {
 176                 /* update size */
 177                 rec.data_len = dbuf.dsize;
 178                 return tdb_rec_write(tdb, rec_ptr, &rec);
 179         }
 180
 181         return 0;
 182 }
 183
 184 /* find an entry in the database given a key */
 185 /* If an entry doesn't exist tdb_err will be set to
 186  * TDB_ERR_NOEXIST. If a key has no data attached
 187  * then the TDB_DATA will have zero length but
 188  * a non-zero pointer
 189  */
 190 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 191 {
 192         tdb_off_t rec_ptr;
 193         struct tdb_record rec;
 194         TDB_DATA ret;
 195         uint32_t hash;
 196
 197         /* find which hash bucket it is in */
 198         hash = tdb->hash_fn(&key);
 199         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 200                 return tdb_null;
 201
 202         ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 203                                   rec.data_len);
 204         ret.dsize = rec.data_len;
 205         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 206         return ret;
 207 }
 208
 209 _PUBLIC_ TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 210 {
 211         TDB_DATA ret = _tdb_fetch(tdb, key);
 212
 213         tdb_trace_1rec_retrec(tdb, "tdb_fetch", key, ret);
 214         return ret;
 215 }
 216
 217 /*
 218  * Find an entry in the database and hand the record's data to a parsing
 219  * function. The parsing function is executed under the chain read lock, so it
 220  * should be fast and should not block on other syscalls.
 221  *
 222  * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
 223  *
 224  * For mmapped tdb's that do not have a transaction open it points the parsing
 225  * function directly at the mmap area, it avoids the malloc/memcpy in this
 226  * case. If a transaction is open or no mmap is available, it has to do
 227  * malloc/read/parse/free.
 228  *
 229  * This is interesting for all readers of potentially large data structures in
 230  * the tdb records, ldb indexes being one example.
 231  *
 232  * Return -1 if the record was not found.
 233  */
 234
 235 _PUBLIC_ int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
 236                      int (*parser)(TDB_DATA key, TDB_DATA data,
 237                                    void *private_data),
 238                      void *private_data)
 239 {
 240         tdb_off_t rec_ptr;
 241         struct tdb_record rec;
 242         int ret;
 243         uint32_t hash;
 244
 245         /* find which hash bucket it is in */
 246         hash = tdb->hash_fn(&key);
 247
 248         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
 249                 /* record not found */
 250                 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, -1);
 251                 tdb->ecode = TDB_ERR_NOEXIST;
 252                 return -1;
 253         }
 254         tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, 0);
 255
 256         ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
 257                              rec.data_len, parser, private_data);
 258
 259         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 260
 261         return ret;
 262 }
 263
 264 /* check if an entry in the database exists
 265
 266    note that 1 is returned if the key is found and 0 is returned if not found
 267    this doesn't match the conventions in the rest of this module, but is
 268    compatible with gdbm
 269 */
 270 static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 271 {
 272         struct tdb_record rec;
 273
 274         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 275                 return 0;
 276         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 277         return 1;
 278 }
 279
 280 _PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
 281 {
 282         uint32_t hash = tdb->hash_fn(&key);
 283         int ret;
 284
 285         ret = tdb_exists_hash(tdb, key, hash);
 286         tdb_trace_1rec_ret(tdb, "tdb_exists", key, ret);
 287         return ret;
 288 }
 289
 290 /* actually delete an entry in the database given the offset */
 291 int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec)
 292 {
 293         tdb_off_t last_ptr, i;
 294         struct tdb_record lastrec;
 295
 296         if (tdb->read_only || tdb->traverse_read) return -1;
 297
 298         if (((tdb->traverse_write != 0) && (!TDB_DEAD(rec))) ||
 299             tdb_write_lock_record(tdb, rec_ptr) == -1) {
 300                 /* Someone traversing here: mark it as dead */
 301                 rec->magic = TDB_DEAD_MAGIC;
 302                 return tdb_rec_write(tdb, rec_ptr, rec);
 303         }
 304         if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
 305                 return -1;
 306
 307         /* find previous record in hash chain */
 308         if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
 309                 return -1;
 310         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
 311                 if (tdb_rec_read(tdb, i, &lastrec) == -1)
 312                         return -1;
 313
 314         /* unlink it: next ptr is at start of record. */
 315         if (last_ptr == 0)
 316                 last_ptr = TDB_HASH_TOP(rec->full_hash);
 317         if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
 318                 return -1;
 319
 320         /* recover the space */
 321         if (tdb_free(tdb, rec_ptr, rec) == -1)
 322                 return -1;
 323         return 0;
 324 }
 325
 326 static int tdb_count_dead(struct tdb_context *tdb, uint32_t hash)
 327 {
 328         int res = 0;
 329         tdb_off_t rec_ptr;
 330         struct tdb_record rec;
 331
 332         /* read in the hash top */
 333         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 334                 return 0;
 335
 336         while (rec_ptr) {
 337                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1)
 338                         return 0;
 339
 340                 if (rec.magic == TDB_DEAD_MAGIC) {
 341                         res += 1;
 342                 }
 343                 rec_ptr = rec.next;
 344         }
 345         return res;
 346 }
 347
 348 /*
 349  * Purge all DEAD records from a hash chain
 350  */
 351 int tdb_purge_dead(struct tdb_context *tdb, uint32_t hash)
 352 {
 353         int res = -1;
 354         struct tdb_record rec;
 355         tdb_off_t rec_ptr;
 356
 357         if (tdb_lock_nonblock(tdb, -1, F_WRLCK) == -1) {
 358                 /*
 359                  * Don't block the freelist if not strictly necessary
 360                  */
 361                 return -1;
 362         }
 363
 364         /* read in the hash top */
 365         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 366                 goto fail;
 367
 368         while (rec_ptr) {
 369                 tdb_off_t next;
 370
 371                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
 372                         goto fail;
 373                 }
 374
 375                 next = rec.next;
 376
 377                 if (rec.magic == TDB_DEAD_MAGIC
 378                     && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
 379                         goto fail;
 380                 }
 381                 rec_ptr = next;
 382         }
 383         res = 0;
 384  fail:
 385         tdb_unlock(tdb, -1, F_WRLCK);
 386         return res;
 387 }
 388
 389 /* delete an entry in the database given a key */
 390 static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 391 {
 392         tdb_off_t rec_ptr;
 393         struct tdb_record rec;
 394         int ret;
 395
 396         rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec);
 397         if (rec_ptr == 0) {
 398                 return -1;
 399         }
 400
 401         if (tdb->max_dead_records != 0) {
 402
 403                 uint32_t magic = TDB_DEAD_MAGIC;
 404
 405                 /*
 406                  * Allow for some dead records per hash chain, mainly for
 407                  * tdb's with a very high create/delete rate like locking.tdb.
 408                  */
 409
 410                 if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
 411                         /*
 412                          * Don't let the per-chain freelist grow too large,
 413                          * delete all existing dead records
 414                          */
 415                         tdb_purge_dead(tdb, hash);
 416                 }
 417
 418                 /*
 419                  * Just mark the record as dead.
 420                  */
 421                 ret = tdb_ofs_write(
 422                         tdb, rec_ptr + offsetof(struct tdb_record, magic),
 423                         &magic);
 424         }
 425         else {
 426                 ret = tdb_do_delete(tdb, rec_ptr, &rec);
 427         }
 428
 429         if (ret == 0) {
 430                 tdb_increment_seqnum(tdb);
 431         }
 432
 433         if (tdb_unlock(tdb, BUCKET(hash), F_WRLCK) != 0)
 434                 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
 435         return ret;
 436 }
 437
 438 _PUBLIC_ int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
 439 {
 440         uint32_t hash = tdb->hash_fn(&key);
 441         int ret;
 442
 443         ret = tdb_delete_hash(tdb, key, hash);
 444         tdb_trace_1rec_ret(tdb, "tdb_delete", key, ret);
 445         return ret;
 446 }
 447
 448 /*
 449  * See if we have a dead record around with enough space
 450  */
 451 tdb_off_t tdb_find_dead(struct tdb_context *tdb, uint32_t hash,
 452                         struct tdb_record *r, tdb_len_t length,
 453                         tdb_off_t *p_last_ptr)
 454 {
 455         tdb_off_t rec_ptr, last_ptr;
 456         tdb_off_t best_rec_ptr = 0;
 457         tdb_off_t best_last_ptr = 0;
 458         struct tdb_record best = { .rec_len = UINT32_MAX };
 459
 460         length += sizeof(tdb_off_t); /* tailer */
 461
 462         last_ptr = TDB_HASH_TOP(hash);
 463
 464         /* read in the hash top */
 465         if (tdb_ofs_read(tdb, last_ptr, &rec_ptr) == -1)
 466                 return 0;
 467
 468         /* keep looking until we find the right record */
 469         while (rec_ptr) {
 470                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
 471                         return 0;
 472
 473                 if (TDB_DEAD(r) && (r->rec_len >= length) &&
 474                     (r->rec_len < best.rec_len)) {
 475                         best_rec_ptr = rec_ptr;
 476                         best_last_ptr = last_ptr;
 477                         best = *r;
 478                 }
 479                 last_ptr = rec_ptr;
 480                 rec_ptr = r->next;
 481         }
 482
 483         if (best.rec_len == UINT32_MAX) {
 484                 return 0;
 485         }
 486
 487         *r = best;
 488         *p_last_ptr = best_last_ptr;
 489         return best_rec_ptr;
 490 }
 491
 492 static int _tdb_store(struct tdb_context *tdb, TDB_DATA key,
 493                        TDB_DATA dbuf, int flag, uint32_t hash)
 494 {
 495         struct tdb_record rec;
 496         tdb_off_t rec_ptr;
 497         tdb_len_t rec_len;
 498         int ret = -1;
 499
 500         rec_len = key.dsize + dbuf.dsize;
 501         if ((rec_len < key.dsize) || (rec_len < dbuf.dsize)) {
 502                 tdb->ecode = TDB_ERR_OOM;
 503                 goto fail;
 504         }
 505
 506         /* check for it existing, on insert. */
 507         if (flag == TDB_INSERT) {
 508                 if (tdb_exists_hash(tdb, key, hash)) {
 509                         tdb->ecode = TDB_ERR_EXISTS;
 510                         goto fail;
 511                 }
 512         } else {
 513                 /* first try in-place update, on modify or replace. */
 514                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
 515                         goto done;
 516                 }
 517                 if (tdb->ecode == TDB_ERR_NOEXIST &&
 518                     flag == TDB_MODIFY) {
 519                         /* if the record doesn't exist and we are in TDB_MODIFY mode then
 520                          we should fail the store */
 521                         goto fail;
 522                 }
 523         }
 524         /* reset the error code potentially set by the tdb_update_hash() */
 525         tdb->ecode = TDB_SUCCESS;
 526
 527         /* delete any existing record - if it doesn't exist we don't
 528            care.  Doing this first reduces fragmentation, and avoids
 529            coalescing with `allocated' block before it's updated. */
 530         if (flag != TDB_INSERT)
 531                 tdb_delete_hash(tdb, key, hash);
 532
 533         /* we have to allocate some space */
 534         rec_ptr = tdb_allocate(tdb, hash, rec_len, &rec);
 535
 536         if (rec_ptr == 0) {
 537                 goto fail;
 538         }
 539
 540         /* Read hash top into next ptr */
 541         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 542                 goto fail;
 543
 544         rec.key_len = key.dsize;
 545         rec.data_len = dbuf.dsize;
 546         rec.full_hash = hash;
 547         rec.magic = TDB_MAGIC;
 548
 549         /* write out and point the top of the hash chain at it */
 550         if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
 551             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec),
 552                                        key.dptr, key.dsize) == -1
 553             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec)+key.dsize,
 554                                        dbuf.dptr, dbuf.dsize) == -1
 555             || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 556                 /* Need to tdb_unallocate() here */
 557                 goto fail;
 558         }
 559
 560  done:
 561         ret = 0;
 562  fail:
 563         if (ret == 0) {
 564                 tdb_increment_seqnum(tdb);
 565         }
 566         return ret;
 567 }
 568
 569 /* store an element in the database, replacing any existing element
 570    with the same key
 571
 572    return 0 on success, -1 on failure
 573 */
 574 _PUBLIC_ int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 575 {
 576         uint32_t hash;
 577         int ret;
 578
 579         if (tdb->read_only || tdb->traverse_read) {
 580                 tdb->ecode = TDB_ERR_RDONLY;
 581                 tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, -1);
 582                 return -1;
 583         }
 584
 585         /* find which hash bucket it is in */
 586         hash = tdb->hash_fn(&key);
 587         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 588                 return -1;
 589
 590         ret = _tdb_store(tdb, key, dbuf, flag, hash);
 591         tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, ret);
 592         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 593         return ret;
 594 }
 595
 596 /* Append to an entry. Create if not exist. */
 597 _PUBLIC_ int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 598 {
 599         uint32_t hash;
 600         TDB_DATA dbuf;
 601         int ret = -1;
 602
 603         /* find which hash bucket it is in */
 604         hash = tdb->hash_fn(&key);
 605         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 606                 return -1;
 607
 608         dbuf = _tdb_fetch(tdb, key);
 609
 610         if (dbuf.dptr == NULL) {
 611                 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
 612         } else {
 613                 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
 614                 unsigned char *new_dptr;
 615
 616                 /* realloc '0' is special: don't do that. */
 617                 if (new_len == 0)
 618                         new_len = 1;
 619                 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
 620                 if (new_dptr == NULL) {
 621                         free(dbuf.dptr);
 622                 }
 623                 dbuf.dptr = new_dptr;
 624         }
 625
 626         if (dbuf.dptr == NULL) {
 627                 tdb->ecode = TDB_ERR_OOM;
 628                 goto failed;
 629         }
 630
 631         memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
 632         dbuf.dsize += new_dbuf.dsize;
 633
 634         ret = _tdb_store(tdb, key, dbuf, 0, hash);
 635         tdb_trace_2rec_retrec(tdb, "tdb_append", key, new_dbuf, dbuf);
 636
 637 failed:
 638         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 639         SAFE_FREE(dbuf.dptr);
 640         return ret;
 641 }
 642
 643
 644 /*
 645   return the name of the current tdb file
 646   useful for external logging functions
 647 */
 648 _PUBLIC_ const char *tdb_name(struct tdb_context *tdb)
 649 {
 650         return tdb->name;
 651 }
 652
 653 /*
 654   return the underlying file descriptor being used by tdb, or -1
 655   useful for external routines that want to check the device/inode
 656   of the fd
 657 */
 658 _PUBLIC_ int tdb_fd(struct tdb_context *tdb)
 659 {
 660         return tdb->fd;
 661 }
 662
 663 /*
 664   return the current logging function
 665   useful for external tdb routines that wish to log tdb errors
 666 */
 667 _PUBLIC_ tdb_log_func tdb_log_fn(struct tdb_context *tdb)
 668 {
 669         return tdb->log.log_fn;
 670 }
 671
 672
 673 /*
 674   get the tdb sequence number. Only makes sense if the writers opened
 675   with TDB_SEQNUM set. Note that this sequence number will wrap quite
 676   quickly, so it should only be used for a 'has something changed'
 677   test, not for code that relies on the count of the number of changes
 678   made. If you want a counter then use a tdb record.
 679
 680   The aim of this sequence number is to allow for a very lightweight
 681   test of a possible tdb change.
 682 */
 683 _PUBLIC_ int tdb_get_seqnum(struct tdb_context *tdb)
 684 {
 685         tdb_off_t seqnum=0;
 686
 687         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
 688         return seqnum;
 689 }
 690
 691 _PUBLIC_ int tdb_hash_size(struct tdb_context *tdb)
 692 {
 693         return tdb->hash_size;
 694 }
 695
 696 _PUBLIC_ size_t tdb_map_size(struct tdb_context *tdb)
 697 {
 698         return tdb->map_size;
 699 }
 700
 701 _PUBLIC_ int tdb_get_flags(struct tdb_context *tdb)
 702 {
 703         return tdb->flags;
 704 }
 705
 706 _PUBLIC_ void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
 707 {
 708         if ((flags & TDB_ALLOW_NESTING) &&
 709             (flags & TDB_DISALLOW_NESTING)) {
 710                 tdb->ecode = TDB_ERR_NESTING;
 711                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_add_flags: "
 712                         "allow_nesting and disallow_nesting are not allowed together!"));
 713                 return;
 714         }
 715
 716         if (flags & TDB_ALLOW_NESTING) {
 717                 tdb->flags &= ~TDB_DISALLOW_NESTING;
 718         }
 719         if (flags & TDB_DISALLOW_NESTING) {
 720                 tdb->flags &= ~TDB_ALLOW_NESTING;
 721         }
 722
 723         tdb->flags |= flags;
 724 }
 725
 726 _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
 727 {
 728         if ((flags & TDB_ALLOW_NESTING) &&
 729             (flags & TDB_DISALLOW_NESTING)) {
 730                 tdb->ecode = TDB_ERR_NESTING;
 731                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
 732                         "allow_nesting and disallow_nesting are not allowed together!"));
 733                 return;
 734         }
 735
 736         if ((flags & TDB_NOLOCK) &&
 737             (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) &&
 738             (tdb->mutexes == NULL)) {
 739                 tdb->ecode = TDB_ERR_LOCK;
 740                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
 741                          "Can not remove NOLOCK flag on mutexed databases"));
 742                 return;
 743         }
 744
 745         if (flags & TDB_ALLOW_NESTING) {
 746                 tdb->flags |= TDB_DISALLOW_NESTING;
 747         }
 748         if (flags & TDB_DISALLOW_NESTING) {
 749                 tdb->flags |= TDB_ALLOW_NESTING;
 750         }
 751
 752         tdb->flags &= ~flags;
 753 }
 754
 755
 756 /*
 757   enable sequence number handling on an open tdb
 758 */
 759 _PUBLIC_ void tdb_enable_seqnum(struct tdb_context *tdb)
 760 {
 761         tdb->flags |= TDB_SEQNUM;
 762 }
 763
 764
 765 /*
 766   add a region of the file to the freelist. Length is the size of the region in bytes,
 767   which includes the free list header that needs to be added
 768  */
 769 static int tdb_free_region(struct tdb_context *tdb, tdb_off_t offset, ssize_t length)
 770 {
 771         struct tdb_record rec;
 772         if (length <= sizeof(rec)) {
 773                 /* the region is not worth adding */
 774                 return 0;
 775         }
 776         if (length + offset > tdb->map_size) {
 777                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: adding region beyond end of file\n"));
 778                 return -1;
 779         }
 780         memset(&rec,'\0',sizeof(rec));
 781         rec.rec_len = length - sizeof(rec);
 782         if (tdb_free(tdb, offset, &rec) == -1) {
 783                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: failed to add free record\n"));
 784                 return -1;
 785         }
 786         return 0;
 787 }
 788
 789 /*
 790   wipe the entire database, deleting all records. This can be done
 791   very fast by using a allrecord lock. The entire data portion of the
 792   file becomes a single entry in the freelist.
 793
 794   This code carefully steps around the recovery area, leaving it alone
 795  */
 796 _PUBLIC_ int tdb_wipe_all(struct tdb_context *tdb)
 797 {
 798         uint32_t i;
 799         tdb_off_t offset = 0;
 800         ssize_t data_len;
 801         tdb_off_t recovery_head;
 802         tdb_len_t recovery_size = 0;
 803
 804         if (tdb_lockall(tdb) != 0) {
 805                 return -1;
 806         }
 807
 808         tdb_trace(tdb, "tdb_wipe_all");
 809
 810         /* see if the tdb has a recovery area, and remember its size
 811            if so. We don't want to lose this as otherwise each
 812            tdb_wipe_all() in a transaction will increase the size of
 813            the tdb by the size of the recovery area */
 814         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
 815                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery head\n"));
 816                 goto failed;
 817         }
 818
 819         if (recovery_head != 0) {
 820                 struct tdb_record rec;
 821                 if (tdb->methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
 822                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery record\n"));
 823                         return -1;
 824                 }
 825                 recovery_size = rec.rec_len + sizeof(rec);
 826         }
 827
 828         /* wipe the hashes */
 829         for (i=0;i<tdb->hash_size;i++) {
 830                 if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) {
 831                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i));
 832                         goto failed;
 833                 }
 834         }
 835
 836         /* wipe the freelist */
 837         if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 838                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n"));
 839                 goto failed;
 840         }
 841
 842         /* add all the rest of the file to the freelist, possibly leaving a gap
 843            for the recovery area */
 844         if (recovery_size == 0) {
 845                 /* the simple case - the whole file can be used as a freelist */
 846                 data_len = (tdb->map_size - TDB_DATA_START(tdb->hash_size));
 847                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
 848                         goto failed;
 849                 }
 850         } else {
 851                 /* we need to add two freelist entries - one on either
 852                    side of the recovery area
 853
 854                    Note that we cannot shift the recovery area during
 855                    this operation. Only the transaction.c code may
 856                    move the recovery area or we risk subtle data
 857                    corruption
 858                 */
 859                 data_len = (recovery_head - TDB_DATA_START(tdb->hash_size));
 860                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
 861                         goto failed;
 862                 }
 863                 /* and the 2nd free list entry after the recovery area - if any */
 864                 data_len = tdb->map_size - (recovery_head+recovery_size);
 865                 if (tdb_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
 866                         goto failed;
 867                 }
 868         }
 869
 870         tdb_increment_seqnum_nonblock(tdb);
 871
 872         if (tdb_unlockall(tdb) != 0) {
 873                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n"));
 874                 goto failed;
 875         }
 876
 877         return 0;
 878
 879 failed:
 880         tdb_unlockall(tdb);
 881         return -1;
 882 }
 883
 884 struct traverse_state {
 885         bool error;
 886         struct tdb_context *dest_db;
 887 };
 888
 889 /*
 890   traverse function for repacking
 891  */
 892 static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
 893 {
 894         struct traverse_state *state = (struct traverse_state *)private_data;
 895         if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) {
 896                 state->error = true;
 897                 return -1;
 898         }
 899         return 0;
 900 }
 901
 902 /*
 903   repack a tdb
 904  */
 905 _PUBLIC_ int tdb_repack(struct tdb_context *tdb)
 906 {
 907         struct tdb_context *tmp_db;
 908         struct traverse_state state;
 909
 910         tdb_trace(tdb, "tdb_repack");
 911
 912         if (tdb_transaction_start(tdb) != 0) {
 913                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to start transaction\n"));
 914                 return -1;
 915         }
 916
 917         tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0);
 918         if (tmp_db == NULL) {
 919                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to create tmp_db\n"));
 920                 tdb_transaction_cancel(tdb);
 921                 return -1;
 922         }
 923
 924         state.error = false;
 925         state.dest_db = tmp_db;
 926
 927         if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) {
 928                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying out\n"));
 929                 tdb_transaction_cancel(tdb);
 930                 tdb_close(tmp_db);
 931                 return -1;
 932         }
 933
 934         if (state.error) {
 935                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during traversal\n"));
 936                 tdb_transaction_cancel(tdb);
 937                 tdb_close(tmp_db);
 938                 return -1;
 939         }
 940
 941         if (tdb_wipe_all(tdb) != 0) {
 942                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to wipe database\n"));
 943                 tdb_transaction_cancel(tdb);
 944                 tdb_close(tmp_db);
 945                 return -1;
 946         }
 947
 948         state.error = false;
 949         state.dest_db = tdb;
 950
 951         if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) {
 952                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying back\n"));
 953                 tdb_transaction_cancel(tdb);
 954                 tdb_close(tmp_db);
 955                 return -1;
 956         }
 957
 958         if (state.error) {
 959                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during second traversal\n"));
 960                 tdb_transaction_cancel(tdb);
 961                 tdb_close(tmp_db);
 962                 return -1;
 963         }
 964
 965         tdb_close(tmp_db);
 966
 967         if (tdb_transaction_commit(tdb) != 0) {
 968                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to commit\n"));
 969                 return -1;
 970         }
 971
 972         return 0;
 973 }
 974
 975 /* Even on files, we can get partial writes due to signals. */
 976 bool tdb_write_all(int fd, const void *buf, size_t count)
 977 {
 978         while (count) {
 979                 ssize_t ret;
 980                 ret = write(fd, buf, count);
 981                 if (ret < 0)
 982                         return false;
 983                 buf = (const char *)buf + ret;
 984                 count -= ret;
 985         }
 986         return true;
 987 }
 988
 989 bool tdb_add_off_t(tdb_off_t a, tdb_off_t b, tdb_off_t *pret)
 990 {
 991         tdb_off_t ret = a + b;
 992
 993         if ((ret < a) || (ret < b)) {
 994                 return false;
 995         }
 996         *pret = ret;
 997         return true;
 998 }
 999
1000 #ifdef TDB_TRACE
1001 static void tdb_trace_write(struct tdb_context *tdb, const char *str)
1002 {
1003         if (!tdb_write_all(tdb->tracefd, str, strlen(str))) {
1004                 close(tdb->tracefd);
1005                 tdb->tracefd = -1;
1006         }
1007 }
1008
1009 static void tdb_trace_start(struct tdb_context *tdb)
1010 {
1011         tdb_off_t seqnum=0;
1012         char msg[sizeof(tdb_off_t) * 4 + 1];
1013
1014         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
1015         snprintf(msg, sizeof(msg), "%u ", seqnum);
1016         tdb_trace_write(tdb, msg);
1017 }
1018
/*
  finish a trace line by writing the terminating newline
 */
static void tdb_trace_end(struct tdb_context *tdb)
{
        tdb_trace_write(tdb, "\n");
}
1023
/*
  finish a trace line that carries an integer result: writes
  " = <ret>" followed by the newline
 */
static void tdb_trace_end_ret(struct tdb_context *tdb, int ret)
{
        char tail[sizeof(ret) * 4 + 4];

        snprintf(tail, sizeof(tail), " = %i\n", ret);
        tdb_trace_write(tdb, tail);
}
1030
1031 static void tdb_trace_record(struct tdb_context *tdb, TDB_DATA rec)
1032 {
1033         char msg[20 + rec.dsize*2], *p;
1034         unsigned int i;
1035
1036         /* We differentiate zero-length records from non-existent ones. */
1037         if (rec.dptr == NULL) {
1038                 tdb_trace_write(tdb, " NULL");
1039                 return;
1040         }
1041
1042         /* snprintf here is purely cargo-cult programming. */
1043         p = msg;
1044         p += snprintf(p, sizeof(msg), " %zu:", rec.dsize);
1045         for (i = 0; i < rec.dsize; i++)
1046                 p += snprintf(p, 2, "%02x", rec.dptr[i]);
1047
1048         tdb_trace_write(tdb, msg);
1049 }
1050
/*
  write one complete trace line for an operation with no arguments
  and no result: the sequence number prefix, op, then a newline
 */
void tdb_trace(struct tdb_context *tdb, const char *op)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_end(tdb);
}
1057
1058 void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op)
1059 {
1060         char msg[sizeof(tdb_off_t) * 4 + 1];
1061
1062         snprintf(msg, sizeof(msg), "%u ", seqnum);
1063         tdb_trace_write(tdb, msg);
1064         tdb_trace_write(tdb, op);
1065         tdb_trace_end(tdb);
1066 }
1067
/*
  write one complete trace line for an open call:
  "<op> <hash_size> 0x<tdb_flags> 0x<open_flags>" after the sequence
  number prefix
 */
void tdb_trace_open(struct tdb_context *tdb, const char *op,
                    unsigned hash_size, unsigned tdb_flags, unsigned open_flags)
{
        char args[128];

        tdb_trace_start(tdb);

        /* snprintf bounds the text to the buffer, so a long op is cut short */
        snprintf(args, sizeof(args),
                 "%s %u 0x%x 0x%x", op, hash_size, tdb_flags, open_flags);
        tdb_trace_write(tdb, args);

        tdb_trace_end(tdb);
}
1079
/*
  write one complete trace line for an operation with no arguments
  and an integer result: sequence number prefix, op, " = <ret>"
 */
void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_end_ret(tdb, ret);
}
1086
/*
  write one complete trace line for an operation with no arguments
  whose result is a record: sequence number prefix, op, " =", then
  the record as formatted by tdb_trace_record
 */
void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_write(tdb, " =");
        tdb_trace_record(tdb, ret);
        tdb_trace_end(tdb);
}
1095
/*
  write one complete trace line for an operation taking one record
  and producing no result: sequence number prefix, op, the record
 */
void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
                    TDB_DATA rec)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec);
        tdb_trace_end(tdb);
}
1104
/*
  write one complete trace line for an operation taking one record
  and returning an integer: sequence number prefix, op, the record,
  " = <ret>"
 */
void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
                        TDB_DATA rec, int ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec);
        tdb_trace_end_ret(tdb, ret);
}
1113
/*
  write one complete trace line for an operation taking one record
  and returning a record: sequence number prefix, op, the argument
  record, " =", the result record
 */
void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
                           TDB_DATA rec, TDB_DATA ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec);
        tdb_trace_write(tdb, " =");
        tdb_trace_record(tdb, ret);
        tdb_trace_end(tdb);
}
1124
1125 void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
1126                              TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
1127                              int ret)
1128 {
1129         char msg[1 + sizeof(ret) * 4];
1130
1131         snprintf(msg, sizeof(msg), " %#x", flag);
1132         tdb_trace_start(tdb);
1133         tdb_trace_write(tdb, op);
1134         tdb_trace_record(tdb, rec1);
1135         tdb_trace_record(tdb, rec2);
1136         tdb_trace_write(tdb, msg);
1137         tdb_trace_end_ret(tdb, ret);
1138 }
1139
/*
  write one complete trace line for an operation taking two records
  and returning a record: sequence number prefix, op, both argument
  records, " =", the result record
 */
void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
                           TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec1);
        tdb_trace_record(tdb, rec2);
        tdb_trace_write(tdb, " =");
        tdb_trace_record(tdb, ret);
        tdb_trace_end(tdb);
}
1151 #endif