source3/tdb/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3    Samba database functions
   4    Copyright (C) Andrew Tridgell              1999-2000
   5    Copyright (C) Luke Kenneth Casson Leighton      2000
   6    Copyright (C) Paul `Rusty' Russell              2000
   7    Copyright (C) Jeremy Allison                    2000
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 2 of the License, or
  12    (at your option) any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program; if not, write to the Free Software
  21    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  22 */
  23 #ifdef STANDALONE
  24 #if HAVE_CONFIG_H
  25 #include <config.h>
  26 #endif
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <fcntl.h>
  31 #include <unistd.h>
  32 #include <string.h>
  33 #include <fcntl.h>
  34 #include <errno.h>
  35 #include <sys/mman.h>
  36 #include <sys/stat.h>
  37 #include "tdb.h"
  38 #include "spinlock.h"
  39 #else
  40 #include "includes.h"
  41 #endif
  42
  43 #define TDB_MAGIC_FOOD "TDB file\n"
  44 #define TDB_VERSION (0x26011967 + 6)
  45 #define TDB_MAGIC (0x26011999U)
  46 #define TDB_FREE_MAGIC (~TDB_MAGIC)
  47 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
  48 #define TDB_ALIGNMENT 4
  49 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
  50 #define DEFAULT_HASH_SIZE 131
  51 #define TDB_PAGE_SIZE 0x2000
  52 #define FREELIST_TOP (sizeof(struct tdb_header))
  53 #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
  54 #define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
  55 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
  56 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
  57 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
  58
  59 /* NB assumes there is a local variable called "tdb" that is the
  60  * current context, also takes doubly-parenthesized print-style
  61  * argument. */
  62 #define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)
  63
  64 /* lock offsets */
  65 #define GLOBAL_LOCK 0
  66 #define ACTIVE_LOCK 4
  67
  68 #ifndef MAP_FILE
  69 #define MAP_FILE 0
  70 #endif
  71
  72 #ifndef MAP_FAILED
  73 #define MAP_FAILED ((void *)-1)
  74 #endif
  75
  76 /* free memory if the pointer is valid and zero the pointer */
  77 #ifndef SAFE_FREE
  78 #define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
  79 #endif
  80
  81 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
  82 TDB_DATA tdb_null;
  83
  84 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
  85 static TDB_CONTEXT *tdbs = NULL;
  86
  87 static void tdb_munmap(TDB_CONTEXT *tdb)
  88 {
  89         if (tdb->flags & TDB_INTERNAL)
  90                 return;
  91
  92 #ifdef HAVE_MMAP
  93         if (tdb->map_ptr)
  94                 munmap(tdb->map_ptr, tdb->map_size);
  95 #endif
  96         tdb->map_ptr = NULL;
  97 }
  98
  99 static void tdb_mmap(TDB_CONTEXT *tdb)
 100 {
 101         if (tdb->flags & TDB_INTERNAL)
 102                 return;
 103
 104 #ifdef HAVE_MMAP
 105         if (!(tdb->flags & TDB_NOMMAP)) {
 106                 tdb->map_ptr = mmap(NULL, tdb->map_size,
 107                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
 108                                     MAP_SHARED|MAP_FILE, tdb->fd, 0);
 109
 110                 /*
 111                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
 112                  */
 113
 114                 if (tdb->map_ptr == MAP_FAILED) {
 115                         tdb->map_ptr = NULL;
 116                         TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
 117                                  tdb->map_size, strerror(errno)));
 118                 }
 119         } else {
 120                 tdb->map_ptr = NULL;
 121         }
 122 #else
 123         tdb->map_ptr = NULL;
 124 #endif
 125 }
 126
 127 /* Endian conversion: we only ever deal with 4 byte quantities */
 128 static void *convert(void *buf, u32 size)
 129 {
 130         u32 i, *p = buf;
 131         for (i = 0; i < size / 4; i++)
 132                 p[i] = TDB_BYTEREV(p[i]);
 133         return buf;
 134 }
 135 #define DOCONV() (tdb->flags & TDB_CONVERT)
 136 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
 137
 138 /* the body of the database is made of one list_struct for the free space
 139    plus a separate data list for each hash value */
 140 struct list_struct {
 141         tdb_off next; /* offset of the next record in the list */
 142         tdb_len rec_len; /* total byte length of record */
 143         tdb_len key_len; /* byte length of key */
 144         tdb_len data_len; /* byte length of data */
 145         u32 full_hash; /* the full 32 bit hash of the key */
 146         u32 magic;   /* try to catch errors */
 147         /* the following union is implied:
 148                 union {
 149                         char record[rec_len];
 150                         struct {
 151                                 char key[key_len];
 152                                 char data[data_len];
 153                         }
 154                         u32 totalsize; (tailer)
 155                 }
 156         */
 157 };
 158
 159 /* a byte range locking function - return 0 on success
 160    this functions locks/unlocks 1 byte at the specified offset.
 161
 162    On error, errno is also set so that errors are passed back properly
 163    through tdb_open(). */
 164 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
 165                       int rw_type, int lck_type, int probe)
 166 {
 167         struct flock fl;
 168
 169         if (tdb->flags & TDB_NOLOCK)
 170                 return 0;
 171         if (tdb->read_only) {
 172                 errno = EACCES;
 173                 return -1;
 174         }
 175
 176         fl.l_type = rw_type;
 177         fl.l_whence = SEEK_SET;
 178         fl.l_start = offset;
 179         fl.l_len = 1;
 180         fl.l_pid = 0;
 181
 182         if (fcntl(tdb->fd,lck_type,&fl) == -1) {
 183                 if (!probe) {
 184                         TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 185                                  tdb->fd, offset, rw_type, lck_type));
 186                 }
 187                 /* errno set by fcntl */
 188                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 189         }
 190         return 0;
 191 }
 192
 193 /* lock a list in the database. list -1 is the alloc list */
 194 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
 195 {
 196         if (list < -1 || list >= (int)tdb->header.hash_size) {
 197                 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
 198                            list, ltype));
 199                 return -1;
 200         }
 201         if (tdb->flags & TDB_NOLOCK)
 202                 return 0;
 203
 204         /* Since fcntl locks don't nest, we do a lock for the first one,
 205            and simply bump the count for future ones */
 206         if (tdb->locked[list+1].count == 0) {
 207                 if (!tdb->read_only && tdb->header.rwlocks) {
 208                         if (tdb_spinlock(tdb, list, ltype)) {
 209                                 TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list ltype=%d\n",
 210                                            list, ltype));
 211                                 return -1;
 212                         }
 213                 } else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
 214                         TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
 215                                            list, ltype, strerror(errno)));
 216                         return -1;
 217                 }
 218                 tdb->locked[list+1].ltype = ltype;
 219         }
 220         tdb->locked[list+1].count++;
 221         return 0;
 222 }
 223
 224 /* unlock the database: returns void because it's too late for errors. */
 225 static void tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
 226 {
 227         if (tdb->flags & TDB_NOLOCK)
 228                 return;
 229
 230         /* Sanity checks */
 231         if (list < -1 || list >= (int)tdb->header.hash_size)
 232                 return;
 233         if (tdb->locked[list+1].count==0)
 234                 return;
 235
 236         if (tdb->locked[list+1].count == 1) {
 237                 /* Down to last nested lock: unlock underneath */
 238                 if (!tdb->read_only && tdb->header.rwlocks)
 239                         tdb_spinunlock(tdb, list, ltype);
 240                 else
 241                         tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
 242         }
 243         tdb->locked[list+1].count--;
 244 }
 245
 246 /* This is based on the hash agorithm from gdbm */
 247 static u32 tdb_hash(TDB_DATA *key)
 248 {
 249         u32 value;      /* Used to compute the hash value.  */
 250         u32   i;        /* Used to cycle through random values. */
 251
 252         /* Set the initial value from the key size. */
 253         for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
 254                 value = (value + (key->dptr[i] << (i*5 % 24)));
 255
 256         return (1103515243 * value + 12345);
 257 }
 258
 259 /* check for an out of bounds access - if it is out of bounds then
 260    see if the database has been expanded by someone else and expand
 261    if necessary
 262    note that "len" is the minimum length needed for the db
 263 */
 264 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
 265 {
 266         struct stat st;
 267         if (len <= tdb->map_size)
 268                 return 0;
 269         if (tdb->flags & TDB_INTERNAL) {
 270                 if (!probe) {
 271                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
 272                                  (int)len, (int)tdb->map_size));
 273                 }
 274                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 275         }
 276
 277         if (fstat(tdb->fd, &st) == -1)
 278                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 279
 280         if (st.st_size < (size_t)len) {
 281                 if (!probe) {
 282                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
 283                                  (int)len, (int)st.st_size));
 284                 }
 285                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 286         }
 287
 288         /* Unmap, update size, remap */
 289         tdb_munmap(tdb);
 290         tdb->map_size = st.st_size;
 291         tdb_mmap(tdb);
 292         return 0;
 293 }
 294
 295 /* write a lump of data at a specified offset */
 296 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
 297 {
 298         if (tdb_oob(tdb, off + len, 0) != 0)
 299                 return -1;
 300
 301         if (tdb->map_ptr)
 302                 memcpy(off + (char *)tdb->map_ptr, buf, len);
 303 #ifdef HAVE_PWRITE
 304         else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
 305 #else
 306         else if (lseek(tdb->fd, off, SEEK_SET) != off
 307                  || write(tdb->fd, buf, len) != (ssize_t)len) {
 308 #endif
 309                 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
 310                            off, len, strerror(errno)));
 311                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 312         }
 313         return 0;
 314 }
 315
 316 /* read a lump of data at a specified offset, maybe convert */
 317 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
 318 {
 319         if (tdb_oob(tdb, off + len, 0) != 0)
 320                 return -1;
 321
 322         if (tdb->map_ptr)
 323                 memcpy(buf, off + (char *)tdb->map_ptr, len);
 324 #ifdef HAVE_PREAD
 325         else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
 326 #else
 327         else if (lseek(tdb->fd, off, SEEK_SET) != off
 328                  || read(tdb->fd, buf, len) != (ssize_t)len) {
 329 #endif
 330                 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
 331                            off, len, strerror(errno)));
 332                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 333         }
 334         if (cv)
 335                 convert(buf, len);
 336         return 0;
 337 }
 338
 339 /* read a lump of data, allocating the space for it */
 340 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
 341 {
 342         char *buf;
 343
 344         if (!(buf = malloc(len))) {
 345                 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
 346                            len, strerror(errno)));
 347                 return TDB_ERRCODE(TDB_ERR_OOM, buf);
 348         }
 349         if (tdb_read(tdb, offset, buf, len, 0) == -1) {
 350                 SAFE_FREE(buf);
 351                 return NULL;
 352         }
 353         return buf;
 354 }
 355
 356 /* read/write a tdb_off */
 357 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 358 {
 359         return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
 360 }
 361 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 362 {
 363         tdb_off off = *d;
 364         return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
 365 }
 366
 367 /* read/write a record */
 368 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 369 {
 370         if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
 371                 return -1;
 372         if (TDB_BAD_MAGIC(rec)) {
 373                 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
 374                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 375         }
 376         return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
 377 }
 378 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 379 {
 380         struct list_struct r = *rec;
 381         return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
 382 }
 383
 384 /* read a freelist record and check for simple errors */
 385 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
 386 {
 387         if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
 388                 return -1;
 389         if (rec->magic != TDB_FREE_MAGIC) {
 390                 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
 391                            rec->magic, off));
 392                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 393         }
 394         if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
 395                 return -1;
 396         return 0;
 397 }
 398
 399 /* update a record tailer (must hold allocation lock) */
 400 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
 401                          const struct list_struct *rec)
 402 {
 403         tdb_off totalsize;
 404
 405         /* Offset of tailer from record header */
 406         totalsize = sizeof(*rec) + rec->rec_len;
 407         return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
 408                          &totalsize);
 409 }
 410
 411 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
 412 {
 413         struct list_struct rec;
 414         tdb_off tailer_ofs, tailer;
 415
 416         if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 417                 printf("ERROR: failed to read record at %u\n", offset);
 418                 return 0;
 419         }
 420
 421         printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
 422                offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
 423
 424         tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
 425         if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
 426                 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
 427                 return rec.next;
 428         }
 429
 430         if (tailer != rec.rec_len + sizeof(rec)) {
 431                 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
 432                                 (unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
 433         }
 434         return rec.next;
 435 }
 436
 437 static void tdb_dump_chain(TDB_CONTEXT *tdb, int i)
 438 {
 439         tdb_off rec_ptr, top;
 440
 441         top = TDB_HASH_TOP(i);
 442
 443         tdb_lock(tdb, i, F_WRLCK);
 444
 445         if (ofs_read(tdb, top, &rec_ptr) == -1) {
 446                 tdb_unlock(tdb, i, F_WRLCK);
 447                 return;
 448         }
 449
 450         if (rec_ptr)
 451                 printf("hash=%d\n", i);
 452
 453         while (rec_ptr) {
 454                 rec_ptr = tdb_dump_record(tdb, rec_ptr);
 455         }
 456         tdb_unlock(tdb, i, F_WRLCK);
 457 }
 458
 459 void tdb_dump_all(TDB_CONTEXT *tdb)
 460 {
 461         int i;
 462         for (i=0;i<tdb->header.hash_size;i++) {
 463                 tdb_dump_chain(tdb, i);
 464         }
 465         printf("freelist:\n");
 466         tdb_dump_chain(tdb, -1);
 467 }
 468
 469 void tdb_printfreelist(TDB_CONTEXT *tdb)
 470 {
 471         long total_free = 0;
 472         tdb_off offset, rec_ptr;
 473         struct list_struct rec;
 474
 475         tdb_lock(tdb, -1, F_WRLCK);
 476
 477         offset = FREELIST_TOP;
 478
 479         /* read in the freelist top */
 480         if (ofs_read(tdb, offset, &rec_ptr) == -1) {
 481                 return;
 482         }
 483
 484         printf("freelist top=[0x%08x]\n", rec_ptr );
 485         while (rec_ptr) {
 486                 if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 487                         return;
 488                 }
 489
 490                 if (rec.magic != TDB_FREE_MAGIC) {
 491                         printf("bad magic 0x%08x in free list\n", rec.magic);
 492                         return;
 493                 }
 494
 495                 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
 496                 total_free += rec.rec_len;
 497
 498                 /* move to the next record */
 499                 rec_ptr = rec.next;
 500         }
 501         printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
 502                (int)total_free);
 503
 504         tdb_unlock(tdb, -1, F_WRLCK);
 505 }
 506
 507 /* Remove an element from the freelist.  Must have alloc lock. */
 508 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
 509 {
 510         tdb_off last_ptr, i;
 511
 512         /* read in the freelist top */
 513         last_ptr = FREELIST_TOP;
 514         while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
 515                 if (i == off) {
 516                         /* We've found it! */
 517                         return ofs_write(tdb, last_ptr, &next);
 518                 }
 519                 /* Follow chain (next offset is at start of record) */
 520                 last_ptr = i;
 521         }
 522         TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
 523         return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 524 }
 525
 526 /* Add an element into the freelist. Merge adjacent records if
 527    neccessary. */
 528 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 529 {
 530         tdb_off right, left;
 531
 532         /* Allocation and tailer lock */
 533         if (tdb_lock(tdb, -1, F_WRLCK) != 0)
 534                 return -1;
 535
 536         /* set an initial tailer, so if we fail we don't leave a bogus record */
 537         update_tailer(tdb, offset, rec);
 538
 539         /* Look right first (I'm an Australian, dammit) */
 540         right = offset + sizeof(*rec) + rec->rec_len;
 541         if (right + sizeof(*rec) <= tdb->map_size) {
 542                 struct list_struct r;
 543
 544                 if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
 545                         TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
 546                         goto left;
 547                 }
 548
 549                 /* If it's free, expand to include it. */
 550                 if (r.magic == TDB_FREE_MAGIC) {
 551                         if (remove_from_freelist(tdb, right, r.next) == -1) {
 552                                 TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
 553                                 goto left;
 554                         }
 555                         rec->rec_len += sizeof(r) + r.rec_len;
 556                 }
 557         }
 558
 559 left:
 560         /* Look left */
 561         left = offset - sizeof(tdb_off);
 562         if (left > TDB_HASH_TOP(tdb->header.hash_size-1)) {
 563                 struct list_struct l;
 564                 tdb_off leftsize;
 565
 566                 /* Read in tailer and jump back to header */
 567                 if (ofs_read(tdb, left, &leftsize) == -1) {
 568                         TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
 569                         goto update;
 570                 }
 571                 left = offset - leftsize;
 572
 573                 /* Now read in record */
 574                 if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
 575                         TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
 576                         goto update;
 577                 }
 578
 579                 /* If it's free, expand to include it. */
 580                 if (l.magic == TDB_FREE_MAGIC) {
 581                         if (remove_from_freelist(tdb, left, l.next) == -1) {
 582                                 TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
 583                                 goto update;
 584                         } else {
 585                                 offset = left;
 586                                 rec->rec_len += leftsize;
 587                         }
 588                 }
 589         }
 590
 591 update:
 592         if (update_tailer(tdb, offset, rec) == -1) {
 593                 TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
 594                 goto fail;
 595         }
 596
 597         /* Now, prepend to free list */
 598         rec->magic = TDB_FREE_MAGIC;
 599
 600         if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
 601             rec_write(tdb, offset, rec) == -1 ||
 602             ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 603                 TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
 604                 goto fail;
 605         }
 606
 607         /* And we're done. */
 608         tdb_unlock(tdb, -1, F_WRLCK);
 609         return 0;
 610
 611  fail:
 612         tdb_unlock(tdb, -1, F_WRLCK);
 613         return -1;
 614 }
 615
 616
 617 /* expand a file.  we prefer to use ftruncate, as that is what posix
 618   says to use for mmap expansion */
 619 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
 620 {
 621         char buf[1024];
 622 #if HAVE_FTRUNCATE_EXTEND
 623         if (ftruncate(tdb->fd, size+addition) != 0) {
 624                 TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
 625                            size+addition, strerror(errno)));
 626                 return -1;
 627         }
 628 #else
 629         char b = 0;
 630
 631 #ifdef HAVE_PWRITE
 632         if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
 633 #else
 634         if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
 635             write(tdb->fd, &b, 1) != 1) {
 636 #endif
 637                 TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
 638                            size+addition, strerror(errno)));
 639                 return -1;
 640         }
 641 #endif
 642
 643         /* now fill the file with something. This ensures that the file isn't sparse, which would be
 644            very bad if we ran out of disk. This must be done with write, not via mmap */
 645         memset(buf, 0x42, sizeof(buf));
 646         while (addition) {
 647                 int n = addition>sizeof(buf)?sizeof(buf):addition;
 648 #ifdef HAVE_PWRITE
 649                 int ret = pwrite(tdb->fd, buf, n, size);
 650 #else
 651                 int ret;
 652                 if (lseek(tdb->fd, size, SEEK_SET) != size)
 653                         return -1;
 654                 ret = write(tdb->fd, buf, n);
 655 #endif
 656                 if (ret != n) {
 657                         TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
 658                                    n, strerror(errno)));
 659                         return -1;
 660                 }
 661                 addition -= n;
 662                 size += n;
 663         }
 664         return 0;
 665 }
 666
 667
 668 /* expand the database at least size bytes by expanding the underlying
 669    file and doing the mmap again if necessary */
 670 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
 671 {
 672         struct list_struct rec;
 673         tdb_off offset;
 674
 675         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 676                 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
 677                 return -1;
 678         }
 679
 680         /* must know about any previous expansions by another process */
 681         tdb_oob(tdb, tdb->map_size + 1, 1);
 682
 683         /* always make room for at least 10 more records, and round
 684            the database up to a multiple of TDB_PAGE_SIZE */
 685         size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
 686
 687         if (!(tdb->flags & TDB_INTERNAL))
 688                 tdb_munmap(tdb);
 689
 690         /*
 691          * We must ensure the file is unmapped before doing this
 692          * to ensure consistency with systems like OpenBSD where
 693          * writes and mmaps are not consistent.
 694          */
 695
 696         /* expand the file itself */
 697         if (!(tdb->flags & TDB_INTERNAL)) {
 698                 if (expand_file(tdb, tdb->map_size, size) != 0)
 699                         goto fail;
 700         }
 701
 702         tdb->map_size += size;
 703
 704         if (tdb->flags & TDB_INTERNAL)
 705                 tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
 706         else {
 707                 /*
 708                  * We must ensure the file is remapped before adding the space
 709                  * to ensure consistency with systems like OpenBSD where
 710                  * writes and mmaps are not consistent.
 711                  */
 712
 713                 /* We're ok if the mmap fails as we'll fallback to read/write */
 714                 tdb_mmap(tdb);
 715         }
 716
 717         /* form a new freelist record */
 718         memset(&rec,'\0',sizeof(rec));
 719         rec.rec_len = size - sizeof(rec);
 720
 721         /* link it into the free list */
 722         offset = tdb->map_size - size;
 723         if (tdb_free(tdb, offset, &rec) == -1)
 724                 goto fail;
 725
 726         tdb_unlock(tdb, -1, F_WRLCK);
 727         return 0;
 728  fail:
 729         tdb_unlock(tdb, -1, F_WRLCK);
 730         return -1;
 731 }
 732
 733 /* allocate some space from the free list. The offset returned points
 734    to a unconnected list_struct within the database with room for at
 735    least length bytes of total data
 736
 737    0 is returned if the space could not be allocated
 738  */
 739 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
 740                             struct list_struct *rec)
 741 {
 742         tdb_off rec_ptr, last_ptr, newrec_ptr;
 743         struct list_struct newrec;
 744
 745         if (tdb_lock(tdb, -1, F_WRLCK) == -1)
 746                 return 0;
 747
 748         /* Extra bytes required for tailer */
 749         length += sizeof(tdb_off);
 750
 751  again:
 752         last_ptr = FREELIST_TOP;
 753
 754         /* read in the freelist top */
 755         if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
 756                 goto fail;
 757
 758         /* keep looking until we find a freelist record big enough */
 759         while (rec_ptr) {
 760                 if (rec_free_read(tdb, rec_ptr, rec) == -1)
 761                         goto fail;
 762
 763                 if (rec->rec_len >= length) {
 764                         /* found it - now possibly split it up  */
 765                         if (rec->rec_len > length + MIN_REC_SIZE) {
 766                                 /* Length of left piece */
 767                                 length = TDB_ALIGN(length, TDB_ALIGNMENT);
 768
 769                                 /* Right piece to go on free list */
 770                                 newrec.rec_len = rec->rec_len
 771                                         - (sizeof(*rec) + length);
 772                                 newrec_ptr = rec_ptr + sizeof(*rec) + length;
 773
 774                                 /* And left record is shortened */
 775                                 rec->rec_len = length;
 776                         } else
 777                                 newrec_ptr = 0;
 778
 779                         /* Remove allocated record from the free list */
 780                         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 781                                 goto fail;
 782
 783                         /* Update header: do this before we drop alloc
 784                            lock, otherwise tdb_free() might try to
 785                            merge with us, thinking we're free.
 786                            (Thanks Jeremy Allison). */
 787                         rec->magic = TDB_MAGIC;
 788                         if (rec_write(tdb, rec_ptr, rec) == -1)
 789                                 goto fail;
 790
 791                         /* Did we create new block? */
 792                         if (newrec_ptr) {
 793                                 /* Update allocated record tailer (we
 794                                    shortened it). */
 795                                 if (update_tailer(tdb, rec_ptr, rec) == -1)
 796                                         goto fail;
 797
 798                                 /* Free new record */
 799                                 if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
 800                                         goto fail;
 801                         }
 802
 803                         /* all done - return the new record offset */
 804                         tdb_unlock(tdb, -1, F_WRLCK);
 805                         return rec_ptr;
 806                 }
 807                 /* move to the next record */
 808                 last_ptr = rec_ptr;
 809                 rec_ptr = rec->next;
 810         }
 811         /* we didn't find enough space. See if we can expand the
 812            database and if we can then try again */
 813         if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
 814                 goto again;
 815  fail:
 816         tdb_unlock(tdb, -1, F_WRLCK);
 817         return 0;
 818 }
 819
 820 /* initialise a new database with a specified hash size */
 821 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
 822 {
 823         struct tdb_header *newdb;
 824         int size, ret = -1;
 825
 826         /* We make it up in memory, then write it out if not internal */
 827         size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
 828         if (!(newdb = calloc(size, 1)))
 829                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
 830
 831         /* Fill in the header */
 832         newdb->version = TDB_VERSION;
 833         newdb->hash_size = hash_size;
 834 #ifdef USE_SPINLOCKS
 835         newdb->rwlocks = size;
 836 #endif
 837         if (tdb->flags & TDB_INTERNAL) {
 838                 tdb->map_size = size;
 839                 tdb->map_ptr = (char *)newdb;
 840                 memcpy(&tdb->header, newdb, sizeof(tdb->header));
 841                 /* Convert the `ondisk' version if asked. */
 842                 CONVERT(*newdb);
 843                 return 0;
 844         }
 845         if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 846                 goto fail;
 847
 848         if (ftruncate(tdb->fd, 0) == -1)
 849                 goto fail;
 850
 851         /* This creates an endian-converted header, as if read from disk */
 852         CONVERT(*newdb);
 853         memcpy(&tdb->header, newdb, sizeof(tdb->header));
 854         /* Don't endian-convert the magic food! */
 855         memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 856         if (write(tdb->fd, newdb, size) != size)
 857                 ret = -1;
 858         else
 859                 ret = tdb_create_rwlocks(tdb->fd, hash_size);
 860
 861   fail:
 862         SAFE_FREE(newdb);
 863         return ret;
 864 }
 865
 866 /* Returns 0 on fail.  On success, return offset of record, and fills
 867    in rec */
 868 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
 869                         struct list_struct *r)
 870 {
 871         tdb_off rec_ptr;
 872
 873         /* read in the hash top */
 874         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 875                 return 0;
 876
 877         /* keep looking until we find the right record */
 878         while (rec_ptr) {
 879                 if (rec_read(tdb, rec_ptr, r) == -1)
 880                         return 0;
 881
 882                 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
 883                         char *k;
 884                         /* a very likely hit - read the key */
 885                         k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
 886                                            r->key_len);
 887                         if (!k)
 888                                 return 0;
 889
 890                         if (memcmp(key.dptr, k, key.dsize) == 0) {
 891                                 SAFE_FREE(k);
 892                                 return rec_ptr;
 893                         }
 894                         SAFE_FREE(k);
 895                 }
 896                 rec_ptr = r->next;
 897         }
 898         return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
 899 }
 900
 901 /* If they do lockkeys, check that this hash is one they locked */
 902 static int tdb_keylocked(TDB_CONTEXT *tdb, u32 hash)
 903 {
 904         u32 i;
 905         if (!tdb->lockedkeys)
 906                 return 1;
 907         for (i = 0; i < tdb->lockedkeys[0]; i++)
 908                 if (tdb->lockedkeys[i+1] == hash)
 909                         return 1;
 910         return TDB_ERRCODE(TDB_ERR_NOLOCK, 0);
 911 }
 912
 913 /* As tdb_find, but if you succeed, keep the lock */
 914 static tdb_off tdb_find_lock(TDB_CONTEXT *tdb, TDB_DATA key, int locktype,
 915                              struct list_struct *rec)
 916 {
 917         u32 hash, rec_ptr;
 918
 919         hash = tdb_hash(&key);
 920         if (!tdb_keylocked(tdb, hash))
 921                 return 0;
 922         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 923                 return 0;
 924         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 925                 tdb_unlock(tdb, BUCKET(hash), locktype);
 926         return rec_ptr;
 927 }
 928
 929 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
 930 {
 931         return tdb->ecode;
 932 }
 933
 934 static struct tdb_errname {
 935         enum TDB_ERROR ecode; const char *estring;
 936 } emap[] = { {TDB_SUCCESS, "Success"},
 937              {TDB_ERR_CORRUPT, "Corrupt database"},
 938              {TDB_ERR_IO, "IO Error"},
 939              {TDB_ERR_LOCK, "Locking error"},
 940              {TDB_ERR_OOM, "Out of memory"},
 941              {TDB_ERR_EXISTS, "Record exists"},
 942              {TDB_ERR_NOLOCK, "Lock exists on other keys"},
 943              {TDB_ERR_NOEXIST, "Record does not exist"} };
 944
 945 /* Error string for the last tdb error */
 946 const char *tdb_errorstr(TDB_CONTEXT *tdb)
 947 {
 948         u32 i;
 949         for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
 950                 if (tdb->ecode == emap[i].ecode)
 951                         return emap[i].estring;
 952         return "Invalid error code";
 953 }
 954
 955 /* update an entry in place - this only works if the new data size
 956    is <= the old data size and the key exists.
 957    on failure return -1
 958 */
 959 static int tdb_update(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf)
 960 {
 961         struct list_struct rec;
 962         tdb_off rec_ptr;
 963         int ret = -1;
 964
 965         /* find entry */
 966         if (!(rec_ptr = tdb_find_lock(tdb, key, F_WRLCK, &rec)))
 967                 return -1;
 968
 969         /* must be long enough key, data and tailer */
 970         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
 971                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
 972                 goto out;
 973         }
 974
 975         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 976                       dbuf.dptr, dbuf.dsize) == -1)
 977                 goto out;
 978
 979         if (dbuf.dsize != rec.data_len) {
 980                 /* update size */
 981                 rec.data_len = dbuf.dsize;
 982                 ret = rec_write(tdb, rec_ptr, &rec);
 983         } else
 984                 ret = 0;
 985  out:
 986         tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK);
 987         return ret;
 988 }
 989
 990 /* find an entry in the database given a key */
 991 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
 992 {
 993         tdb_off rec_ptr;
 994         struct list_struct rec;
 995         TDB_DATA ret;
 996
 997         /* find which hash bucket it is in */
 998         if (!(rec_ptr = tdb_find_lock(tdb,key,F_RDLCK,&rec)))
 999                 return tdb_null;
1000
1001         ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1002                                   rec.data_len);
1003         ret.dsize = rec.data_len;
1004         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1005         return ret;
1006 }
1007
1008 /* check if an entry in the database exists
1009
1010    note that 1 is returned if the key is found and 0 is returned if not found
1011    this doesn't match the conventions in the rest of this module, but is
1012    compatible with gdbm
1013 */
1014 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1015 {
1016         struct list_struct rec;
1017
1018         if (tdb_find_lock(tdb, key, F_RDLCK, &rec) == 0)
1019                 return 0;
1020         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1021         return 1;
1022 }
1023
1024 /* record lock stops delete underneath */
1025 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1026 {
1027         return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1028 }
1029 /*
1030   Write locks override our own fcntl readlocks, so check it here.
1031   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1032   an error to fail to get the lock here.
1033 */
1034
1035 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1036 {
1037         struct tdb_traverse_lock *i;
1038         for (i = &tdb->travlocks; i; i = i->next)
1039                 if (i->off == off)
1040                         return -1;
1041         return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1042 }
1043
1044 /*
1045   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1046   an error to fail to get the lock here.
1047 */
1048
1049 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1050 {
1051         return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1052 }
1053 /* fcntl locks don't stack: avoid unlocking someone else's */
1054 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1055 {
1056         struct tdb_traverse_lock *i;
1057         u32 count = 0;
1058
1059         if (off == 0)
1060                 return 0;
1061         for (i = &tdb->travlocks; i; i = i->next)
1062                 if (i->off == off)
1063                         count++;
1064         return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1065 }
1066
1067 /* actually delete an entry in the database given the offset */
1068 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1069 {
1070         tdb_off last_ptr, i;
1071         struct list_struct lastrec;
1072
1073         if (tdb->read_only) return -1;
1074
1075         if (write_lock_record(tdb, rec_ptr) == -1) {
1076                 /* Someone traversing here: mark it as dead */
1077                 rec->magic = TDB_DEAD_MAGIC;
1078                 return rec_write(tdb, rec_ptr, rec);
1079         }
1080         write_unlock_record(tdb, rec_ptr);
1081
1082         /* find previous record in hash chain */
1083         if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1084                 return -1;
1085         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1086                 if (rec_read(tdb, i, &lastrec) == -1)
1087                         return -1;
1088
1089         /* unlink it: next ptr is at start of record. */
1090         if (last_ptr == 0)
1091                 last_ptr = TDB_HASH_TOP(rec->full_hash);
1092         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1093                 return -1;
1094
1095         /* recover the space */
1096         if (tdb_free(tdb, rec_ptr, rec) == -1)
1097                 return -1;
1098         return 0;
1099 }
1100
1101 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1102 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1103                          struct list_struct *rec)
1104 {
1105         int want_next = (tlock->off != 0);
1106
1107         /* No traversal allows if you've called tdb_lockkeys() */
1108         if (tdb->lockedkeys)
1109                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1110
1111         /* Lock each chain from the start one. */
1112         for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1113                 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1114                         return -1;
1115
1116                 /* No previous record?  Start at top of chain. */
1117                 if (!tlock->off) {
1118                         if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1119                                      &tlock->off) == -1)
1120                                 goto fail;
1121                 } else {
1122                         /* Otherwise unlock the previous record. */
1123                         unlock_record(tdb, tlock->off);
1124                 }
1125
1126                 if (want_next) {
1127                         /* We have offset of old record: grab next */
1128                         if (rec_read(tdb, tlock->off, rec) == -1)
1129                                 goto fail;
1130                         tlock->off = rec->next;
1131                 }
1132
1133                 /* Iterate through chain */
1134                 while( tlock->off) {
1135                         tdb_off current;
1136                         if (rec_read(tdb, tlock->off, rec) == -1)
1137                                 goto fail;
1138                         if (!TDB_DEAD(rec)) {
1139                                 /* Woohoo: we found one! */
1140                                 lock_record(tdb, tlock->off);
1141                                 return tlock->off;
1142                         }
1143                         /* Try to clean dead ones from old traverses */
1144                         current = tlock->off;
1145                         tlock->off = rec->next;
1146                         do_delete(tdb, current, rec);
1147                 }
1148                 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1149                 want_next = 0;
1150         }
1151         /* We finished iteration without finding anything */
1152         return TDB_ERRCODE(TDB_SUCCESS, 0);
1153
1154  fail:
1155         tlock->off = 0;
1156         tdb_unlock(tdb, tlock->hash, F_WRLCK);
1157         return -1;
1158 }
1159
1160 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1161    return -1 on error or the record count traversed
1162    if fn is NULL then it is not called
1163    a non-zero return value from fn() indicates that the traversal should stop
1164   */
1165 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *state)
1166 {
1167         TDB_DATA key, dbuf;
1168         struct list_struct rec;
1169         struct tdb_traverse_lock tl = { NULL, 0, 0 };
1170         int ret, count = 0;
1171
1172         /* This was in the initializaton, above, but the IRIX compiler
1173          * did not like it.  crh
1174          */
1175         tl.next = tdb->travlocks.next;
1176
1177         /* fcntl locks don't stack: beware traverse inside traverse */
1178         tdb->travlocks.next = &tl;
1179
1180         /* tdb_next_lock places locks on the record returned, and its chain */
1181         while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1182                 count++;
1183                 /* now read the full record */
1184                 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1185                                           rec.key_len + rec.data_len);
1186                 if (!key.dptr) {
1187                         tdb_unlock(tdb, tl.hash, F_WRLCK);
1188                         unlock_record(tdb, tl.off);
1189                         tdb->travlocks.next = tl.next;
1190                         return -1;
1191                 }
1192                 key.dsize = rec.key_len;
1193                 dbuf.dptr = key.dptr + rec.key_len;
1194                 dbuf.dsize = rec.data_len;
1195
1196                 /* Drop chain lock, call out */
1197                 tdb_unlock(tdb, tl.hash, F_WRLCK);
1198                 if (fn && fn(tdb, key, dbuf, state)) {
1199                         /* They want us to terminate traversal */
1200                         unlock_record(tdb, tl.off);
1201                         tdb->travlocks.next = tl.next;
1202                         SAFE_FREE(key.dptr);
1203                         return count;
1204                 }
1205                 SAFE_FREE(key.dptr);
1206         }
1207         tdb->travlocks.next = tl.next;
1208         if (ret < 0)
1209                 return -1;
1210         else
1211                 return count;
1212 }
1213
1214 /* find the first entry in the database and return its key */
1215 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1216 {
1217         TDB_DATA key;
1218         struct list_struct rec;
1219
1220         /* release any old lock */
1221         unlock_record(tdb, tdb->travlocks.off);
1222         tdb->travlocks.off = tdb->travlocks.hash = 0;
1223
1224         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1225                 return tdb_null;
1226         /* now read the key */
1227         key.dsize = rec.key_len;
1228         key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1229         tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK);
1230         return key;
1231 }
1232
1233 /* find the next entry in the database, returning its key */
1234 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1235 {
1236         u32 oldhash;
1237         TDB_DATA key = tdb_null;
1238         struct list_struct rec;
1239         char *k = NULL;
1240
1241         /* Is locked key the old key?  If so, traverse will be reliable. */
1242         if (tdb->travlocks.off) {
1243                 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1244                         return tdb_null;
1245                 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1246                     || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1247                                             rec.key_len))
1248                     || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1249                         /* No, it wasn't: unlock it and start from scratch */
1250                         unlock_record(tdb, tdb->travlocks.off);
1251                         tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK);
1252                         tdb->travlocks.off = 0;
1253                 }
1254
1255                 SAFE_FREE(k);
1256         }
1257
1258         if (!tdb->travlocks.off) {
1259                 /* No previous element: do normal find, and lock record */
1260                 tdb->travlocks.off = tdb_find_lock(tdb, oldkey, F_WRLCK, &rec);
1261                 if (!tdb->travlocks.off)
1262                         return tdb_null;
1263                 tdb->travlocks.hash = BUCKET(rec.full_hash);
1264                 lock_record(tdb, tdb->travlocks.off);
1265         }
1266         oldhash = tdb->travlocks.hash;
1267
1268         /* Grab next record: locks chain and returned record,
1269            unlocks old record */
1270         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1271                 key.dsize = rec.key_len;
1272                 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1273                                           key.dsize);
1274                 /* Unlock the chain of this new record */
1275                 tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK);
1276         }
1277         /* Unlock the chain of old record */
1278         tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK);
1279         return key;
1280 }
1281
1282 /* delete an entry in the database given a key */
1283 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1284 {
1285         tdb_off rec_ptr;
1286         struct list_struct rec;
1287         int ret;
1288
1289         if (!(rec_ptr = tdb_find_lock(tdb, key, F_WRLCK, &rec)))
1290                 return -1;
1291         ret = do_delete(tdb, rec_ptr, &rec);
1292         tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK);
1293         return ret;
1294 }
1295
1296 /* store an element in the database, replacing any existing element
1297    with the same key
1298
1299    return 0 on success, -1 on failure
1300 */
1301 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1302 {
1303         struct list_struct rec;
1304         u32 hash;
1305         tdb_off rec_ptr;
1306         char *p = NULL;
1307         int ret = 0;
1308
1309         /* find which hash bucket it is in */
1310         hash = tdb_hash(&key);
1311         if (!tdb_keylocked(tdb, hash))
1312                 return -1;
1313         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1314                 return -1;
1315
1316         /* check for it existing, on insert. */
1317         if (flag == TDB_INSERT) {
1318                 if (tdb_exists(tdb, key)) {
1319                         tdb->ecode = TDB_ERR_EXISTS;
1320                         goto fail;
1321                 }
1322         } else {
1323                 /* first try in-place update, on modify or replace. */
1324                 if (tdb_update(tdb, key, dbuf) == 0)
1325                         goto out;
1326                 if (flag == TDB_MODIFY && tdb->ecode == TDB_ERR_NOEXIST)
1327                         goto fail;
1328         }
1329         /* reset the error code potentially set by the tdb_update() */
1330         tdb->ecode = TDB_SUCCESS;
1331
1332         /* delete any existing record - if it doesn't exist we don't
1333            care.  Doing this first reduces fragmentation, and avoids
1334            coalescing with `allocated' block before it's updated. */
1335         if (flag != TDB_INSERT)
1336                 tdb_delete(tdb, key);
1337
1338         /* Copy key+value *before* allocating free space in case malloc
1339            fails and we are left with a dead spot in the tdb. */
1340
1341         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1342                 tdb->ecode = TDB_ERR_OOM;
1343                 goto fail;
1344         }
1345
1346         memcpy(p, key.dptr, key.dsize);
1347         memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1348
1349         /* now we're into insert / modify / replace of a record which
1350          * we know could not be optimised by an in-place store (for
1351          * various reasons).  */
1352         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1353                 goto fail;
1354
1355         /* Read hash top into next ptr */
1356         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1357                 goto fail;
1358
1359         rec.key_len = key.dsize;
1360         rec.data_len = dbuf.dsize;
1361         rec.full_hash = hash;
1362         rec.magic = TDB_MAGIC;
1363
1364         /* write out and point the top of the hash chain at it */
1365         if (rec_write(tdb, rec_ptr, &rec) == -1
1366             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1367             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1368         fail:
1369                 /* Need to tdb_unallocate() here */
1370                 ret = -1;
1371         }
1372  out:
1373         SAFE_FREE(p);
1374         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1375         return ret;
1376 }
1377
1378 static int tdb_already_open(dev_t device,
1379                             ino_t ino)
1380 {
1381         TDB_CONTEXT *i;
1382
1383         for (i = tdbs; i; i = i->next) {
1384                 if (i->device == device && i->inode == ino) {
1385                         return 1;
1386                 }
1387         }
1388
1389         return 0;
1390 }
1391
1392 /* open the database, creating it if necessary
1393
1394    The open_flags and mode are passed straight to the open call on the
1395    database file. A flags value of O_WRONLY is invalid. The hash size
1396    is advisory, use zero for a default value.
1397
1398    Return is NULL on error, in which case errno is also set.  Don't
1399    try to call tdb_error or tdb_errname, just do strerror(errno).
1400
1401    @param name may be NULL for internal databases. */
1402 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1403                       int open_flags, mode_t mode)
1404 {
1405         return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL);
1406 }
1407
1408
1409 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1410                          int open_flags, mode_t mode,
1411                          tdb_log_func log_fn)
1412 {
1413         TDB_CONTEXT *tdb;
1414         struct stat st;
1415         int rev = 0, locked;
1416
1417         if (!(tdb = calloc(1, sizeof *tdb))) {
1418                 /* Can't log this */
1419                 errno = ENOMEM;
1420                 goto fail;
1421         }
1422         tdb->fd = -1;
1423         tdb->name = NULL;
1424         tdb->map_ptr = NULL;
1425         tdb->lockedkeys = NULL;
1426         tdb->flags = tdb_flags;
1427         tdb->open_flags = open_flags;
1428         tdb->log_fn = log_fn;
1429
1430         if ((open_flags & O_ACCMODE) == O_WRONLY) {
1431                 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1432                          name));
1433                 errno = EINVAL;
1434                 goto fail;
1435         }
1436
1437         if (hash_size == 0)
1438                 hash_size = DEFAULT_HASH_SIZE;
1439         if ((open_flags & O_ACCMODE) == O_RDONLY) {
1440                 tdb->read_only = 1;
1441                 /* read only databases don't do locking or clear if first */
1442                 tdb->flags |= TDB_NOLOCK;
1443                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1444         }
1445
1446         /* internal databases don't mmap or lock, and start off cleared */
1447         if (tdb->flags & TDB_INTERNAL) {
1448                 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1449                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1450                 tdb_new_database(tdb, hash_size);
1451                 goto internal;
1452         }
1453
1454         if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1455                 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1456                          name, strerror(errno)));
1457                 goto fail;      /* errno set by open(2) */
1458         }
1459
1460         /* ensure there is only one process initialising at once */
1461         if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1462                 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1463                          name, strerror(errno)));
1464                 goto fail;      /* errno set by tdb_brlock */
1465         }
1466
1467         /* we need to zero database if we are the only one with it open */
1468         if ((locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))
1469             && (tdb_flags & TDB_CLEAR_IF_FIRST)) {
1470                 open_flags |= O_CREAT;
1471                 if (ftruncate(tdb->fd, 0) == -1) {
1472                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1473                                  "failed to truncate %s: %s\n",
1474                                  name, strerror(errno)));
1475                         goto fail; /* errno set by ftruncate */
1476                 }
1477         }
1478
1479         if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1480             || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1481             || (tdb->header.version != TDB_VERSION
1482                 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1483                 /* its not a valid database - possibly initialise it */
1484                 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1485                         errno = EIO; /* ie bad format or something */
1486                         goto fail;
1487                 }
1488                 rev = (tdb->flags & TDB_CONVERT);
1489         }
1490         if (!rev)
1491                 tdb->flags &= ~TDB_CONVERT;
1492         else {
1493                 tdb->flags |= TDB_CONVERT;
1494                 convert(&tdb->header, sizeof(tdb->header));
1495         }
1496         if (fstat(tdb->fd, &st) == -1)
1497                 goto fail;
1498
1499         /* Is it already in the open list?  If so, fail. */
1500         if (tdb_already_open(st.st_dev, st.st_ino)) {
1501                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1502                          "%s (%d,%d) is already open in this process\n",
1503                          name, st.st_dev, st.st_ino));
1504                 errno = EBUSY;
1505                 goto fail;
1506         }
1507
1508         if (!(tdb->name = (char *)strdup(name))) {
1509                 errno = ENOMEM;
1510                 goto fail;
1511         }
1512
1513         tdb->map_size = st.st_size;
1514         tdb->device = st.st_dev;
1515         tdb->inode = st.st_ino;
1516         tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1517         if (!tdb->locked) {
1518                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1519                          "failed to allocate lock structure for %s\n",
1520                          name));
1521                 errno = ENOMEM;
1522                 goto fail;
1523         }
1524         tdb_mmap(tdb);
1525         if (locked) {
1526                 if (!tdb->read_only)
1527                         tdb_clear_spinlocks(tdb);
1528                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1529                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1530                                  "failed to take ACTIVE_LOCK on %s: %s\n",
1531                                  name, strerror(errno)));
1532                         goto fail;
1533                 }
1534         }
1535         /* leave this lock in place to indicate it's in use */
1536         if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1537                 goto fail;
1538
1539  internal:
1540         /* Internal (memory-only) databases skip all the code above to
1541          * do with disk files, and resume here by releasing their
1542          * global lock and hooking into the active list. */
1543         if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1544                 goto fail;
1545         tdb->next = tdbs;
1546         tdbs = tdb;
1547         return tdb;
1548
1549  fail:
1550         { int save_errno = errno;
1551
1552         if (!tdb)
1553                 return NULL;
1554
1555         if (tdb->map_ptr) {
1556                 if (tdb->flags & TDB_INTERNAL)
1557                         SAFE_FREE(tdb->map_ptr);
1558                 else
1559                         tdb_munmap(tdb);
1560         }
1561         SAFE_FREE(tdb->name);
1562         if (tdb->fd != -1)
1563                 close(tdb->fd);
1564         SAFE_FREE(tdb->locked);
1565         SAFE_FREE(tdb);
1566         errno = save_errno;
1567         return NULL;
1568         }
1569 }
1570
1571 /* close a database */
1572 int tdb_close(TDB_CONTEXT *tdb)
1573 {
1574         TDB_CONTEXT **i;
1575         int ret = 0;
1576
1577         if (tdb->map_ptr) {
1578                 if (tdb->flags & TDB_INTERNAL)
1579                         SAFE_FREE(tdb->map_ptr);
1580                 else
1581                         tdb_munmap(tdb);
1582         }
1583         SAFE_FREE(tdb->name);
1584         if (tdb->fd != -1)
1585                 ret = close(tdb->fd);
1586         SAFE_FREE(tdb->locked);
1587         SAFE_FREE(tdb->lockedkeys);
1588
1589         /* Remove from contexts list */
1590         for (i = &tdbs; *i; i = &(*i)->next) {
1591                 if (*i == tdb) {
1592                         *i = tdb->next;
1593                         break;
1594                 }
1595         }
1596
1597         memset(tdb, 0, sizeof(*tdb));
1598         SAFE_FREE(tdb);
1599
1600         return ret;
1601 }
1602
1603 /* lock/unlock entire database */
1604 int tdb_lockall(TDB_CONTEXT *tdb)
1605 {
1606         u32 i;
1607
1608         /* There are no locks on read-only dbs */
1609         if (tdb->read_only)
1610                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1611         if (tdb->lockedkeys)
1612                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1613         for (i = 0; i < tdb->header.hash_size; i++)
1614                 if (tdb_lock(tdb, i, F_WRLCK))
1615                         break;
1616
1617         /* If error, release locks we have... */
1618         if (i < tdb->header.hash_size) {
1619                 u32 j;
1620
1621                 for ( j = 0; j < i; j++)
1622                         tdb_unlock(tdb, j, F_WRLCK);
1623                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1624         }
1625
1626         return 0;
1627 }
1628 void tdb_unlockall(TDB_CONTEXT *tdb)
1629 {
1630         u32 i;
1631         for (i=0; i < tdb->header.hash_size; i++)
1632                 tdb_unlock(tdb, i, F_WRLCK);
1633 }
1634
1635 int tdb_lockkeys(TDB_CONTEXT *tdb, u32 number, TDB_DATA keys[])
1636 {
1637         u32 i, j, hash;
1638
1639         /* Can't lock more keys if already locked */
1640         if (tdb->lockedkeys)
1641                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1642         if (!(tdb->lockedkeys = malloc(sizeof(u32) * (number+1))))
1643                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
1644         /* First number in array is # keys */
1645         tdb->lockedkeys[0] = number;
1646
1647         /* Insertion sort by bucket */
1648         for (i = 0; i < number; i++) {
1649                 hash = tdb_hash(&keys[i]);
1650                 for (j = 0; j < i && BUCKET(tdb->lockedkeys[j+1]) < BUCKET(hash); j++);
1651                         memmove(&tdb->lockedkeys[j+2], &tdb->lockedkeys[j+1], sizeof(u32) * (i-j));
1652                 tdb->lockedkeys[j+1] = hash;
1653         }
1654         /* Finally, lock in order */
1655         for (i = 0; i < number; i++)
1656                 if (tdb_lock(tdb, i, F_WRLCK))
1657                         break;
1658
1659         /* If error, release locks we have... */
1660         if (i < number) {
1661                 for ( j = 0; j < i; j++)
1662                         tdb_unlock(tdb, j, F_WRLCK);
1663                 SAFE_FREE(tdb->lockedkeys);
1664                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1665         }
1666         return 0;
1667 }
1668
1669 /* Unlock the keys previously locked by tdb_lockkeys() */
1670 void tdb_unlockkeys(TDB_CONTEXT *tdb)
1671 {
1672         u32 i;
1673         for (i = 0; i < tdb->lockedkeys[0]; i++)
1674                 tdb_unlock(tdb, tdb->lockedkeys[i+1], F_WRLCK);
1675         SAFE_FREE(tdb->lockedkeys);
1676 }
1677
1678 /* lock/unlock one hash chain. This is meant to be used to reduce
1679    contention - it cannot guarantee how many records will be locked */
1680 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1681 {
1682         return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
1683 }
1684 void tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1685 {
1686         tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
1687 }
1688
1689
1690 /* register a loging function */
1691 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
1692 {
1693         tdb->log_fn = fn;
1694 }
1695
1696
1697 /* reopen a tdb - this is used after a fork to ensure that we have an independent
1698    seek pointer from our parent and to re-establish locks */
1699 int tdb_reopen(TDB_CONTEXT *tdb)
1700 {
1701         struct stat st;
1702
1703         tdb_munmap(tdb);
1704         close(tdb->fd);
1705         tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
1706         if (tdb->fd == -1) {
1707                 TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
1708                 goto fail;
1709         }
1710         fstat(tdb->fd, &st);
1711         if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
1712                 TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
1713                 goto fail;
1714         }
1715         tdb_mmap(tdb);
1716         if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1) {
1717                 TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
1718                 goto fail;
1719         }
1720
1721         return 0;
1722
1723 fail:
1724         tdb_close(tdb);
1725         return -1;
1726 }
1727
1728 /* reopen all tdb's */
1729 int tdb_reopen_all(void)
1730 {
1731         TDB_CONTEXT *tdb;
1732
1733         for (tdb=tdbs; tdb; tdb = tdb->next) {
1734                 if (tdb_reopen(tdb) != 0) return -1;
1735         }
1736
1737         return 0;
1738 }