2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2004
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 2 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, write to the Free Software
26 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 /* NOTE: If you use tdbs under valgrind, and in particular if you run
31 * tdbtorture, you may get spurious "uninitialized value" warnings. I
32 * think this is because valgrind doesn't understand that the mmap'd
33 * area may be written to by other processes. Memory can, from the
34 * point of view of the grinded process, spontaneously become
37 * I can think of a few solutions. [mbp 20030311]
39 * 1 - Write suppressions for Valgrind so that it doesn't complain
40 * about this. Probably the most reasonable but people need to
41 * remember to use them.
43 * 2 - Use IO not mmap when running under valgrind. Not so nice.
45 * 3 - Use the special valgrind macros to mark memory as valid at the
46 * right time. Probably too hard -- the process just doesn't know.
71 #define TDB_MAGIC_FOOD "TDB file\n"
72 #define TDB_VERSION (0x26011967 + 6)
73 #define TDB_MAGIC (0x26011999U)
74 #define TDB_FREE_MAGIC (~TDB_MAGIC)
75 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
76 #define TDB_ALIGNMENT 4
77 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
78 #define DEFAULT_HASH_SIZE 131
79 #define TDB_PAGE_SIZE 0x2000
80 #define FREELIST_TOP (sizeof(struct tdb_header))
81 #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
82 #define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
83 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
84 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
85 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
86 #define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + TDB_SPINLOCK_SIZE(hash_size))
89 /* NB assumes there is a local variable called "tdb" that is the
90 * current context, also takes doubly-parenthesized print-style
92 #define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)
103 #define MAP_FAILED ((void *)-1)
106 /* free memory if the pointer is valid and zero the pointer */
108 #define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
111 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
114 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
115 static TDB_CONTEXT *tdbs = NULL;
117 static int tdb_munmap(TDB_CONTEXT *tdb)
119 if (tdb->flags & TDB_INTERNAL)
124 int ret = munmap(tdb->map_ptr, tdb->map_size);
133 static void tdb_mmap(TDB_CONTEXT *tdb)
135 if (tdb->flags & TDB_INTERNAL)
139 if (!(tdb->flags & TDB_NOMMAP)) {
140 tdb->map_ptr = mmap(NULL, tdb->map_size,
141 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
142 MAP_SHARED|MAP_FILE, tdb->fd, 0);
145 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
148 if (tdb->map_ptr == MAP_FAILED) {
150 TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
151 tdb->map_size, strerror(errno)));
161 /* Endian conversion: we only ever deal with 4 byte quantities */
162 static void *convert(void *buf, u32 size)
165 for (i = 0; i < size / 4; i++)
166 p[i] = TDB_BYTEREV(p[i]);
169 #define DOCONV() (tdb->flags & TDB_CONVERT)
170 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
172 /* the body of the database is made of one list_struct for the free space
173 plus a separate data list for each hash value */
175 tdb_off next; /* offset of the next record in the list */
176 tdb_len rec_len; /* total byte length of record */
177 tdb_len key_len; /* byte length of key */
178 tdb_len data_len; /* byte length of data */
179 u32 full_hash; /* the full 32 bit hash of the key */
180 u32 magic; /* try to catch errors */
181 /* the following union is implied:
183 char record[rec_len];
188 u32 totalsize; (tailer)
193 /***************************************************************
194 Allow a caller to set a "alarm" flag that tdb can check to abort
195 a blocking lock on SIGALRM.
196 ***************************************************************/
198 static sig_atomic_t *palarm_fired;
200 void tdb_set_lock_alarm(sig_atomic_t *palarm)
202 palarm_fired = palarm;
205 /* a byte range locking function - return 0 on success
206 this functions locks/unlocks 1 byte at the specified offset.
208 On error, errno is also set so that errors are passed back properly
209 through tdb_open(). */
210 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
211 int rw_type, int lck_type, int probe)
216 if (tdb->flags & TDB_NOLOCK)
218 if ((rw_type == F_WRLCK) && (tdb->read_only)) {
224 fl.l_whence = SEEK_SET;
230 ret = fcntl(tdb->fd,lck_type,&fl);
231 if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
233 } while (ret == -1 && errno == EINTR);
236 if (!probe && lck_type != F_SETLK) {
237 /* Ensure error code is set for log fun to examine. */
238 if (errno == EINTR && palarm_fired && *palarm_fired)
239 tdb->ecode = TDB_ERR_LOCK_TIMEOUT;
241 tdb->ecode = TDB_ERR_LOCK;
242 TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
243 tdb->fd, offset, rw_type, lck_type));
245 /* Was it an alarm timeout ? */
246 if (errno == EINTR && palarm_fired && *palarm_fired) {
247 TDB_LOG((tdb, 5, "tdb_brlock timed out (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
248 tdb->fd, offset, rw_type, lck_type));
249 return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
251 /* Otherwise - generic lock error. errno set by fcntl.
252 * EAGAIN is an expected return from non-blocking
254 if (errno != EAGAIN) {
255 TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
256 tdb->fd, offset, rw_type, lck_type,
259 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
264 /* lock a list in the database. list -1 is the alloc list */
265 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
267 if (list < -1 || list >= (int)tdb->header.hash_size) {
268 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
272 if (tdb->flags & TDB_NOLOCK)
275 /* Since fcntl locks don't nest, we do a lock for the first one,
276 and simply bump the count for future ones */
277 if (tdb->locked[list+1].count == 0) {
278 if (!tdb->read_only && tdb->header.rwlocks) {
279 if (tdb_spinlock(tdb, list, ltype)) {
280 TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list ltype=%d\n",
284 } else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
285 TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
286 list, ltype, strerror(errno)));
289 tdb->locked[list+1].ltype = ltype;
291 tdb->locked[list+1].count++;
295 /* unlock the database: returns void because it's too late for errors. */
296 /* changed to return int it may be interesting to know there
297 has been an error --simo */
298 static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
302 if (tdb->flags & TDB_NOLOCK)
306 if (list < -1 || list >= (int)tdb->header.hash_size) {
307 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
311 if (tdb->locked[list+1].count==0) {
312 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
316 if (tdb->locked[list+1].count == 1) {
317 /* Down to last nested lock: unlock underneath */
318 if (!tdb->read_only && tdb->header.rwlocks) {
319 ret = tdb_spinunlock(tdb, list, ltype);
321 ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
326 tdb->locked[list+1].count--;
329 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
333 /* This is based on the hash algorithm from gdbm */
334 static u32 tdb_hash(TDB_DATA *key)
336 u32 value; /* Used to compute the hash value. */
337 u32 i; /* Used to cycle through random values. */
339 /* Set the initial value from the key size. */
340 for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
341 value = (value + (key->dptr[i] << (i*5 % 24)));
343 return (1103515243 * value + 12345);
346 /* check for an out of bounds access - if it is out of bounds then
347 see if the database has been expanded by someone else and expand
349 note that "len" is the minimum length needed for the db
351 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
354 if (len <= tdb->map_size)
356 if (tdb->flags & TDB_INTERNAL) {
358 /* Ensure ecode is set for log fn. */
359 tdb->ecode = TDB_ERR_IO;
360 TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
361 (int)len, (int)tdb->map_size));
363 return TDB_ERRCODE(TDB_ERR_IO, -1);
366 if (fstat(tdb->fd, &st) == -1)
367 return TDB_ERRCODE(TDB_ERR_IO, -1);
369 if (st.st_size < (size_t)len) {
371 /* Ensure ecode is set for log fn. */
372 tdb->ecode = TDB_ERR_IO;
373 TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
374 (int)len, (int)st.st_size));
376 return TDB_ERRCODE(TDB_ERR_IO, -1);
379 /* Unmap, update size, remap */
380 if (tdb_munmap(tdb) == -1)
381 return TDB_ERRCODE(TDB_ERR_IO, -1);
382 tdb->map_size = st.st_size;
387 /* write a lump of data at a specified offset */
388 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
390 if (tdb_oob(tdb, off + len, 0) != 0)
394 memcpy(off + (char *)tdb->map_ptr, buf, len);
396 else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
398 else if (lseek(tdb->fd, off, SEEK_SET) != off
399 || write(tdb->fd, buf, len) != (ssize_t)len) {
401 /* Ensure ecode is set for log fn. */
402 tdb->ecode = TDB_ERR_IO;
403 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
404 off, len, strerror(errno)));
405 return TDB_ERRCODE(TDB_ERR_IO, -1);
410 /* read a lump of data at a specified offset, maybe convert */
411 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
413 if (tdb_oob(tdb, off + len, 0) != 0)
417 memcpy(buf, off + (char *)tdb->map_ptr, len);
419 else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
421 else if (lseek(tdb->fd, off, SEEK_SET) != off
422 || read(tdb->fd, buf, len) != (ssize_t)len) {
424 /* Ensure ecode is set for log fn. */
425 tdb->ecode = TDB_ERR_IO;
426 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
427 off, len, strerror(errno)));
428 return TDB_ERRCODE(TDB_ERR_IO, -1);
435 /* read a lump of data, allocating the space for it */
436 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
440 if (!(buf = malloc(len))) {
441 /* Ensure ecode is set for log fn. */
442 tdb->ecode = TDB_ERR_OOM;
443 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
444 len, strerror(errno)));
445 return TDB_ERRCODE(TDB_ERR_OOM, buf);
447 if (tdb_read(tdb, offset, buf, len, 0) == -1) {
454 /* read/write a tdb_off */
455 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
457 return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
459 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
462 return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
465 /* read/write a record */
466 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
468 if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
470 if (TDB_BAD_MAGIC(rec)) {
471 /* Ensure ecode is set for log fn. */
472 tdb->ecode = TDB_ERR_CORRUPT;
473 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
474 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
476 return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
478 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
480 struct list_struct r = *rec;
481 return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
484 /* read a freelist record and check for simple errors */
485 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
487 if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
490 if (rec->magic == TDB_MAGIC) {
491 /* this happens when a app is showdown while deleting a record - we should
492 not completely fail when this happens */
493 TDB_LOG((tdb, 0,"rec_free_read non-free magic at offset=%d - fixing\n",
495 rec->magic = TDB_FREE_MAGIC;
496 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
500 if (rec->magic != TDB_FREE_MAGIC) {
501 /* Ensure ecode is set for log fn. */
502 tdb->ecode = TDB_ERR_CORRUPT;
503 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
505 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
507 if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
512 /* update a record tailer (must hold allocation lock) */
513 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
514 const struct list_struct *rec)
518 /* Offset of tailer from record header */
519 totalsize = sizeof(*rec) + rec->rec_len;
520 return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
524 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
526 struct list_struct rec;
527 tdb_off tailer_ofs, tailer;
529 if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
530 printf("ERROR: failed to read record at %u\n", offset);
534 printf(" rec: offset=0x%08x next=0x%08x rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
535 offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
537 tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
538 if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
539 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
543 if (tailer != rec.rec_len + sizeof(rec)) {
544 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
545 (unsigned int)tailer, (unsigned int)(rec.rec_len + sizeof(rec)));
550 static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
552 tdb_off rec_ptr, top;
554 top = TDB_HASH_TOP(i);
556 if (tdb_lock(tdb, i, F_WRLCK) != 0)
559 if (ofs_read(tdb, top, &rec_ptr) == -1)
560 return tdb_unlock(tdb, i, F_WRLCK);
563 printf("hash=%d\n", i);
566 rec_ptr = tdb_dump_record(tdb, rec_ptr);
569 return tdb_unlock(tdb, i, F_WRLCK);
572 void tdb_dump_all(TDB_CONTEXT *tdb)
575 for (i=0;i<tdb->header.hash_size;i++) {
576 tdb_dump_chain(tdb, i);
578 printf("freelist:\n");
579 tdb_dump_chain(tdb, -1);
582 int tdb_printfreelist(TDB_CONTEXT *tdb)
586 tdb_off offset, rec_ptr;
587 struct list_struct rec;
589 if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
592 offset = FREELIST_TOP;
594 /* read in the freelist top */
595 if (ofs_read(tdb, offset, &rec_ptr) == -1) {
596 tdb_unlock(tdb, -1, F_WRLCK);
600 printf("freelist top=[0x%08x]\n", rec_ptr );
602 if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
603 tdb_unlock(tdb, -1, F_WRLCK);
607 if (rec.magic != TDB_FREE_MAGIC) {
608 printf("bad magic 0x%08x in free list\n", rec.magic);
609 tdb_unlock(tdb, -1, F_WRLCK);
613 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)] (end = 0x%08x)\n",
614 rec_ptr, rec.rec_len, rec.rec_len, rec_ptr + rec.rec_len);
615 total_free += rec.rec_len;
617 /* move to the next record */
620 printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
623 return tdb_unlock(tdb, -1, F_WRLCK);
626 /* Remove an element from the freelist. Must have alloc lock. */
627 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
631 /* read in the freelist top */
632 last_ptr = FREELIST_TOP;
633 while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
635 /* We've found it! */
636 return ofs_write(tdb, last_ptr, &next);
638 /* Follow chain (next offset is at start of record) */
641 TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
642 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
645 /* Add an element into the freelist. Merge adjacent records if
647 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
651 /* Allocation and tailer lock */
652 if (tdb_lock(tdb, -1, F_WRLCK) != 0)
655 /* set an initial tailer, so if we fail we don't leave a bogus record */
656 if (update_tailer(tdb, offset, rec) != 0) {
657 TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
661 /* Look right first (I'm an Australian, dammit) */
662 right = offset + sizeof(*rec) + rec->rec_len;
663 if (right + sizeof(*rec) <= tdb->map_size) {
664 struct list_struct r;
666 if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
667 TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
671 /* If it's free, expand to include it. */
672 if (r.magic == TDB_FREE_MAGIC) {
673 if (remove_from_freelist(tdb, right, r.next) == -1) {
674 TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
677 rec->rec_len += sizeof(r) + r.rec_len;
683 left = offset - sizeof(tdb_off);
684 if (left > TDB_DATA_START(tdb->header.hash_size)) {
685 struct list_struct l;
688 /* Read in tailer and jump back to header */
689 if (ofs_read(tdb, left, &leftsize) == -1) {
690 TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
693 left = offset - leftsize;
695 /* Now read in record */
696 if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
697 TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
701 /* If it's free, expand to include it. */
702 if (l.magic == TDB_FREE_MAGIC) {
703 if (remove_from_freelist(tdb, left, l.next) == -1) {
704 TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
708 rec->rec_len += leftsize;
714 if (update_tailer(tdb, offset, rec) == -1) {
715 TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
719 /* Now, prepend to free list */
720 rec->magic = TDB_FREE_MAGIC;
722 if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
723 rec_write(tdb, offset, rec) == -1 ||
724 ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
725 TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
729 /* And we're done. */
730 tdb_unlock(tdb, -1, F_WRLCK);
734 tdb_unlock(tdb, -1, F_WRLCK);
739 /* expand a file. we prefer to use ftruncate, as that is what posix
740 says to use for mmap expansion */
741 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
744 #if HAVE_FTRUNCATE_EXTEND
745 if (ftruncate(tdb->fd, size+addition) != 0) {
746 TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
747 size+addition, strerror(errno)));
754 if (pwrite(tdb->fd, &b, 1, (size+addition) - 1) != 1) {
756 if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
757 write(tdb->fd, &b, 1) != 1) {
759 TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
760 size+addition, strerror(errno)));
765 /* now fill the file with something. This ensures that the file isn't sparse, which would be
766 very bad if we ran out of disk. This must be done with write, not via mmap */
767 memset(buf, 0x42, sizeof(buf));
769 int n = addition>sizeof(buf)?sizeof(buf):addition;
771 int ret = pwrite(tdb->fd, buf, n, size);
774 if (lseek(tdb->fd, size, SEEK_SET) != size)
776 ret = write(tdb->fd, buf, n);
779 TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
780 n, strerror(errno)));
790 /* expand the database at least size bytes by expanding the underlying
791 file and doing the mmap again if necessary */
792 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
794 struct list_struct rec;
797 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
798 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
802 /* must know about any previous expansions by another process */
803 tdb_oob(tdb, tdb->map_size + 1, 1);
805 /* always make room for at least 10 more records, and round
806 the database up to a multiple of TDB_PAGE_SIZE */
807 size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
809 if (!(tdb->flags & TDB_INTERNAL))
813 * We must ensure the file is unmapped before doing this
814 * to ensure consistency with systems like OpenBSD where
815 * writes and mmaps are not consistent.
818 /* expand the file itself */
819 if (!(tdb->flags & TDB_INTERNAL)) {
820 if (expand_file(tdb, tdb->map_size, size) != 0)
824 tdb->map_size += size;
826 if (tdb->flags & TDB_INTERNAL)
827 tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
830 * We must ensure the file is remapped before adding the space
831 * to ensure consistency with systems like OpenBSD where
832 * writes and mmaps are not consistent.
835 /* We're ok if the mmap fails as we'll fallback to read/write */
839 /* form a new freelist record */
840 memset(&rec,'\0',sizeof(rec));
841 rec.rec_len = size - sizeof(rec);
843 /* link it into the free list */
844 offset = tdb->map_size - size;
845 if (tdb_free(tdb, offset, &rec) == -1)
848 tdb_unlock(tdb, -1, F_WRLCK);
851 tdb_unlock(tdb, -1, F_WRLCK);
857 the core of tdb_allocate - called when we have decided which
858 free list entry to use
860 static tdb_off tdb_allocate_ofs(TDB_CONTEXT *tdb, tdb_len length, tdb_off rec_ptr,
861 struct list_struct *rec, tdb_off last_ptr)
863 struct list_struct newrec;
866 memset(&newrec, '\0', sizeof(newrec));
868 /* found it - now possibly split it up */
869 if (rec->rec_len > length + MIN_REC_SIZE) {
870 /* Length of left piece */
871 length = TDB_ALIGN(length, TDB_ALIGNMENT);
873 /* Right piece to go on free list */
874 newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
875 newrec_ptr = rec_ptr + sizeof(*rec) + length;
877 /* And left record is shortened */
878 rec->rec_len = length;
883 /* Remove allocated record from the free list */
884 if (ofs_write(tdb, last_ptr, &rec->next) == -1) {
888 /* Update header: do this before we drop alloc
889 lock, otherwise tdb_free() might try to
890 merge with us, thinking we're free.
891 (Thanks Jeremy Allison). */
892 rec->magic = TDB_MAGIC;
893 if (rec_write(tdb, rec_ptr, rec) == -1) {
897 /* Did we create new block? */
899 /* Update allocated record tailer (we
901 if (update_tailer(tdb, rec_ptr, rec) == -1) {
905 /* Free new record */
906 if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
911 /* all done - return the new record offset */
915 /* allocate some space from the free list. The offset returned points
916 to a unconnected list_struct within the database with room for at
917 least length bytes of total data
919 0 is returned if the space could not be allocated
921 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
922 struct list_struct *rec)
924 tdb_off rec_ptr, last_ptr, newrec_ptr;
926 tdb_off rec_ptr, last_ptr;
930 if (tdb_lock(tdb, -1, F_WRLCK) == -1)
933 /* Extra bytes required for tailer */
934 length += sizeof(tdb_off);
937 last_ptr = FREELIST_TOP;
939 /* read in the freelist top */
940 if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
946 this is a best fit allocation strategy. Originally we used
947 a first fit strategy, but it suffered from massive fragmentation
948 issues when faced with a slowly increasing record size.
951 if (rec_free_read(tdb, rec_ptr, rec) == -1) {
955 if (rec->rec_len >= length) {
956 if (bestfit.rec_ptr == 0 ||
957 rec->rec_len < bestfit.rec_len) {
958 bestfit.rec_len = rec->rec_len;
959 bestfit.rec_ptr = rec_ptr;
960 bestfit.last_ptr = last_ptr;
961 /* consider a fit to be good enough if we aren't wasting more than half the space */
962 if (bestfit.rec_len < 2*length) {
968 /* move to the next record */
973 if (bestfit.rec_ptr != 0) {
974 if (rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
978 newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
979 tdb_unlock(tdb, -1, F_WRLCK);
983 /* we didn't find enough space. See if we can expand the
984 database and if we can then try again */
985 if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
988 tdb_unlock(tdb, -1, F_WRLCK);
992 /* initialise a new database with a specified hash size */
993 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
995 struct tdb_header *newdb;
998 /* We make it up in memory, then write it out if not internal */
999 size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
1000 if (!(newdb = calloc(size, 1)))
1001 return TDB_ERRCODE(TDB_ERR_OOM, -1);
1003 /* Fill in the header */
1004 newdb->version = TDB_VERSION;
1005 newdb->hash_size = hash_size;
1006 #ifdef USE_SPINLOCKS
1007 newdb->rwlocks = size;
1009 if (tdb->flags & TDB_INTERNAL) {
1010 tdb->map_size = size;
1011 tdb->map_ptr = (char *)newdb;
1012 memcpy(&tdb->header, newdb, sizeof(tdb->header));
1013 /* Convert the `ondisk' version if asked. */
1017 if (lseek(tdb->fd, 0, SEEK_SET) == -1)
1020 if (ftruncate(tdb->fd, 0) == -1)
1023 /* This creates an endian-converted header, as if read from disk */
1025 memcpy(&tdb->header, newdb, sizeof(tdb->header));
1026 /* Don't endian-convert the magic food! */
1027 memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
1028 if (write(tdb->fd, newdb, size) != size)
1031 ret = tdb_create_rwlocks(tdb->fd, hash_size);
1038 /* Returns 0 on fail. On success, return offset of record, and fills
1040 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
1041 struct list_struct *r)
1045 /* read in the hash top */
1046 if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
1049 /* keep looking until we find the right record */
1051 if (rec_read(tdb, rec_ptr, r) == -1)
1054 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
1056 /* a very likely hit - read the key */
1057 k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
1062 if (memcmp(key.dptr, k, key.dsize) == 0) {
1070 return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
1073 /* If they do lockkeys, check that this hash is one they locked */
1074 static int tdb_keylocked(TDB_CONTEXT *tdb, u32 hash)
1077 if (!tdb->lockedkeys)
1079 for (i = 0; i < tdb->lockedkeys[0]; i++)
1080 if (tdb->lockedkeys[i+1] == hash)
1082 return TDB_ERRCODE(TDB_ERR_NOLOCK, 0);
1085 /* As tdb_find, but if you succeed, keep the lock */
1086 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
1087 struct list_struct *rec)
1091 if (!tdb_keylocked(tdb, hash))
1093 if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
1095 if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
1096 tdb_unlock(tdb, BUCKET(hash), locktype);
1100 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
1105 static struct tdb_errname {
1106 enum TDB_ERROR ecode; const char *estring;
1107 } emap[] = { {TDB_SUCCESS, "Success"},
1108 {TDB_ERR_CORRUPT, "Corrupt database"},
1109 {TDB_ERR_IO, "IO Error"},
1110 {TDB_ERR_LOCK, "Locking error"},
1111 {TDB_ERR_OOM, "Out of memory"},
1112 {TDB_ERR_EXISTS, "Record exists"},
1113 {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1114 {TDB_ERR_NOEXIST, "Record does not exist"} };
1116 /* Error string for the last tdb error */
1117 const char *tdb_errorstr(TDB_CONTEXT *tdb)
1120 for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1121 if (tdb->ecode == emap[i].ecode)
1122 return emap[i].estring;
1123 return "Invalid error code";
1126 /* update an entry in place - this only works if the new data size
1127 is <= the old data size and the key exists.
1128 on failure return -1.
1131 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
1133 struct list_struct rec;
1137 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1140 /* must be long enough key, data and tailer */
1141 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1142 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1146 if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1147 dbuf.dptr, dbuf.dsize) == -1)
1150 if (dbuf.dsize != rec.data_len) {
1152 rec.data_len = dbuf.dsize;
1153 return rec_write(tdb, rec_ptr, &rec);
1159 /* find an entry in the database given a key */
1160 /* If an entry doesn't exist tdb_err will be set to
1161 * TDB_ERR_NOEXIST. If a key has no data attached
1162 * tdb_err will not be set. Both will return a
1163 * zero pptr and zero dsize.
1166 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1169 struct list_struct rec;
1173 /* find which hash bucket it is in */
1174 hash = tdb_hash(&key);
1175 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1179 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1183 ret.dsize = rec.data_len;
1184 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1188 /* check if an entry in the database exists
1190 note that 1 is returned if the key is found and 0 is returned if not found
1191 this doesn't match the conventions in the rest of this module, but is
1192 compatible with gdbm
1194 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1196 struct list_struct rec;
1198 if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1200 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1204 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1206 u32 hash = tdb_hash(&key);
1207 return tdb_exists_hash(tdb, key, hash);
1210 /* record lock stops delete underneath */
1211 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1213 return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1216 Write locks override our own fcntl readlocks, so check it here.
1217 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1218 an error to fail to get the lock here.
1221 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1223 struct tdb_traverse_lock *i;
1224 for (i = &tdb->travlocks; i; i = i->next)
1227 return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1231 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1232 an error to fail to get the lock here.
1235 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1237 return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1239 /* fcntl locks don't stack: avoid unlocking someone else's */
1240 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1242 struct tdb_traverse_lock *i;
1247 for (i = &tdb->travlocks; i; i = i->next)
1250 return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1253 /* actually delete an entry in the database given the offset */
1254 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1256 tdb_off last_ptr, i;
1257 struct list_struct lastrec;
1259 if (tdb->read_only) return -1;
1261 if (write_lock_record(tdb, rec_ptr) == -1) {
1262 /* Someone traversing here: mark it as dead */
1263 rec->magic = TDB_DEAD_MAGIC;
1264 return rec_write(tdb, rec_ptr, rec);
1266 if (write_unlock_record(tdb, rec_ptr) != 0)
1269 /* find previous record in hash chain */
1270 if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1272 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1273 if (rec_read(tdb, i, &lastrec) == -1)
1276 /* unlink it: next ptr is at start of record. */
1278 last_ptr = TDB_HASH_TOP(rec->full_hash);
1279 if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1282 /* recover the space */
1283 if (tdb_free(tdb, rec_ptr, rec) == -1)
1288 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1289 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1290 struct list_struct *rec)
1292 int want_next = (tlock->off != 0);
1294 /* No traversal allows if you've called tdb_lockkeys() */
1295 if (tdb->lockedkeys)
1296 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1298 /* Lock each chain from the start one. */
1299 for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1300 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1303 /* No previous record? Start at top of chain. */
1305 if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1309 /* Otherwise unlock the previous record. */
1310 if (unlock_record(tdb, tlock->off) != 0)
1315 /* We have offset of old record: grab next */
1316 if (rec_read(tdb, tlock->off, rec) == -1)
1318 tlock->off = rec->next;
1321 /* Iterate through chain */
1322 while( tlock->off) {
1324 if (rec_read(tdb, tlock->off, rec) == -1)
1326 if (!TDB_DEAD(rec)) {
1327 /* Woohoo: we found one! */
1328 if (lock_record(tdb, tlock->off) != 0)
1332 /* Try to clean dead ones from old traverses */
1333 current = tlock->off;
1334 tlock->off = rec->next;
1335 if (!tdb->read_only &&
1336 do_delete(tdb, current, rec) != 0)
1339 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1342 /* We finished iteration without finding anything */
1343 return TDB_ERRCODE(TDB_SUCCESS, 0);
1347 if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1348 TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1352 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1353 return -1 on error or the record count traversed
1354 if fn is NULL then it is not called
1355 a non-zero return value from fn() indicates that the traversal should stop
1357 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
1360 struct list_struct rec;
1361 struct tdb_traverse_lock tl = { NULL, 0, 0 };
1364 /* This was in the initializaton, above, but the IRIX compiler
1365 * did not like it. crh
1367 tl.next = tdb->travlocks.next;
1369 /* fcntl locks don't stack: beware traverse inside traverse */
1370 tdb->travlocks.next = &tl;
1372 /* tdb_next_lock places locks on the record returned, and its chain */
1373 while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1375 /* now read the full record */
1376 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1377 rec.key_len + rec.data_len);
1380 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1382 if (unlock_record(tdb, tl.off) != 0)
1383 TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1386 key.dsize = rec.key_len;
1387 dbuf.dptr = key.dptr + rec.key_len;
1388 dbuf.dsize = rec.data_len;
1390 /* Drop chain lock, call out */
1391 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1395 if (fn && fn(tdb, key, dbuf, private)) {
1396 /* They want us to terminate traversal */
1398 if (unlock_record(tdb, tl.off) != 0) {
1399 TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1402 tdb->travlocks.next = tl.next;
1403 SAFE_FREE(key.dptr);
1406 SAFE_FREE(key.dptr);
1409 tdb->travlocks.next = tl.next;
1416 /* find the first entry in the database and return its key */
1417 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1420 struct list_struct rec;
1422 /* release any old lock */
1423 if (unlock_record(tdb, tdb->travlocks.off) != 0)
1425 tdb->travlocks.off = tdb->travlocks.hash = 0;
1427 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1429 /* now read the key */
1430 key.dsize = rec.key_len;
1431 key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1432 if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1433 TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1437 /* find the next entry in the database, returning its key */
1438 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1441 TDB_DATA key = tdb_null;
1442 struct list_struct rec;
1445 /* Is locked key the old key? If so, traverse will be reliable. */
1446 if (tdb->travlocks.off) {
1447 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1449 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1450 || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1452 || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1453 /* No, it wasn't: unlock it and start from scratch */
1454 if (unlock_record(tdb, tdb->travlocks.off) != 0)
1456 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1458 tdb->travlocks.off = 0;
1464 if (!tdb->travlocks.off) {
1465 /* No previous element: do normal find, and lock record */
1466 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb_hash(&oldkey), F_WRLCK, &rec);
1467 if (!tdb->travlocks.off)
1469 tdb->travlocks.hash = BUCKET(rec.full_hash);
1470 if (lock_record(tdb, tdb->travlocks.off) != 0) {
1471 TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1475 oldhash = tdb->travlocks.hash;
1477 /* Grab next record: locks chain and returned record,
1478 unlocks old record */
1479 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1480 key.dsize = rec.key_len;
1481 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1483 /* Unlock the chain of this new record */
1484 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1485 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1487 /* Unlock the chain of old record */
1488 if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1489 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1493 /* delete an entry in the database given a key */
1494 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1497 struct list_struct rec;
1500 if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1502 ret = do_delete(tdb, rec_ptr, &rec);
1503 if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1504 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1508 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1510 u32 hash = tdb_hash(&key);
1511 return tdb_delete_hash(tdb, key, hash);
1514 /* store an element in the database, replacing any existing element
1517 return 0 on success, -1 on failure
1519 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1521 struct list_struct rec;
1527 /* find which hash bucket it is in */
1528 hash = tdb_hash(&key);
1529 if (!tdb_keylocked(tdb, hash))
1531 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1534 /* check for it existing, on insert. */
1535 if (flag == TDB_INSERT) {
1536 if (tdb_exists_hash(tdb, key, hash)) {
1537 tdb->ecode = TDB_ERR_EXISTS;
1541 /* first try in-place update, on modify or replace. */
1542 if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1544 if (tdb->ecode == TDB_ERR_NOEXIST &&
1545 flag == TDB_MODIFY) {
1546 /* if the record doesn't exist and we are in TDB_MODIFY mode then
1547 we should fail the store */
1551 /* reset the error code potentially set by the tdb_update() */
1552 tdb->ecode = TDB_SUCCESS;
1554 /* delete any existing record - if it doesn't exist we don't
1555 care. Doing this first reduces fragmentation, and avoids
1556 coalescing with `allocated' block before it's updated. */
1557 if (flag != TDB_INSERT)
1558 tdb_delete_hash(tdb, key, hash);
1560 /* Copy key+value *before* allocating free space in case malloc
1561 fails and we are left with a dead spot in the tdb. */
1563 if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1564 tdb->ecode = TDB_ERR_OOM;
1568 memcpy(p, key.dptr, key.dsize);
1570 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1572 /* we have to allocate some space */
1573 if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1576 /* Read hash top into next ptr */
1577 if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1580 rec.key_len = key.dsize;
1581 rec.data_len = dbuf.dsize;
1582 rec.full_hash = hash;
1583 rec.magic = TDB_MAGIC;
1585 /* write out and point the top of the hash chain at it */
1586 if (rec_write(tdb, rec_ptr, &rec) == -1
1587 || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1588 || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1589 /* Need to tdb_unallocate() here */
1594 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1601 /* Attempt to append data to an entry in place - this only works if the new data size
1602 is <= the old data size and the key exists.
1603 on failure return -1. Record must be locked before calling.
1605 static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
1607 struct list_struct rec;
1611 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1614 /* Append of 0 is always ok. */
1615 if (new_dbuf.dsize == 0)
1618 /* must be long enough for key, old data + new data and tailer */
1619 if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1621 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1625 if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1626 new_dbuf.dptr, new_dbuf.dsize) == -1)
1630 rec.data_len += new_dbuf.dsize;
1631 return rec_write(tdb, rec_ptr, &rec);
1634 /* Append to an entry. Create if not exist. */
1636 int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1638 struct list_struct rec;
1643 size_t new_data_size = 0;
1645 /* find which hash bucket it is in */
1646 hash = tdb_hash(&key);
1647 if (!tdb_keylocked(tdb, hash))
1649 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1652 /* first try in-place. */
1653 if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
1656 /* reset the error code potentially set by the tdb_append_inplace() */
1657 tdb->ecode = TDB_SUCCESS;
1660 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
1661 if (tdb->ecode != TDB_ERR_NOEXIST)
1664 /* Not found - create. */
1666 ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
1670 new_data_size = rec.data_len + new_dbuf.dsize;
1672 /* Copy key+old_value+value *before* allocating free space in case malloc
1673 fails and we are left with a dead spot in the tdb. */
1675 if (!(p = (char *)malloc(key.dsize + new_data_size))) {
1676 tdb->ecode = TDB_ERR_OOM;
1680 /* Copy the key in place. */
1681 memcpy(p, key.dptr, key.dsize);
1683 /* Now read the old data into place. */
1685 tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
1688 /* Finally append the new data. */
1690 memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
1692 /* delete any existing record - if it doesn't exist we don't
1693 care. Doing this first reduces fragmentation, and avoids
1694 coalescing with `allocated' block before it's updated. */
1696 tdb_delete_hash(tdb, key, hash);
1698 if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
1701 /* Read hash top into next ptr */
1702 if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1705 rec.key_len = key.dsize;
1706 rec.data_len = new_data_size;
1707 rec.full_hash = hash;
1708 rec.magic = TDB_MAGIC;
1710 /* write out and point the top of the hash chain at it */
1711 if (rec_write(tdb, rec_ptr, &rec) == -1
1712 || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
1713 || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1714 /* Need to tdb_unallocate() here */
1720 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1728 static int tdb_already_open(dev_t device,
1733 for (i = tdbs; i; i = i->next) {
1734 if (i->device == device && i->inode == ino) {
1742 /* open the database, creating it if necessary
1744 The open_flags and mode are passed straight to the open call on the
1745 database file. A flags value of O_WRONLY is invalid. The hash size
1746 is advisory, use zero for a default value.
1748 Return is NULL on error, in which case errno is also set. Don't
1749 try to call tdb_error or tdb_errname, just do strerror(errno).
1751 @param name may be NULL for internal databases. */
1752 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1753 int open_flags, mode_t mode)
1755 return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL);
1759 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1760 int open_flags, mode_t mode,
1761 tdb_log_func log_fn)
1765 int rev = 0, locked = 0;
1769 if (!(tdb = calloc(1, sizeof *tdb))) {
1770 /* Can't log this */
1776 tdb->map_ptr = NULL;
1777 tdb->lockedkeys = NULL;
1778 tdb->flags = tdb_flags;
1779 tdb->open_flags = open_flags;
1780 tdb->log_fn = log_fn;
1782 if ((open_flags & O_ACCMODE) == O_WRONLY) {
1783 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1790 hash_size = DEFAULT_HASH_SIZE;
1791 if ((open_flags & O_ACCMODE) == O_RDONLY) {
1793 /* read only databases don't do locking or clear if first */
1794 tdb->flags |= TDB_NOLOCK;
1795 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1798 /* internal databases don't mmap or lock, and start off cleared */
1799 if (tdb->flags & TDB_INTERNAL) {
1800 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1801 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1802 if (tdb_new_database(tdb, hash_size) != 0) {
1803 TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1809 if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1810 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1811 name, strerror(errno)));
1812 goto fail; /* errno set by open(2) */
1815 /* ensure there is only one process initialising at once */
1816 if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1817 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1818 name, strerror(errno)));
1819 goto fail; /* errno set by tdb_brlock */
1822 /* we need to zero database if we are the only one with it open */
1823 if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
1824 (locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
1825 open_flags |= O_CREAT;
1826 if (ftruncate(tdb->fd, 0) == -1) {
1827 TDB_LOG((tdb, 0, "tdb_open_ex: "
1828 "failed to truncate %s: %s\n",
1829 name, strerror(errno)));
1830 goto fail; /* errno set by ftruncate */
1834 if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1835 || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1836 || (tdb->header.version != TDB_VERSION
1837 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1838 /* its not a valid database - possibly initialise it */
1839 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1840 errno = EIO; /* ie bad format or something */
1843 rev = (tdb->flags & TDB_CONVERT);
1845 vp = (uint8_t *)&tdb->header.version;
1846 vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1847 (((u32)vp[2]) << 8) | (u32)vp[3];
1848 tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1850 tdb->flags &= ~TDB_CONVERT;
1852 tdb->flags |= TDB_CONVERT;
1853 convert(&tdb->header, sizeof(tdb->header));
1855 if (fstat(tdb->fd, &st) == -1)
1858 /* Is it already in the open list? If so, fail. */
1859 if (tdb_already_open(st.st_dev, st.st_ino)) {
1860 TDB_LOG((tdb, 2, "tdb_open_ex: "
1861 "%s (%d,%d) is already open in this process\n",
1862 name, st.st_dev, st.st_ino));
1867 if (!(tdb->name = (char *)strdup(name))) {
1872 tdb->map_size = st.st_size;
1873 tdb->device = st.st_dev;
1874 tdb->inode = st.st_ino;
1875 tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1877 TDB_LOG((tdb, 2, "tdb_open_ex: "
1878 "failed to allocate lock structure for %s\n",
1885 if (!tdb->read_only)
1886 if (tdb_clear_spinlocks(tdb) != 0) {
1887 TDB_LOG((tdb, 0, "tdb_open_ex: "
1888 "failed to clear spinlock\n"));
1891 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1892 TDB_LOG((tdb, 0, "tdb_open_ex: "
1893 "failed to take ACTIVE_LOCK on %s: %s\n",
1894 name, strerror(errno)));
1900 /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
1901 we didn't get the initial exclusive lock as we need to let all other
1902 users know we're using it. */
1904 if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1905 /* leave this lock in place to indicate it's in use */
1906 if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1912 /* Internal (memory-only) databases skip all the code above to
1913 * do with disk files, and resume here by releasing their
1914 * global lock and hooking into the active list. */
1915 if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1922 { int save_errno = errno;
1928 if (tdb->flags & TDB_INTERNAL)
1929 SAFE_FREE(tdb->map_ptr);
1933 SAFE_FREE(tdb->name);
1935 if (close(tdb->fd) != 0)
1936 TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1937 SAFE_FREE(tdb->locked);
1947 * @returns -1 for error; 0 for success.
1949 int tdb_close(TDB_CONTEXT *tdb)
1955 if (tdb->flags & TDB_INTERNAL)
1956 SAFE_FREE(tdb->map_ptr);
1960 SAFE_FREE(tdb->name);
1962 ret = close(tdb->fd);
1963 SAFE_FREE(tdb->locked);
1964 SAFE_FREE(tdb->lockedkeys);
1966 /* Remove from contexts list */
1967 for (i = &tdbs; *i; i = &(*i)->next) {
1974 memset(tdb, 0, sizeof(*tdb));
1980 /* lock/unlock entire database */
1981 int tdb_lockall(TDB_CONTEXT *tdb)
1985 /* There are no locks on read-only dbs */
1987 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1988 if (tdb->lockedkeys)
1989 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1990 for (i = 0; i < tdb->header.hash_size; i++)
1991 if (tdb_lock(tdb, i, F_WRLCK))
1994 /* If error, release locks we have... */
1995 if (i < tdb->header.hash_size) {
1998 for ( j = 0; j < i; j++)
1999 tdb_unlock(tdb, j, F_WRLCK);
2000 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
2005 void tdb_unlockall(TDB_CONTEXT *tdb)
2008 for (i=0; i < tdb->header.hash_size; i++)
2009 tdb_unlock(tdb, i, F_WRLCK);
2012 int tdb_lockkeys(TDB_CONTEXT *tdb, u32 number, TDB_DATA keys[])
2016 /* Can't lock more keys if already locked */
2017 if (tdb->lockedkeys)
2018 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
2019 if (!(tdb->lockedkeys = malloc(sizeof(u32) * (number+1))))
2020 return TDB_ERRCODE(TDB_ERR_OOM, -1);
2021 /* First number in array is # keys */
2022 tdb->lockedkeys[0] = number;
2024 /* Insertion sort by bucket */
2025 for (i = 0; i < number; i++) {
2026 hash = tdb_hash(&keys[i]);
2027 for (j = 0; j < i && BUCKET(tdb->lockedkeys[j+1]) < BUCKET(hash); j++);
2028 memmove(&tdb->lockedkeys[j+2], &tdb->lockedkeys[j+1], sizeof(u32) * (i-j));
2029 tdb->lockedkeys[j+1] = hash;
2031 /* Finally, lock in order */
2032 for (i = 0; i < number; i++)
2033 if (tdb_lock(tdb, i, F_WRLCK))
2036 /* If error, release locks we have... */
2038 for ( j = 0; j < i; j++)
2039 tdb_unlock(tdb, j, F_WRLCK);
2040 SAFE_FREE(tdb->lockedkeys);
2041 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
2046 /* Unlock the keys previously locked by tdb_lockkeys() */
2047 void tdb_unlockkeys(TDB_CONTEXT *tdb)
2050 if (!tdb->lockedkeys)
2052 for (i = 0; i < tdb->lockedkeys[0]; i++)
2053 tdb_unlock(tdb, tdb->lockedkeys[i+1], F_WRLCK);
2054 SAFE_FREE(tdb->lockedkeys);
2057 /* lock/unlock one hash chain. This is meant to be used to reduce
2058 contention - it cannot guarantee how many records will be locked */
2059 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
2061 return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
2064 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
2066 return tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
2069 int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
2071 return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_RDLCK);
2074 int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
2076 return tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_RDLCK);
2080 /* register a loging function */
2081 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
2087 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
2088 seek pointer from our parent and to re-establish locks */
2089 int tdb_reopen(TDB_CONTEXT *tdb)
2093 if (tdb->flags & TDB_INTERNAL)
2094 return 0; /* Nothing to do. */
2095 if (tdb_munmap(tdb) != 0) {
2096 TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
2099 if (close(tdb->fd) != 0)
2100 TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
2101 tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
2102 if (tdb->fd == -1) {
2103 TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
2106 if (fstat(tdb->fd, &st) != 0) {
2107 TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
2110 if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
2111 TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
2115 if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
2116 TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
2127 /* reopen all tdb's */
2128 int tdb_reopen_all(void)
2132 for (tdb=tdbs; tdb; tdb = tdb->next) {
2133 /* Ensure no clear-if-first. */
2134 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
2135 if (tdb_reopen(tdb) != 0)