2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
29 #include "tdb_private.h"
32 * We prepend the mutex area, so fixup offsets. See mutex.c for details.
33 * tdb->hdr_ofs is 0 or header.mutex_size.
35 * Note that we only have the 4GB limit of tdb_off_t for
36 * tdb->map_size. The file size on disk can be 4GB + tdb->hdr_ofs!
/*
 * Shift *off by tdb->hdr_ofs, computing the sum in a temporary so that
 * an arithmetic wrap can be detected.
 * NOTE(review): callers treat a false return as failure and then use the
 * offset they passed in, so presumably *off is updated on success - confirm.
 */
39 static bool tdb_adjust_offset(struct tdb_context *tdb, off_t *off)
41 off_t tmp = tdb->hdr_ofs + *off;
/* A sum smaller than either operand is treated as a wrapped addition. */
43 if ((tmp < tdb->hdr_ofs) || (tmp < *off)) {
/*
 * pwrite() on tdb->fd with the offset first passed through
 * tdb_adjust_offset(). On the normal path the result of pwrite() is
 * returned unchanged.
 * NOTE(review): the handling of a failed offset adjustment is not shown
 * here - confirm what is returned and whether errno is set.
 */
52 static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf,
53 size_t count, off_t offset)
55 if (!tdb_adjust_offset(tdb, &offset)) {
58 return pwrite(tdb->fd, buf, count, offset);
/*
 * pread() on tdb->fd with the offset first passed through
 * tdb_adjust_offset(). On the normal path the result of pread() is
 * returned unchanged.
 * NOTE(review): the handling of a failed offset adjustment is not shown
 * here - confirm what is returned and whether errno is set.
 */
61 static ssize_t tdb_pread(struct tdb_context *tdb, void *buf,
62 size_t count, off_t offset)
64 if (!tdb_adjust_offset(tdb, &offset)) {
67 return pread(tdb->fd, buf, count, offset);
/*
 * ftruncate() on tdb->fd. The requested length is passed through
 * tdb_adjust_offset() first, so the length handed to ftruncate() includes
 * tdb->hdr_ofs (assuming the adjustment stores its result - see
 * tdb_adjust_offset).
 */
70 static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
72 if (!tdb_adjust_offset(tdb, &length)) {
75 return ftruncate(tdb->fd, length);
/*
 * fstat() on tdb->fd, then subtract tdb->hdr_ofs from st_size so the
 * reported size is the reverse of the shift applied by
 * tdb_adjust_offset().
 */
78 static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
82 ret = fstat(tdb->fd, buf);
/* A file shorter than hdr_ofs is tested for before the subtraction below. */
87 if (buf->st_size < tdb->hdr_ofs) {
91 buf->st_size -= tdb->hdr_ofs;
/*
 * Bounds check of the range starting at off with length len against
 * tdb->map_size. If the range is not inside the current map and the
 * database is not TDB_INTERNAL, the file size is re-read with tdb_fstat()
 * and the mapping is dropped and re-created with that size before the
 * range is tested again against the file size.
 */
96 /* check for an out of bounds access - if it is out of bounds then
97 see if the database has been expanded by someone else and expand
100 static int tdb_oob(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len,
104 if (len + off < len) {
106 /* Ensure ecode is set for log fn. */
107 tdb->ecode = TDB_ERR_IO;
108 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob off %u len %u wrap\n",
/* The requested range is compared with the current map size. */
114 if (off + len <= tdb->map_size)
/* Internal databases have no file to consult; the message below
 * reports the range against the in-memory size. */
116 if (tdb->flags & TDB_INTERNAL) {
118 /* Ensure ecode is set for log fn. */
119 tdb->ecode = TDB_ERR_IO;
120 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond internal malloc size %u\n",
/* NOTE(review): the values are cast to int but printed with %u - confirm
 * whether the casts should be (unsigned). */
121 (int)(off + len), (int)tdb->map_size));
/* Fetch the current file size (tdb_fstat subtracts tdb->hdr_ofs). */
126 if (tdb_fstat(tdb, &st) == -1) {
127 tdb->ecode = TDB_ERR_IO;
131 /* Beware >4G files! */
/* Round-tripping st_size through tdb_off_t detects sizes that do not fit. */
132 if ((tdb_off_t)st.st_size != st.st_size) {
133 /* Ensure ecode is set for log fn. */
134 tdb->ecode = TDB_ERR_IO;
135 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_oob len %llu too large!\n",
136 (long long)st.st_size));
140 /* Unmap, update size, remap. We do this unconditionally, to handle
141 * the unusual case where the db is truncated.
143 * This can happen to a child using tdb_reopen_all(true) on a
144 * TDB_CLEAR_IF_FIRST tdb whose parent crashes: the next
145 * opener will truncate the database. */
146 if (tdb_munmap(tdb) == -1) {
147 tdb->ecode = TDB_ERR_IO;
/* The size just read from the file becomes the new map size. */
150 tdb->map_size = st.st_size;
151 if (tdb_mmap(tdb) != 0) {
/* After the refresh the range is tested against the file size itself. */
155 if (st.st_size < (size_t)off + len) {
157 /* Ensure ecode is set for log fn. */
158 tdb->ecode = TDB_ERR_IO;
159 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond eof at %u\n",
/* NOTE(review): as above, int casts are paired with %u conversions. */
160 (int)(off + len), (int)st.st_size));
167 /* write a lump of data at a specified offset */
/*
 * Copies len bytes from buf to offset off of the database, either with
 * memcpy() into tdb->map_ptr or with tdb_pwrite() on the file. A short
 * tdb_pwrite() is retried once for the remaining bytes.
 * NOTE(review): the condition selecting between the memcpy and pwrite
 * paths is not shown - presumably a check of tdb->map_ptr; confirm.
 */
168 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
169 const void *buf, tdb_len_t len)
/* Writes are refused with TDB_ERR_RDONLY when the handle is read-only or
 * tdb->traverse_read is set. */
175 if (tdb->read_only || tdb->traverse_read) {
176 tdb->ecode = TDB_ERR_RDONLY;
/* Range check through the method table, so an alternative method table
 * can supply its own tdb_oob. */
180 if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0)
/* Mapped path: copy straight into the map at byte offset off. */
184 memcpy(off + (char *)tdb->map_ptr, buf, len);
/* With incoherent mmap the code below sets TDB_ERR_IO. */
186 #ifdef HAVE_INCOHERENT_MMAP
187 tdb->ecode = TDB_ERR_IO;
/* File path: write through the descriptor. */
192 written = tdb_pwrite(tdb, buf, len, off);
/* A short write (neither complete nor -1) is logged and retried once
 * for the part that was not written. */
194 if ((written != (ssize_t)len) && (written != -1)) {
196 tdb->ecode = TDB_ERR_IO;
197 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
198 "%zi of %u bytes at %u, trying once more\n",
200 written = tdb_pwrite(tdb, (const char *)buf+written,
201 len-written, off+written);
204 /* Ensure ecode is set for log fn. */
205 tdb->ecode = TDB_ERR_IO;
206 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %u "
207 "len=%u (%s)\n", off, len, strerror(errno)));
/* A write that did not fail outright but still has the wrong length is
 * reported as a failure after two attempts. */
209 } else if (written != (ssize_t)len) {
210 tdb->ecode = TDB_ERR_IO;
211 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
212 "write %u bytes at %u in two attempts\n",
221 /* Endian conversion: we only ever deal with 4 byte quantities */
/*
 * Applies TDB_BYTEREV in place to each of the size / 4 32-bit words of
 * buf. Any trailing size % 4 bytes are not touched by the loop.
 * NOTE(review): buf is accessed as uint32_t, so presumably callers pass
 * suitably aligned buffers - confirm.
 */
222 void *tdb_convert(void *buf, uint32_t size)
224 uint32_t i, *p = (uint32_t *)buf;
225 for (i = 0; i < size / 4; i++)
226 p[i] = TDB_BYTEREV(p[i]);
231 /* read a lump of data at a specified offset, maybe convert */
/*
 * Copies len bytes from offset off of the database into buf, either with
 * memcpy() from tdb->map_ptr or with tdb_pread() on the file, and can run
 * tdb_convert() over the result.
 * NOTE(review): the conditions guarding the memcpy path and the
 * tdb_convert() call are not shown - presumably tdb->map_ptr and cv;
 * confirm.
 */
232 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
233 tdb_len_t len, int cv)
/* Range check through the method table before touching any data. */
235 if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0) {
/* Mapped path: copy straight out of the map at byte offset off. */
240 memcpy(buf, off + (char *)tdb->map_ptr, len);
/* With incoherent mmap the code below sets TDB_ERR_IO. */
242 #ifdef HAVE_INCOHERENT_MMAP
243 tdb->ecode = TDB_ERR_IO;
/* File path: anything other than a full-length read is an error. */
248 ret = tdb_pread(tdb, buf, len, off);
249 if (ret != (ssize_t)len) {
250 /* Ensure ecode is set for log fn. */
251 tdb->ecode = TDB_ERR_IO;
252 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %u "
253 "len=%u ret=%zi (%s) map_size=%u\n",
254 off, len, ret, strerror(errno),
/* Byte-swap the data that was just read. */
261 tdb_convert(buf, len);
269 do an unlocked scan of the hash table heads to find the next non-zero head. The value
270 will then be confirmed with the lock held
/*
 * Scans hash chain heads from index h up to tdb->hash_size looking for a
 * non-zero head. Two scan loops are present: one reads the 32-bit head
 * directly from tdb->map_ptr, the other goes through tdb_ofs_read().
 * NOTE(review): how h is initialised from *chain, which loop is chosen,
 * and how the result is stored back are not shown - confirm.
 */
272 static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
/* Direct scan of the mapped hash table. */
276 for (;h < tdb->hash_size;h++) {
277 if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
/* Scan through the read method; a failed read also satisfies the
 * condition, as does a non-zero head. */
283 for (;h < tdb->hash_size;h++) {
284 if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
/*
 * Drops the current mapping with munmap(tdb->map_ptr, tdb->map_size).
 * TDB_INTERNAL databases are tested for first.
 * NOTE(review): the internal-database branch and the handling of the
 * munmap() result are not shown - confirm.
 */
293 int tdb_munmap(struct tdb_context *tdb)
295 if (tdb->flags & TDB_INTERNAL)
302 ret = munmap(tdb->map_ptr, tdb->map_size);
311 /* If mmap isn't coherent, *everyone* must always mmap. */
/*
 * Decides whether the database should be mapped. The visible return maps
 * unless TDB_NOMMAP is set in tdb->flags.
 * NOTE(review): the HAVE_INCOHERENT_MMAP branch is not shown; going by
 * the comment above it presumably always answers true - confirm.
 */
312 static bool should_mmap(const struct tdb_context *tdb)
314 #ifdef HAVE_INCOHERENT_MMAP
317 return !(tdb->flags & TDB_NOMMAP);
/*
 * Maps tdb->map_size bytes of tdb->fd into tdb->map_ptr when
 * should_mmap() agrees. TDB_INTERNAL databases are tested for first.
 * A failed mmap() is logged as a warning; under HAVE_INCOHERENT_MMAP it
 * additionally sets TDB_ERR_IO.
 */
321 int tdb_mmap(struct tdb_context *tdb)
323 if (tdb->flags & TDB_INTERNAL)
327 if (should_mmap(tdb)) {
/* Shared file mapping; PROT_WRITE is only requested for writable handles. */
328 tdb->map_ptr = mmap(NULL, tdb->map_size,
329 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
330 MAP_SHARED|MAP_FILE, tdb->fd,
334 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
337 if (tdb->map_ptr == MAP_FAILED) {
339 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %u (%s)\n",
340 tdb->map_size, strerror(errno)));
341 #ifdef HAVE_INCOHERENT_MMAP
342 tdb->ecode = TDB_ERR_IO;
355 /* expand a file. we prefer to use ftruncate, as that is what posix
356 says to use for mmap expansion */
/*
 * Grows the file from size by addition bytes. The new length is set with
 * tdb_ftruncate(); if that fails a single byte is written at
 * new_size - 1 instead. Afterwards padding bytes (TDB_PAD_BYTE) are
 * written with tdb_pwrite() starting at size, at most sizeof(buf) bytes
 * per call.
 * NOTE(review): the loop around the padding writes and the updates of
 * size and addition are not shown - confirm.
 */
357 static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition)
/* Expansion is refused with TDB_ERR_RDONLY when the handle is read-only
 * or tdb->traverse_read is set. */
362 if (tdb->read_only || tdb->traverse_read) {
363 tdb->ecode = TDB_ERR_RDONLY;
/* size + addition is computed with an overflow-checked helper. */
367 if (!tdb_add_off_t(size, addition, &new_size)) {
368 tdb->ecode = TDB_ERR_OOM;
369 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
370 "overflow detected current size[%u] addition[%u]!\n",
371 (unsigned)size, (unsigned)addition));
/* Fallback when ftruncate fails: write one byte at the last offset of
 * the new size. */
376 if (tdb_ftruncate(tdb, new_size) == -1) {
378 ssize_t written = tdb_pwrite(tdb, &b, 1, new_size - 1);
380 /* try once more, potentially revealing errno */
381 written = tdb_pwrite(tdb, &b, 1, new_size - 1);
384 /* again - give up, guessing errno */
388 tdb->ecode = TDB_ERR_OOM;
389 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %u failed (%s)\n",
390 (unsigned)new_size, strerror(errno)));
395 /* now fill the file with something. This ensures that the
396 file isn't sparse, which would be very bad if we ran out of
397 disk. This must be done with write, not via mmap */
398 memset(buf, TDB_PAD_BYTE, sizeof(buf));
/* Each write covers the smaller of the remaining addition and the
 * buffer size. */
400 size_t n = addition>sizeof(buf)?sizeof(buf):addition;
401 ssize_t written = tdb_pwrite(tdb, buf, n, size);
403 /* prevent infinite loops: try _once_ more */
404 written = tdb_pwrite(tdb, buf, n, size);
407 /* give up, trying to provide a useful errno */
408 tdb->ecode = TDB_ERR_OOM;
409 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
410 "returned 0 twice: giving up!\n"));
415 tdb->ecode = TDB_ERR_OOM;
416 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of "
417 "%u bytes failed (%s)\n", (int)n,
/* NOTE(review): written is ssize_t and n is size_t, yet the format below
 * uses %zu first and %zi second - the two conversions look swapped
 * relative to the argument types; confirm against the full argument list. */
422 TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: wrote "
423 "only %zu of %zi bytes - retrying\n", written,
433 /* You need 'size', this tells you how much you should expand by. */
/*
 * Computes how many bytes to add to a database of map_size bytes so that
 * at least size more bytes fit. The result is based on the larger of
 * map_size plus a multiple of size and a percentage growth of map_size,
 * rounded with TDB_ALIGN to page_size, and is returned relative to
 * map_size.
 */
434 tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size, int page_size)
436 tdb_off_t new_size, top_size, increment;
/* Room left between the current map size and UINT32_MAX. */
437 tdb_off_t max_size = UINT32_MAX - map_size;
439 if (size > max_size) {
441 * We can't round up anymore, just give back
442 * what we're asked for.
444 * The caller has to take care of the ENOSPC handling.
/* Requests above 100 KiB are doubled, smaller ones multiplied by 100. */
449 /* limit size in order to avoid using up huge amounts of memory for
450 * in memory tdbs if an oddball huge record creeps in */
451 if (size > 100 * 1024) {
452 increment = size * 2;
454 increment = size * 100;
/* An increment smaller than size means the multiplication wrapped. */
456 if (increment < size) {
460 if (!tdb_add_off_t(map_size, increment, &top_size)) {
464 /* always make room for at least top_size more records, and at
465 least 25% more space. if the DB is smaller than 100MiB,
466 otherwise grow it by 10% only. */
467 if (map_size > 100 * 1024 * 1024) {
468 new_size = map_size * 1.10;
470 new_size = map_size * 1.25;
/* NOTE(review): the growth above is computed in floating point and
 * converted back to tdb_off_t. If the product does not fit the integer
 * type, the conversion is undefined in C, so the wrap test below may not
 * be reliable - confirm the width of tdb_off_t and consider integer
 * arithmetic. */
472 if (new_size < map_size) {
476 /* Round the database up to a multiple of the page size */
477 new_size = MAX(top_size, new_size);
479 if (new_size + page_size < new_size) {
480 /* There's a "+" in TDB_ALIGN that might overflow... */
/* The return value is the amount to add, not the new total size. */
484 return TDB_ALIGN(new_size, page_size) - map_size;
488 * Somewhere in between we went over 4GB. Make one big jump to
489 * exactly 4GB database size.
494 /* expand the database at least size bytes by expanding the underlying
495 file and doing the mmap again if necessary */
/*
 * Grows the database under the write lock on list -1: the requested size
 * is adjusted by tdb_expand_adjust(), the storage is enlarged (realloc()
 * for TDB_INTERNAL, otherwise the tdb_expand_file method followed by
 * tdb_mmap()), and the added region is handed to tdb_free() as one
 * record starting at the old map size.
 */
496 int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
498 struct tdb_record rec;
502 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
503 TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
507 /* must know about any previous expansions by another process */
/* NOTE(review): the result of this tdb_oob call is not checked here. */
508 tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
511 * Note: that we don't care about tdb->hdr_ofs != 0 here
513 * The 4GB limitation is just related to tdb->map_size
514 * and the offset calculation in the records.
516 * The file on disk can be up to 4GB + tdb->hdr_ofs
518 size = tdb_expand_adjust(tdb->map_size, size, tdb->page_size);
520 if (!tdb_add_off_t(tdb->map_size, size, &new_size)) {
521 tdb->ecode = TDB_ERR_OOM;
522 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_expand "
523 "overflow detected current map_size[%u] size[%u]!\n",
524 (unsigned)tdb->map_size, (unsigned)size));
528 /* form a new freelist record */
529 offset = tdb->map_size;
530 memset(&rec,'\0',sizeof(rec));
/* The record length excludes the record header itself. */
531 rec.rec_len = size - sizeof(rec);
533 if (tdb->flags & TDB_INTERNAL) {
/* The realloc result goes into a temporary first, and tdb->map_ptr is
 * assigned from it afterwards. */
536 new_map_ptr = (char *)realloc(tdb->map_ptr, new_size);
538 tdb->ecode = TDB_ERR_OOM;
541 tdb->map_ptr = new_map_ptr;
542 tdb->map_size = new_size;
547 * expand the file itself
549 ret = tdb->methods->tdb_expand_file(tdb, tdb->map_size, size);
554 /* Explicitly remap: if we're in a transaction, this won't
555 * happen automatically! */
557 tdb->map_size = new_size;
558 if (tdb_mmap(tdb) != 0) {
563 /* link it into the free list */
564 if (tdb_free(tdb, offset, &rec) == -1)
/* Two unlock calls are present - presumably one per exit path (success
 * and failure); confirm. */
567 tdb_unlock(tdb, -1, F_WRLCK);
570 tdb_unlock(tdb, -1, F_WRLCK);
574 /* read/write a tdb_off_t */
/*
 * Reads one tdb_off_t from the database at offset into *d through the
 * tdb_read method; DOCONV() supplies the conversion flag. Returns the
 * result of the read method.
 */
575 int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
577 return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
/*
 * Writes one tdb_off_t to the database at offset through the tdb_write
 * method and returns its result. The value written is CONVERT(off).
 * NOTE(review): the declaration of off is not shown - presumably a local
 * copy of *d so that the conversion does not modify the caller value;
 * confirm.
 */
580 int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
583 return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
587 /* read a lump of data, allocating the space for it */
/*
 * Allocates a buffer with malloc() and fills it with len bytes read from
 * the database at offset through the tdb_read method (conversion flag 0).
 * A failed allocation sets TDB_ERR_OOM and is logged.
 * NOTE(review): the return statements and the cleanup after a failed
 * read are not shown - presumably the buffer is freed and NULL returned,
 * and on success the caller owns the buffer; confirm.
 */
588 unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
592 /* some systems don't like zero length malloc */
/* A zero len still allocates one byte. */
594 if (!(buf = (unsigned char *)malloc(len ? len : 1))) {
595 /* Ensure ecode is set for log fn. */
596 tdb->ecode = TDB_ERR_OOM;
597 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%u (%s)\n",
598 len, strerror(errno)));
601 if (tdb->methods->tdb_read(tdb, offset, buf, len, 0) == -1) {
608 /* Give a piece of tdb data to a parser */
/*
 * Calls parser(key, data, private_data) with data describing len bytes
 * at offset. Without an active transaction and with a mapping present,
 * data.dptr points directly into the map; otherwise the bytes are read
 * into a buffer from tdb_alloc_read() first.
 * NOTE(review): setting data.dsize and releasing the allocated buffer
 * are not shown - confirm.
 */
610 int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
611 tdb_off_t offset, tdb_len_t len,
612 int (*parser)(TDB_DATA key, TDB_DATA data,
621 if ((tdb->transaction == NULL) && (tdb->map_ptr != NULL)) {
623 * Optimize by avoiding the malloc/memcpy/free, point the
624 * parser directly at the mmap area.
/* The range is validated before a pointer into the map is handed out. */
626 if (tdb->methods->tdb_oob(tdb, offset, len, 0) != 0) {
629 data.dptr = offset + (unsigned char *)tdb->map_ptr;
630 return parser(key, data, private_data);
/* Copying path: read the bytes into a freshly allocated buffer. */
633 if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
637 result = parser(key, data, private_data);
642 /* read/write a record */
/*
 * Reads the record header at offset into *rec through the tdb_read
 * method (conversion flag from DOCONV()), rejects a bad magic with
 * TDB_ERR_CORRUPT, and finally returns the result of a bounds check on
 * the header that rec->next points at.
 */
643 int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
645 if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
647 if (TDB_BAD_MAGIC(rec)) {
648 /* Ensure ecode is set for log fn. */
649 tdb->ecode = TDB_ERR_CORRUPT;
650 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%u\n", rec->magic, offset));
/* The next pointer must leave room for a whole record header. */
653 return tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 0);
/*
 * Writes a record header to the database at offset through the
 * tdb_write method and returns its result.
 */
656 int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
/* CONVERT is applied to a local copy - presumably so that any byte
 * swapping leaves the caller record untouched; confirm against the
 * CONVERT definition. */
658 struct tdb_record r = *rec;
659 return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
/*
 * Method table installed by tdb_io_init().
 * NOTE(review): the initializer entries are not shown - presumably the
 * static tdb_read, tdb_write, tdb_next_hash_chain, tdb_oob and
 * tdb_expand_file functions of this file; confirm.
 */
662 static const struct tdb_methods io_methods = {
671 initialise the default methods table
/* Points tdb->methods at the file-local io_methods table. */
673 void tdb_io_init(struct tdb_context *tdb)
675 tdb->methods = &io_methods;