#include "tdb_private.h"
+/*
+ * We prepend the mutex area, so fixup offsets. See mutex.c for details.
+ * tdb->hdr_ofs is 0 or header.mutex_size.
+ *
+ * Note that the 4GB limit of tdb_off_t applies only to
+ * tdb->map_size. The file size on disk can be 4GB + tdb->hdr_ofs!
+ */
+
+/*
+ * Translate a tdb-relative offset into a file offset by adding
+ * tdb->hdr_ofs. On success *off is updated in place and true is
+ * returned. If the sum compares smaller than either operand the
+ * addition is treated as wrapped: errno is set to EIO, *off is left
+ * untouched and false is returned.
+ */
+static bool tdb_adjust_offset(struct tdb_context *tdb, off_t *off)
+{
+	off_t shifted = tdb->hdr_ofs + *off;
+	bool wrapped = (shifted < tdb->hdr_ofs) || (shifted < *off);
+
+	if (wrapped) {
+		errno = EIO;
+		return false;
+	}
+
+	*off = shifted;
+	return true;
+}
+
+/*
+ * pwrite() to tdb->fd at a tdb-relative offset (shifted via
+ * tdb_adjust_offset()), restarting the call for as long as it fails
+ * with EINTR.
+ *
+ * Returns whatever the final pwrite() returned, or -1 if the offset
+ * could not be adjusted.
+ */
+static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf,
+			  size_t count, off_t offset)
+{
+	if (!tdb_adjust_offset(tdb, &offset)) {
+		return -1;
+	}
+
+	for (;;) {
+		ssize_t nwritten = pwrite(tdb->fd, buf, count, offset);
+
+		/* Anything but an interrupted call ends the loop. */
+		if ((nwritten != -1) || (errno != EINTR)) {
+			return nwritten;
+		}
+	}
+}
+
+/*
+ * pread() from tdb->fd at a tdb-relative offset (shifted via
+ * tdb_adjust_offset()), restarting the call for as long as it fails
+ * with EINTR.
+ *
+ * Returns whatever the final pread() returned, or -1 if the offset
+ * could not be adjusted.
+ */
+static ssize_t tdb_pread(struct tdb_context *tdb, void *buf,
+			 size_t count, off_t offset)
+{
+	if (!tdb_adjust_offset(tdb, &offset)) {
+		return -1;
+	}
+
+	for (;;) {
+		ssize_t nread = pread(tdb->fd, buf, count, offset);
+
+		/* Anything but an interrupted call ends the loop. */
+		if ((nread != -1) || (errno != EINTR)) {
+			return nread;
+		}
+	}
+}
+
+/*
+ * ftruncate() tdb->fd to a tdb-relative length (shifted via
+ * tdb_adjust_offset()), restarting the call while it fails with EINTR.
+ *
+ * Returns the result of the final ftruncate() call, or -1 if the
+ * length could not be adjusted.
+ */
+static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
+{
+	/* int, matching both ftruncate() and our own return type */
+	int ret;
+
+	if (!tdb_adjust_offset(tdb, &length)) {
+		return -1;
+	}
+
+	do {
+		ret = ftruncate(tdb->fd, length);
+	} while ((ret == -1) && (errno == EINTR));
+
+	return ret;
+}
+
+#if HAVE_POSIX_FALLOCATE
+/*
+ * posix_fallocate() on tdb->fd for 'len' bytes starting at a
+ * tdb-relative offset (shifted via tdb_adjust_offset()), retrying
+ * interrupted calls.
+ *
+ * Returns 0 on success, otherwise the value returned by
+ * posix_fallocate() (an error number such as ENOSPC on conforming
+ * systems), or -1 if the offset could not be adjusted.
+ */
+static int tdb_posix_fallocate(struct tdb_context *tdb, off_t offset,
+			       off_t len)
+{
+	int ret;
+
+	if (!tdb_adjust_offset(tdb, &offset)) {
+		return -1;
+	}
+
+	/*
+	 * posix_fallocate() reports failure through its return value
+	 * and does not set errno, so an interrupted call shows up as
+	 * a returned EINTR. The "-1 with errno" form is still accepted
+	 * to tolerate non-conforming implementations.
+	 */
+	do {
+		ret = posix_fallocate(tdb->fd, offset, len);
+	} while ((ret == EINTR) || ((ret == -1) && (errno == EINTR)));
+
+	return ret;
+}
+#endif
+
+/*
+ * fstat() on tdb->fd, reporting st_size relative to the tdb data,
+ * i.e. with tdb->hdr_ofs subtracted.
+ *
+ * Returns -1 if fstat() fails. Also returns -1, with errno set to
+ * EIO, if the file is smaller than tdb->hdr_ofs. Otherwise returns
+ * the result of fstat().
+ */
+static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
+{
+	int rc = fstat(tdb->fd, buf);
+
+	if (rc == -1) {
+		return -1;
+	}
+
+	if (buf->st_size >= tdb->hdr_ofs) {
+		/* Hide the prepended area from the caller. */
+		buf->st_size -= tdb->hdr_ofs;
+		return rc;
+	}
+
+	errno = EIO;
+	return -1;
+}
+
/* check for an out of bounds access - if it is out of bounds then
see if the database has been expanded by someone else and expand
if necessary
if (!probe) {
/* Ensure ecode is set for log fn. */
tdb->ecode = TDB_ERR_IO;
- TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob off %d len %d wrap\n",
- (int)off, (int)len));
+ TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob off %u len %u wrap\n",
+ off, len));
}
return -1;
}
return -1;
}
- if (fstat(tdb->fd, &st) == -1) {
+ if (tdb_fstat(tdb, &st) == -1) {
tdb->ecode = TDB_ERR_IO;
return -1;
}
tdb->ecode = TDB_ERR_IO;
return -1;
#else
- ssize_t written = pwrite(tdb->fd, buf, len, off);
+ ssize_t written;
+
+ written = tdb_pwrite(tdb, buf, len, off);
+
if ((written != (ssize_t)len) && (written != -1)) {
/* try once more */
tdb->ecode = TDB_ERR_IO;
TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
- "%d of %d bytes at %d, trying once more\n",
- (int)written, len, off));
- written = pwrite(tdb->fd, (const char *)buf+written,
- len-written,
- off+written);
+ "%zi of %u bytes at %u, trying once more\n",
+ written, len, off));
+ written = tdb_pwrite(tdb, (const char *)buf+written,
+ len-written, off+written);
}
if (written == -1) {
/* Ensure ecode is set for log fn. */
tdb->ecode = TDB_ERR_IO;
- TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d "
- "len=%d (%s)\n", off, len, strerror(errno)));
+ TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %u "
+ "len=%u (%s)\n", off, len, strerror(errno)));
return -1;
} else if (written != (ssize_t)len) {
tdb->ecode = TDB_ERR_IO;
TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
- "write %d bytes at %d in two attempts\n",
+ "write %u bytes at %u in two attempts\n",
len, off));
return -1;
}
tdb->ecode = TDB_ERR_IO;
return -1;
#else
- ssize_t ret = pread(tdb->fd, buf, len, off);
+ ssize_t ret;
+
+ ret = tdb_pread(tdb, buf, len, off);
if (ret != (ssize_t)len) {
/* Ensure ecode is set for log fn. */
tdb->ecode = TDB_ERR_IO;
- TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %d "
- "len=%d ret=%d (%s) map_size=%d\n",
- (int)off, (int)len, (int)ret, strerror(errno),
- (int)tdb->map_size));
+ TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %u "
+ "len=%u ret=%zi (%s) map_size=%u\n",
+ off, len, ret, strerror(errno),
+ tdb->map_size));
return -1;
}
#endif
if (should_mmap(tdb)) {
tdb->map_ptr = mmap(NULL, tdb->map_size,
PROT_READ|(tdb->read_only? 0:PROT_WRITE),
- MAP_SHARED|MAP_FILE, tdb->fd, 0);
+ MAP_SHARED|MAP_FILE, tdb->fd,
+ tdb->hdr_ofs);
/*
* NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
if (tdb->map_ptr == MAP_FAILED) {
tdb->map_ptr = NULL;
- TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %d (%s)\n",
+ TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %u (%s)\n",
tdb->map_size, strerror(errno)));
#ifdef HAVE_INCOHERENT_MMAP
tdb->ecode = TDB_ERR_IO;
+/*
+ * Grow the file behind the tdb from 'size' to 'size + addition' bytes
+ * (both tdb-relative; the helpers add tdb->hdr_ofs).
+ *
+ * If posix_fallocate() is available and succeeds, that is all that is
+ * done. Otherwise the file is extended with ftruncate() (or, failing
+ * that, a one byte write at the new end) and the new area is filled
+ * with TDB_PAD_BYTE.
+ *
+ * Returns 0 on success and -1 on failure. tdb->ecode is set to
+ * TDB_ERR_RDONLY or TDB_ERR_OOM on the failure paths below, except
+ * when posix_fallocate() reports ENOSPC. The "fail" path cuts the
+ * file back to its original size and preserves errno.
+ */
static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition)
{
char buf[8192];
+ tdb_off_t new_size;
+ int ret;
+ /* 'size' is advanced by the fill loop; remember where we started */
+ tdb_off_t orig_size = size;
if (tdb->read_only || tdb->traverse_read) {
tdb->ecode = TDB_ERR_RDONLY;
return -1;
}
- if (ftruncate(tdb->fd, size+addition) == -1) {
+ if (!tdb_add_off_t(size, addition, &new_size)) {
+ tdb->ecode = TDB_ERR_OOM;
+ TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
+ "overflow detected current size[%u] addition[%u]!\n",
+ (unsigned)size, (unsigned)addition));
+ errno = ENOSPC;
+ return -1;
+ }
+
+#if HAVE_POSIX_FALLOCATE
+ ret = tdb_posix_fallocate(tdb, size, addition);
+ if (ret == 0) {
+ return 0;
+ }
+ if (ret == ENOSPC) {
+ /*
+ * The Linux glibc (at least as of 2.24) fallback if
+ * the file system does not support fallocate does not
+ * reset the file size back to where it was. Also, to
+ * me it is unclear from the posix spec of
+ * posix_fallocate whether this is allowed or
+ * not. Better be safe than sorry: "goto fail" instead
+ * of "return -1" here, so that the file is cut back
+ * rather than leaving the EOF pointer too large.
+ */
+ goto fail;
+ }
+
+ /*
+ * Retry the "old" way. Possibly unnecessary, but looking at
+ * our configure script there seem to be weird failure modes
+ * for posix_fallocate. See commit 3264a98ff16de, which
+ * probably refers to
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=1083.
+ */
+#endif
+
+ ret = tdb_ftruncate(tdb, new_size);
+ if (ret == -1) {
char b = 0;
- ssize_t written = pwrite(tdb->fd, &b, 1, (size+addition) - 1);
+ ssize_t written = tdb_pwrite(tdb, &b, 1, new_size - 1);
if (written == 0) {
/* try once more, potentially revealing errno */
- written = pwrite(tdb->fd, &b, 1, (size+addition) - 1);
+ written = tdb_pwrite(tdb, &b, 1, new_size - 1);
}
if (written == 0) {
/* again - give up, guessing errno */
errno = ENOSPC;
}
if (written != 1) {
- TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %d failed (%s)\n",
- size+addition, strerror(errno)));
+ tdb->ecode = TDB_ERR_OOM;
+ TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %u failed (%s)\n",
+ (unsigned)new_size, strerror(errno)));
return -1;
}
}
memset(buf, TDB_PAD_BYTE, sizeof(buf));
while (addition) {
size_t n = addition>sizeof(buf)?sizeof(buf):addition;
- ssize_t written = pwrite(tdb->fd, buf, n, size);
+ ssize_t written = tdb_pwrite(tdb, buf, n, size);
if (written == 0) {
/* prevent infinite loops: try _once_ more */
- written = pwrite(tdb->fd, buf, n, size);
+ written = tdb_pwrite(tdb, buf, n, size);
}
if (written == 0) {
/* give up, trying to provide a useful errno */
+ tdb->ecode = TDB_ERR_OOM;
TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
"returned 0 twice: giving up!\n"));
errno = ENOSPC;
- return -1;
+ goto fail;
}
if (written == -1) {
+ tdb->ecode = TDB_ERR_OOM;
TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of "
- "%d bytes failed (%s)\n", (int)n,
+ "%u bytes failed (%s)\n", (unsigned)n,
strerror(errno)));
- return -1;
+ goto fail;
}
if (written != n) {
TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: wrote "
- "only %d of %d bytes - retrying\n", (int)written,
- (int)n));
+ "only %zi of %zu bytes - retrying\n", written,
+ n));
}
addition -= written;
size += written;
}
return 0;
+
+fail:
+ {
+ int err = errno;
+
+ /*
+ * We're holding the freelist lock or are inside a
+ * transaction. Cutting the file is safe, the space we
+ * tried to allocate can't have been used anywhere in
+ * the meantime.
+ *
+ * Cut back to the size we started with, not to 'size',
+ * which the fill loop may already have advanced.
+ */
+
+ ret = tdb_ftruncate(tdb, orig_size);
+ if (ret == -1) {
+ TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: "
+ "retruncate to %ju failed\n",
+ (uintmax_t)orig_size));
+ }
+ errno = err;
+ }
+
+ return -1;
}
/* You need 'size', this tells you how much you should expand by. */
tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size, int page_size)
{
- tdb_off_t new_size, top_size;
+ tdb_off_t new_size, top_size, increment;
+ tdb_off_t max_size = UINT32_MAX - map_size;
+
+ if (size > max_size) {
+ /*
+ * We can't round up anymore, just give back
+ * what we're asked for.
+ *
+ * The caller has to take care of the ENOSPC handling.
+ */
+ return size;
+ }
/* limit size in order to avoid using up huge amounts of memory for
* in memory tdbs if an oddball huge record creeps in */
if (size > 100 * 1024) {
- top_size = map_size + size * 2;
+ increment = size * 2;
} else {
- top_size = map_size + size * 100;
+ increment = size * 100;
+ }
+ if (increment < size) {
+ goto overflow;
+ }
+
+ if (!tdb_add_off_t(map_size, increment, &top_size)) {
+ goto overflow;
}
/* always make room for at least top_size more records, and at
} else {
new_size = map_size * 1.25;
}
+ if (new_size < map_size) {
+ goto overflow;
+ }
/* Round the database up to a multiple of the page size */
new_size = MAX(top_size, new_size);
+
+ if (new_size + page_size < new_size) {
+ /* There's a "+" in TDB_ALIGN that might overflow... */
+ goto overflow;
+ }
+
return TDB_ALIGN(new_size, page_size) - map_size;
+
+overflow:
+ /*
+ * Somewhere in between we went over 4GB. Make one big jump to
+ * exactly 4GB database size.
+ */
+ return max_size;
}
/* expand the database at least size bytes by expanding the underlying
{
struct tdb_record rec;
tdb_off_t offset;
+ tdb_off_t new_size;
if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
/* must know about any previous expansions by another process */
tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
+ /*
+ * Note that we don't care about tdb->hdr_ofs != 0 here.
+ *
+ * The 4GB limitation is just related to tdb->map_size
+ * and the offset calculation in the records.
+ *
+ * The file on disk can be up to 4GB + tdb->hdr_ofs
+ */
size = tdb_expand_adjust(tdb->map_size, size, tdb->page_size);
- /* expand the file itself */
- if (!(tdb->flags & TDB_INTERNAL)) {
- if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0)
- goto fail;
+ if (!tdb_add_off_t(tdb->map_size, size, &new_size)) {
+ tdb->ecode = TDB_ERR_OOM;
+ TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_expand "
+ "overflow detected current map_size[%u] size[%u]!\n",
+ (unsigned)tdb->map_size, (unsigned)size));
+ goto fail;
}
/* form a new freelist record */
rec.rec_len = size - sizeof(rec);
if (tdb->flags & TDB_INTERNAL) {
- char *new_map_ptr = (char *)realloc(tdb->map_ptr,
- tdb->map_size + size);
+ char *new_map_ptr;
+
+ new_map_ptr = (char *)realloc(tdb->map_ptr, new_size);
if (!new_map_ptr) {
+ tdb->ecode = TDB_ERR_OOM;
goto fail;
}
tdb->map_ptr = new_map_ptr;
- tdb->map_size += size;
+ tdb->map_size = new_size;
} else {
+ int ret;
+
+ /*
+ * expand the file itself
+ */
+ ret = tdb->methods->tdb_expand_file(tdb, tdb->map_size, size);
+ if (ret != 0) {
+ goto fail;
+ }
+
/* Explicitly remap: if we're in a transaction, this won't
* happen automatically! */
tdb_munmap(tdb);
- tdb->map_size += size;
+ tdb->map_size = new_size;
if (tdb_mmap(tdb) != 0) {
goto fail;
}
if (!(buf = (unsigned char *)malloc(len ? len : 1))) {
/* Ensure ecode is set for log fn. */
tdb->ecode = TDB_ERR_OOM;
- TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%d (%s)\n",
+ TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%u (%s)\n",
len, strerror(errno)));
return NULL;
}
/* read/write a record
 *
 * tdb_rec_read() loads the record header stored at 'offset' into *rec
 * through tdb->methods->tdb_read() and sanity-checks it: the magic
 * must be valid, key_len + data_len must neither wrap nor exceed
 * rec_len, and tdb_oob() must accept key_len, data_len and rec_len
 * bytes at 'offset'. Returns -1 as soon as one of these steps fails;
 * otherwise returns the result of a final tdb_oob() check for a
 * record header at rec->next.
 */
int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
{
+ int ret;
+ tdb_len_t overall_len;
+
if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
return -1;
if (TDB_BAD_MAGIC(rec)) {
/* Ensure ecode is set for log fn. */
tdb->ecode = TDB_ERR_CORRUPT;
- TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
+ TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%u\n", rec->magic, offset));
+ return -1;
+ }
+
+ overall_len = rec->key_len + rec->data_len;
+ if (overall_len < rec->data_len) {
+ /*
+ * overflow: the sum of key_len and data_len wrapped around.
+ * NOTE(review): unlike the bad-magic case above, tdb->ecode
+ * is not set before returning here.
+ */
return -1;
}
+
+ if (overall_len > rec->rec_len) {
+ /*
+ * invalid record: key and data together claim more bytes
+ * than rec_len. NOTE(review): tdb->ecode is not set on this
+ * path either.
+ */
+ return -1;
+ }
+
+ ret = tdb->methods->tdb_oob(tdb, offset, rec->key_len, 1);
+ if (ret == -1) {
+ return -1;
+ }
+ ret = tdb->methods->tdb_oob(tdb, offset, rec->data_len, 1);
+ if (ret == -1) {
+ return -1;
+ }
+ ret = tdb->methods->tdb_oob(tdb, offset, rec->rec_len, 1);
+ if (ret == -1) {
+ return -1;
+ }
+
return tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 0);
}