X-Git-Url: http://git.samba.org/?p=samba.git;a=blobdiff_plain;f=lib%2Ftdb%2Fcommon%2Fio.c;h=94b316331c123396581722c47dd2b4faee42313b;hp=a2db3bf4bd81d45f40c42bc6c51c5f2fb31cc159;hb=df2a036377ad68a999cbccd6e2ba813fa48e7cb9;hpb=b64494535dc62f4073fc6302847593ed6e6ec38b diff --git a/lib/tdb/common/io.c b/lib/tdb/common/io.c index a2db3bf4bd8..94b316331c1 100644 --- a/lib/tdb/common/io.c +++ b/lib/tdb/common/io.c @@ -1,4 +1,4 @@ - /* + /* Unix SMB/CIFS implementation. trivial database library @@ -28,9 +28,113 @@ #include "tdb_private.h" +/* + * We prepend the mutex area, so fixup offsets. See mutex.c for details. + * tdb->hdr_ofs is 0 or header.mutex_size. + * + * Note: that we only have the 4GB limit of tdb_off_t for + * tdb->map_size. The file size on disk can be 4GB + tdb->hdr_ofs! + */ + +static bool tdb_adjust_offset(struct tdb_context *tdb, off_t *off) +{ + off_t tmp = tdb->hdr_ofs + *off; + + if ((tmp < tdb->hdr_ofs) || (tmp < *off)) { + errno = EIO; + return false; + } + + *off = tmp; + return true; +} + +static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf, + size_t count, off_t offset) +{ + ssize_t ret; + + if (!tdb_adjust_offset(tdb, &offset)) { + return -1; + } + + do { + ret = pwrite(tdb->fd, buf, count, offset); + } while ((ret == -1) && (errno == EINTR)); + + return ret; +} + +static ssize_t tdb_pread(struct tdb_context *tdb, void *buf, + size_t count, off_t offset) +{ + ssize_t ret; + + if (!tdb_adjust_offset(tdb, &offset)) { + return -1; + } + + do { + ret = pread(tdb->fd, buf, count, offset); + } while ((ret == -1) && (errno == EINTR)); + + return ret; +} + +static int tdb_ftruncate(struct tdb_context *tdb, off_t length) +{ + ssize_t ret; + + if (!tdb_adjust_offset(tdb, &length)) { + return -1; + } + + do { + ret = ftruncate(tdb->fd, length); + } while ((ret == -1) && (errno == EINTR)); + + return ret; +} + +#if HAVE_POSIX_FALLOCATE +static int tdb_posix_fallocate(struct tdb_context *tdb, off_t offset, + off_t len) +{ + ssize_t ret; + + if (!tdb_adjust_offset(tdb, &offset)) { + return -1; + } + + do { + ret = posix_fallocate(tdb->fd, offset, len); + } while ((ret == -1) && (errno == EINTR)); + + return ret; +} +#endif + +static int tdb_fstat(struct tdb_context *tdb, struct stat *buf) +{ + int ret; + + ret = fstat(tdb->fd, buf); + if (ret == -1) { + return -1; + } + + if (buf->st_size < tdb->hdr_ofs) { + errno = EIO; + return -1; + } + buf->st_size -= tdb->hdr_ofs; + + return ret; +} + /* check for an out of bounds access - if it is out of bounds then see if the database has been expanded by someone else and expand - if necessary + if necessary */ static int tdb_oob(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len, int probe) @@ -40,8 +144,8 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len, if (!probe) { /* Ensure ecode is set for log fn. */ tdb->ecode = TDB_ERR_IO; - TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob off %d len %d wrap\n", - (int)off, (int)len)); + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob off %u len %u wrap\n", + off, len)); } return -1; } @@ -58,21 +162,11 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len, return -1; } - if (fstat(tdb->fd, &st) == -1) { + if (tdb_fstat(tdb, &st) == -1) { tdb->ecode = TDB_ERR_IO; return -1; } - if (st.st_size < (size_t)off + len) { - if (!probe) { - /* Ensure ecode is set for log fn. */ - tdb->ecode = TDB_ERR_IO; - TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond eof at %u\n", - (int)(off + len), (int)st.st_size)); - } - return -1; - } - /* Beware >4G files! */ if ((tdb_off_t)st.st_size != st.st_size) { /* Ensure ecode is set for log fn. */ @@ -82,18 +176,35 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len, return -1; } - /* Unmap, update size, remap */ + /* Unmap, update size, remap. We do this unconditionally, to handle + * the unusual case where the db is truncated. + * + * This can happen to a child using tdb_reopen_all(true) on a + * TDB_CLEAR_IF_FIRST tdb whose parent crashes: the next + * opener will truncate the database. */ if (tdb_munmap(tdb) == -1) { tdb->ecode = TDB_ERR_IO; return -1; } tdb->map_size = st.st_size; - tdb_mmap(tdb); + if (tdb_mmap(tdb) != 0) { + return -1; + } + + if (st.st_size < (size_t)off + len) { + if (!probe) { + /* Ensure ecode is set for log fn. */ + tdb->ecode = TDB_ERR_IO; + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond eof at %u\n", + (int)(off + len), (int)st.st_size)); + } + return -1; + } return 0; } /* write a lump of data at a specified offset */ -static int tdb_write(struct tdb_context *tdb, tdb_off_t off, +static int tdb_write(struct tdb_context *tdb, tdb_off_t off, const void *buf, tdb_len_t len) { if (len == 0) { @@ -111,30 +222,37 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off, if (tdb->map_ptr) { memcpy(off + (char *)tdb->map_ptr, buf, len); } else { - ssize_t written = pwrite(tdb->fd, buf, len, off); +#ifdef HAVE_INCOHERENT_MMAP + tdb->ecode = TDB_ERR_IO; + return -1; +#else + ssize_t written; + + written = tdb_pwrite(tdb, buf, len, off); + if ((written != (ssize_t)len) && (written != -1)) { /* try once more */ tdb->ecode = TDB_ERR_IO; TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only " - "%d of %d bytes at %d, trying once more\n", - (int)written, len, off)); - written = pwrite(tdb->fd, (const char *)buf+written, - len-written, - off+written); + "%zi of %u bytes at %u, trying once more\n", + written, len, off)); + written = tdb_pwrite(tdb, (const char *)buf+written, + len-written, off+written); } if (written == -1) { /* Ensure ecode is set for log fn. */ tdb->ecode = TDB_ERR_IO; - TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d " - "len=%d (%s)\n", off, len, strerror(errno))); + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %u " + "len=%u (%s)\n", off, len, strerror(errno))); return -1; } else if (written != (ssize_t)len) { tdb->ecode = TDB_ERR_IO; TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to " - "write %d bytes at %d in two attempts\n", + "write %u bytes at %u in two attempts\n", len, off)); return -1; } +#endif } return 0; } @@ -150,7 +268,7 @@ void *tdb_convert(void *buf, uint32_t size) /* read a lump of data at a specified offset, maybe convert */ -static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf, +static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf, tdb_len_t len, int cv) { if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0) { @@ -160,16 +278,23 @@ static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf, if (tdb->map_ptr) { memcpy(buf, off + (char *)tdb->map_ptr, len); } else { - ssize_t ret = pread(tdb->fd, buf, len, off); +#ifdef HAVE_INCOHERENT_MMAP + tdb->ecode = TDB_ERR_IO; + return -1; +#else + ssize_t ret; + + ret = tdb_pread(tdb, buf, len, off); if (ret != (ssize_t)len) { /* Ensure ecode is set for log fn. */ tdb->ecode = TDB_ERR_IO; - TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %d " - "len=%d ret=%d (%s) map_size=%d\n", - (int)off, (int)len, (int)ret, strerror(errno), - (int)tdb->map_size)); + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %u " + "len=%u ret=%zi (%s) map_size=%u\n", + off, len, ret, strerror(errno), + tdb->map_size)); return -1; } +#endif } if (cv) { tdb_convert(buf, len); @@ -182,19 +307,19 @@ static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf, /* do an unlocked scan of the hash table heads to find the next non-zero head. The value will then be confirmed with the lock held -*/ +*/ static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain) { uint32_t h = *chain; if (tdb->map_ptr) { - for (;h < tdb->header.hash_size;h++) { + for (;h < tdb->hash_size;h++) { if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) { break; } } } else { uint32_t off=0; - for (;h < tdb->header.hash_size;h++) { + for (;h < tdb->hash_size;h++) { if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) { break; } @@ -222,16 +347,27 @@ int tdb_munmap(struct tdb_context *tdb) return 0; } -void tdb_mmap(struct tdb_context *tdb) +/* If mmap isn't coherent, *everyone* must always mmap. */ +static bool should_mmap(const struct tdb_context *tdb) +{ +#ifdef HAVE_INCOHERENT_MMAP + return true; +#else + return !(tdb->flags & TDB_NOMMAP); +#endif +} + +int tdb_mmap(struct tdb_context *tdb) { if (tdb->flags & TDB_INTERNAL) - return; + return 0; #ifdef HAVE_MMAP - if (!(tdb->flags & TDB_NOMMAP)) { - tdb->map_ptr = mmap(NULL, tdb->map_size, - PROT_READ|(tdb->read_only? 0:PROT_WRITE), - MAP_SHARED|MAP_FILE, tdb->fd, 0); + if (should_mmap(tdb)) { + tdb->map_ptr = mmap(NULL, tdb->map_size, + PROT_READ|(tdb->read_only? 0:PROT_WRITE), + MAP_SHARED|MAP_FILE, tdb->fd, + tdb->hdr_ofs); /* * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!! @@ -239,8 +375,12 @@ void tdb_mmap(struct tdb_context *tdb) if (tdb->map_ptr == MAP_FAILED) { tdb->map_ptr = NULL; - TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %d (%s)\n", + TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %u (%s)\n", tdb->map_size, strerror(errno))); +#ifdef HAVE_INCOHERENT_MMAP + tdb->ecode = TDB_ERR_IO; + return -1; +#endif } } else { tdb->map_ptr = NULL; @@ -248,6 +388,7 @@ void tdb_mmap(struct tdb_context *tdb) #else tdb->map_ptr = NULL; #endif + return 0; } /* expand a file. we prefer to use ftruncate, as that is what posix @@ -255,26 +396,67 @@ void tdb_mmap(struct tdb_context *tdb) static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition) { char buf[8192]; + tdb_off_t new_size; + int ret; if (tdb->read_only || tdb->traverse_read) { tdb->ecode = TDB_ERR_RDONLY; return -1; } - if (ftruncate(tdb->fd, size+addition) == -1) { + if (!tdb_add_off_t(size, addition, &new_size)) { + tdb->ecode = TDB_ERR_OOM; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write " + "overflow detected current size[%u] addition[%u]!\n", + (unsigned)size, (unsigned)addition)); + errno = ENOSPC; + return -1; + } + +#if HAVE_POSIX_FALLOCATE + ret = tdb_posix_fallocate(tdb, size, addition); + if (ret == 0) { + return 0; + } + if (ret == ENOSPC) { + /* + * The Linux glibc (at least as of 2.24) fallback if + * the file system does not support fallocate does not + * reset the file size back to where it was. Also, to + * me it is unclear from the posix spec of + * posix_fallocate whether this is allowed or + * not. Better be safe than sorry and "goto fail" but + * "return -1" here, leaving the EOF pointer too + * large. + */ + goto fail; + } + + /* + * Retry the "old" way. Possibly unnecessary, but looking at + * our configure script there seem to be weird failure modes + * for posix_fallocate. See commit 3264a98ff16de, which + * probably refers to + * https://sourceware.org/bugzilla/show_bug.cgi?id=1083. + */ +#endif + + ret = tdb_ftruncate(tdb, new_size); + if (ret == -1) { char b = 0; - ssize_t written = pwrite(tdb->fd, &b, 1, (size+addition) - 1); + ssize_t written = tdb_pwrite(tdb, &b, 1, new_size - 1); if (written == 0) { /* try once more, potentially revealing errno */ - written = pwrite(tdb->fd, &b, 1, (size+addition) - 1); + written = tdb_pwrite(tdb, &b, 1, new_size - 1); } if (written == 0) { /* again - give up, guessing errno */ errno = ENOSPC; } if (written != 1) { - TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %d failed (%s)\n", - size+addition, strerror(errno))); + tdb->ecode = TDB_ERR_OOM; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %u failed (%s)\n", + (unsigned)new_size, strerror(errno))); return -1; } } @@ -285,112 +467,191 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t ad memset(buf, TDB_PAD_BYTE, sizeof(buf)); while (addition) { size_t n = addition>sizeof(buf)?sizeof(buf):addition; - ssize_t written = pwrite(tdb->fd, buf, n, size); + ssize_t written = tdb_pwrite(tdb, buf, n, size); if (written == 0) { /* prevent infinite loops: try _once_ more */ - written = pwrite(tdb->fd, buf, n, size); + written = tdb_pwrite(tdb, buf, n, size); } if (written == 0) { /* give up, trying to provide a useful errno */ + tdb->ecode = TDB_ERR_OOM; TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write " "returned 0 twice: giving up!\n")); errno = ENOSPC; - return -1; - } else if (written == -1) { + goto fail; + } + if (written == -1) { + tdb->ecode = TDB_ERR_OOM; TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of " - "%d bytes failed (%s)\n", (int)n, + "%u bytes failed (%s)\n", (int)n, strerror(errno))); - return -1; - } else if (written != n) { + goto fail; + } + if (written != n) { TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: wrote " - "only %d of %d bytes - retrying\n", (int)written, - (int)n)); + "only %zu of %zi bytes - retrying\n", written, + n)); } addition -= written; size += written; } return 0; + +fail: + { + int err = errno; + + /* + * We're holding the freelist lock or are inside a + * transaction. Cutting the file is safe, the space we + * tried to allocate can't have been used anywhere in + * the meantime. + */ + + ret = tdb_ftruncate(tdb, size); + if (ret == -1) { + TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: " + "retruncate to %ju failed\n", + (uintmax_t)size)); + } + errno = err; + } + + return -1; } -/* expand the database at least size bytes by expanding the underlying - file and doing the mmap again if necessary */ -int tdb_expand(struct tdb_context *tdb, tdb_off_t size) +/* You need 'size', this tells you how much you should expand by. */ +tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size, int page_size) { - struct tdb_record rec; - tdb_off_t offset, new_size, top_size, map_size; + tdb_off_t new_size, top_size, increment; + tdb_off_t max_size = UINT32_MAX - map_size; - if (tdb_lock(tdb, -1, F_WRLCK) == -1) { - TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n")); - return -1; + if (size > max_size) { + /* + * We can't round up anymore, just give back + * what we're asked for. + * + * The caller has to take care of the ENOSPC handling. + */ + return size; } - /* must know about any previous expansions by another process */ - tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1); - /* limit size in order to avoid using up huge amounts of memory for * in memory tdbs if an oddball huge record creeps in */ if (size > 100 * 1024) { - top_size = tdb->map_size + size * 2; + increment = size * 2; } else { - top_size = tdb->map_size + size * 100; + increment = size * 100; + } + if (increment < size) { + goto overflow; + } + + if (!tdb_add_off_t(map_size, increment, &top_size)) { + goto overflow; } /* always make room for at least top_size more records, and at least 25% more space. if the DB is smaller than 100MiB, otherwise grow it by 10% only. */ - if (tdb->map_size > 100 * 1024 * 1024) { - map_size = tdb->map_size * 1.10; + if (map_size > 100 * 1024 * 1024) { + new_size = map_size * 1.10; } else { - map_size = tdb->map_size * 1.25; + new_size = map_size * 1.25; + } + if (new_size < map_size) { + goto overflow; } /* Round the database up to a multiple of the page size */ - new_size = MAX(top_size, map_size); - size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size; + new_size = MAX(top_size, new_size); - if (!(tdb->flags & TDB_INTERNAL)) - tdb_munmap(tdb); + if (new_size + page_size < new_size) { + /* There's a "+" in TDB_ALIGN that might overflow... */ + goto overflow; + } + + return TDB_ALIGN(new_size, page_size) - map_size; +overflow: /* - * We must ensure the file is unmapped before doing this - * to ensure consistency with systems like OpenBSD where - * writes and mmaps are not consistent. + * Somewhere in between we went over 4GB. Make one big jump to + * exactly 4GB database size. */ + return max_size; +} - /* expand the file itself */ - if (!(tdb->flags & TDB_INTERNAL)) { - if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0) - goto fail; +/* expand the database at least size bytes by expanding the underlying + file and doing the mmap again if necessary */ +int tdb_expand(struct tdb_context *tdb, tdb_off_t size) +{ + struct tdb_record rec; + tdb_off_t offset; + tdb_off_t new_size; + + if (tdb_lock(tdb, -1, F_WRLCK) == -1) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n")); + return -1; + } + + /* must know about any previous expansions by another process */ + tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1); + + /* + * Note: that we don't care about tdb->hdr_ofs != 0 here + * + * The 4GB limitation is just related to tdb->map_size + * and the offset calculation in the records. + * + * The file on disk can be up to 4GB + tdb->hdr_ofs + */ + size = tdb_expand_adjust(tdb->map_size, size, tdb->page_size); + + if (!tdb_add_off_t(tdb->map_size, size, &new_size)) { + tdb->ecode = TDB_ERR_OOM; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_expand " + "overflow detected current map_size[%u] size[%u]!\n", + (unsigned)tdb->map_size, (unsigned)size)); + goto fail; } - tdb->map_size += size; + /* form a new freelist record */ + offset = tdb->map_size; + memset(&rec,'\0',sizeof(rec)); + rec.rec_len = size - sizeof(rec); if (tdb->flags & TDB_INTERNAL) { - char *new_map_ptr = (char *)realloc(tdb->map_ptr, - tdb->map_size); + char *new_map_ptr; + + new_map_ptr = (char *)realloc(tdb->map_ptr, new_size); if (!new_map_ptr) { - tdb->map_size -= size; + tdb->ecode = TDB_ERR_OOM; goto fail; } tdb->map_ptr = new_map_ptr; + tdb->map_size = new_size; } else { + int ret; + /* - * We must ensure the file is remapped before adding the space - * to ensure consistency with systems like OpenBSD where - * writes and mmaps are not consistent. + * expand the file itself */ + ret = tdb->methods->tdb_expand_file(tdb, tdb->map_size, size); + if (ret != 0) { + goto fail; + } - /* We're ok if the mmap fails as we'll fallback to read/write */ - tdb_mmap(tdb); + /* Explicitly remap: if we're in a transaction, this won't + * happen automatically! */ + tdb_munmap(tdb); + tdb->map_size = new_size; + if (tdb_mmap(tdb) != 0) { + goto fail; + } } - /* form a new freelist record */ - memset(&rec,'\0',sizeof(rec)); - rec.rec_len = size - sizeof(rec); - /* link it into the free list */ - offset = tdb->map_size - size; if (tdb_free(tdb, offset, &rec) == -1) goto fail; @@ -424,7 +685,7 @@ unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len if (!(buf = (unsigned char *)malloc(len ? len : 1))) { /* Ensure ecode is set for log fn. */ tdb->ecode = TDB_ERR_OOM; - TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%d (%s)\n", + TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%u (%s)\n", len, strerror(errno))); return NULL; } @@ -472,14 +733,42 @@ int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key, /* read/write a record */ int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec) { + int ret; + tdb_len_t overall_len; + if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1) return -1; if (TDB_BAD_MAGIC(rec)) { /* Ensure ecode is set for log fn. */ tdb->ecode = TDB_ERR_CORRUPT; - TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset)); + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%u\n", rec->magic, offset)); + return -1; + } + + overall_len = rec->key_len + rec->data_len; + if (overall_len < rec->data_len) { + /* overflow */ + return -1; + } + + if (overall_len > rec->rec_len) { + /* invalid record */ + return -1; + } + + ret = tdb->methods->tdb_oob(tdb, offset, rec->key_len, 1); + if (ret == -1) { + return -1; + } + ret = tdb->methods->tdb_oob(tdb, offset, rec->data_len, 1); + if (ret == -1) { return -1; } + ret = tdb->methods->tdb_oob(tdb, offset, rec->rec_len, 1); + if (ret == -1) { + return -1; + } + return tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 0); }