/*
   Unix SMB/CIFS implementation.

   trivial database library

   Copyright (C) Rusty Russell		   2009

   ** NOTE! The following LGPL license applies to the tdb
   ** library. This does NOT imply that all of Samba is released
   ** under the LGPL

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "tdb_private.h"
27 /* Since we opened it, these shouldn't fail unless it's recent corruption. */
28 static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery)
30 struct tdb_header hdr;
33 if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1)
35 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
39 if (hdr.version != TDB_VERSION)
42 if (hdr.rwlocks != 0 &&
43 hdr.rwlocks != TDB_FEATURE_FLAG_MAGIC &&
44 hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC)
47 tdb_header_hash(tdb, &h1, &h2);
48 if (hdr.magic1_hash && hdr.magic2_hash &&
49 (hdr.magic1_hash != h1 || hdr.magic2_hash != h2))
52 if (hdr.hash_size == 0)
55 if (hdr.hash_size != tdb->hash_size)
58 if (hdr.recovery_start != 0 &&
59 hdr.recovery_start < TDB_DATA_START(tdb->hash_size))
62 *recovery = hdr.recovery_start;
66 tdb->ecode = TDB_ERR_CORRUPT;
67 TDB_LOG((tdb, TDB_DEBUG_ERROR, "Header is corrupt\n"));
71 /* Generic record header check. */
72 static bool tdb_check_record(struct tdb_context *tdb,
74 const struct tdb_record *rec)
78 /* Check rec->next: 0 or points to record offset, aligned. */
79 if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->hash_size)){
80 TDB_LOG((tdb, TDB_DEBUG_ERROR,
81 "Record offset %u too small next %u\n",
85 if (rec->next + sizeof(*rec) < rec->next) {
86 TDB_LOG((tdb, TDB_DEBUG_ERROR,
87 "Record offset %u too large next %u\n",
91 if ((rec->next % TDB_ALIGNMENT) != 0) {
92 TDB_LOG((tdb, TDB_DEBUG_ERROR,
93 "Record offset %u misaligned next %u\n",
97 if (tdb_oob(tdb, rec->next, sizeof(*rec), 0))
100 /* Check rec_len: similar to rec->next, implies next record. */
101 if ((rec->rec_len % TDB_ALIGNMENT) != 0) {
102 TDB_LOG((tdb, TDB_DEBUG_ERROR,
103 "Record offset %u misaligned length %u\n",
107 /* Must fit tailer. */
108 if (rec->rec_len < sizeof(tailer)) {
109 TDB_LOG((tdb, TDB_DEBUG_ERROR,
110 "Record offset %u too short length %u\n",
114 /* OOB allows "right at the end" access, so this works for last rec. */
115 if (tdb_oob(tdb, off, sizeof(*rec)+rec->rec_len, 0))
119 if (tdb_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
122 if (tailer != sizeof(*rec) + rec->rec_len) {
123 TDB_LOG((tdb, TDB_DEBUG_ERROR,
124 "Record offset %u invalid tailer\n", off));
131 tdb->ecode = TDB_ERR_CORRUPT;
135 /* Grab some bytes: may copy if can't use mmap.
136 Caller has already done bounds check. */
137 static TDB_DATA get_bytes(struct tdb_context *tdb,
138 tdb_off_t off, tdb_len_t len)
144 if (tdb->transaction == NULL && tdb->map_ptr != NULL)
145 d.dptr = (unsigned char *)tdb->map_ptr + off;
147 d.dptr = tdb_alloc_read(tdb, off, d.dsize);
151 /* Frees data if we're not able to simply use mmap. */
152 static void put_bytes(struct tdb_context *tdb, TDB_DATA d)
154 if (tdb->transaction == NULL && tdb->map_ptr != NULL)
/* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
 * See: http://burtleburtle.net/bob/c/lookup3.c
 */
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
/* Hash one 32-bit word.  *pc/*pb are the two seeds on entry and the two
 * result words on exit, so successive calls chain to yield a stream of
 * hash values from one key. */
static void hash(uint32_t key, uint32_t *pc, uint32_t *pb)
{
	uint32_t a, b, c;

	/* Set up the internal state */
	a = b = c = 0xdeadbeef + *pc;
	c += *pb;

	/* Mix the key in, then run lookup3's final() scramble. */
	a += key;
	c ^= b; c -= rot(b,14);
	a ^= c; a -= rot(c,11);
	b ^= a; b -= rot(a,25);
	c ^= b; c -= rot(b,16);
	a ^= c; a -= rot(c,4);
	b ^= a; b -= rot(a,14);
	c ^= b; c -= rot(b,24);

	/* Report two of the three mixed words back to the caller. */
	*pc = c;
	*pb = b;
}
/* Check the integrity of the whole database.

   We want to check that all free records are in the free list
   (only once), and all free list entries are free records.  Similarly
   for each hash chain of used records.

   Doing that naively (without walking hash chains, since we want to be
   linear) means keeping a list of records which have been seen in each
   hash chain, and another of records pointed to (ie. next pointers
   from records and the initial hash chain heads).  These two lists
   should be equal.  This will take 8 bytes per record, and require
   sorting at the end.

   So instead, we record each offset in a bitmap such a way that
   recording it twice will cancel out.  Since each offset should appear
   exactly twice, the bitmap should be zero at the end.

   The approach was inspired by Bloom Filters (see Wikipedia).  For
   each value, we flip K bits in a bitmap of size N.  The number of
   distinct arrangements is:

	N! / (K! * (N-K)!)

   Of course, not all arrangements are actually distinct, but testing
   shows this formula to be close enough.

   So, if K == 8 and N == 256, the probability of two things flipping the same
   bits is 1 in 409,663,695,276,000.

   Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
   (320k) seems reasonable.
*/
#define BITMAP_BITS 256

/* Toggle bit idx in the bitmap: flipping the same bit twice cancels out,
 * which is the property the whole-database cross-check relies on. */
static void bit_flip(unsigned char bits[], unsigned int idx)
{
	bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));
}
220 /* We record offsets in a bitmap for the particular chain it should be in. */
221 static void record_offset(unsigned char bits[], tdb_off_t off)
223 uint32_t h1 = off, h2 = 0;
226 /* We get two good hash values out of jhash2, so we use both. Then
227 * we keep going to produce further hash values. */
228 for (i = 0; i < NUM_HASHES / 2; i++) {
230 bit_flip(bits, h1 % BITMAP_BITS);
231 bit_flip(bits, h2 % BITMAP_BITS);
236 /* Check that an in-use record is valid. */
237 static bool tdb_check_used_record(struct tdb_context *tdb,
239 const struct tdb_record *rec,
240 unsigned char **hashes,
241 int (*check)(TDB_DATA, TDB_DATA, void *),
247 if (!tdb_check_record(tdb, off, rec))
250 /* key + data + tailer must fit in record */
252 len += rec->data_len;
253 if (len < rec->data_len) {
255 TDB_LOG((tdb, TDB_DEBUG_ERROR, "Record lengths overflow\n"));
258 len += sizeof(tdb_off_t);
259 if (len < sizeof(tdb_off_t)) {
261 TDB_LOG((tdb, TDB_DEBUG_ERROR, "Record lengths overflow\n"));
265 if (len > rec->rec_len) {
266 TDB_LOG((tdb, TDB_DEBUG_ERROR,
267 "Record offset %u too short for contents\n", off));
271 key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);
275 if (tdb->hash_fn(&key) != rec->full_hash) {
276 TDB_LOG((tdb, TDB_DEBUG_ERROR,
277 "Record offset %u has incorrect hash\n", off));
281 /* Mark this offset as a known value for this hash bucket. */
282 record_offset(hashes[BUCKET(rec->full_hash)+1], off);
283 /* And similarly if the next pointer is valid. */
285 record_offset(hashes[BUCKET(rec->full_hash)+1], rec->next);
287 /* If they supply a check function and this record isn't dead,
288 get data and feed it. */
289 if (check && rec->magic != TDB_DEAD_MAGIC) {
290 data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
295 if (check(key, data, private_data) == -1)
297 put_bytes(tdb, data);
304 put_bytes(tdb, data);
310 /* Check that an unused record is valid. */
311 static bool tdb_check_free_record(struct tdb_context *tdb,
313 const struct tdb_record *rec,
314 unsigned char **hashes)
316 if (!tdb_check_record(tdb, off, rec))
319 /* Mark this offset as a known value for the free list. */
320 record_offset(hashes[0], off);
321 /* And similarly if the next pointer is valid. */
323 record_offset(hashes[0], rec->next);
327 /* Slow, but should be very rare. */
328 size_t tdb_dead_space(struct tdb_context *tdb, tdb_off_t off)
332 for (len = 0; off + len < tdb->map_size; len++) {
334 if (tdb->methods->tdb_read(tdb, off, &c, 1, 0))
336 if (c != 0 && c != 0x42)
342 _PUBLIC_ int tdb_check(struct tdb_context *tdb,
343 int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
347 unsigned char **hashes;
348 tdb_off_t off, recovery_start;
349 struct tdb_record rec;
350 bool found_recovery = false;
354 /* Read-only databases use no locking at all: it's best-effort.
355 * We may have a write lock already, so skip that case too. */
356 if (tdb->read_only || tdb->allrecord_lock.count != 0) {
359 if (tdb_lockall_read(tdb) == -1)
364 /* Make sure we know true size of the underlying file. */
365 tdb_oob(tdb, tdb->map_size, 1, 1);
367 /* Header must be OK: also gets us the recovery ptr, if any. */
368 if (!tdb_check_header(tdb, &recovery_start))
371 /* We should have the whole header, too. */
372 if (tdb->map_size < TDB_DATA_START(tdb->hash_size)) {
373 tdb->ecode = TDB_ERR_CORRUPT;
374 TDB_LOG((tdb, TDB_DEBUG_ERROR, "File too short for hashes\n"));
378 /* One big malloc: pointers then bit arrays. */
379 hashes = (unsigned char **)calloc(
380 1, sizeof(hashes[0]) * (1+tdb->hash_size)
381 + BITMAP_BITS / CHAR_BIT * (1+tdb->hash_size));
383 tdb->ecode = TDB_ERR_OOM;
387 /* Initialize pointers */
388 hashes[0] = (unsigned char *)(&hashes[1+tdb->hash_size]);
389 for (h = 1; h < 1+tdb->hash_size; h++)
390 hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;
392 /* Freelist and hash headers are all in a row: read them. */
393 for (h = 0; h < 1+tdb->hash_size; h++) {
394 if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
398 record_offset(hashes[h], off);
401 /* For each record, read it in and check it's ok. */
402 for (off = TDB_DATA_START(tdb->hash_size);
404 off += sizeof(rec) + rec.rec_len) {
405 if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
411 if (!tdb_check_used_record(tdb, off, &rec, hashes,
412 check, private_data))
416 if (!tdb_check_free_record(tdb, off, &rec, hashes))
419 /* If we crash after ftruncate, we can get zeroes or fill. */
420 case TDB_RECOVERY_INVALID_MAGIC:
422 if (recovery_start == off) {
423 found_recovery = true;
426 dead = tdb_dead_space(tdb, off);
427 if (dead < sizeof(rec))
430 TDB_LOG((tdb, TDB_DEBUG_ERROR,
431 "Dead space at %u-%u (of %u)\n",
432 off, off + dead, tdb->map_size));
433 rec.rec_len = dead - sizeof(rec);
435 case TDB_RECOVERY_MAGIC:
436 if (recovery_start != off) {
437 TDB_LOG((tdb, TDB_DEBUG_ERROR,
438 "Unexpected recovery record at offset %u\n",
442 found_recovery = true;
446 tdb->ecode = TDB_ERR_CORRUPT;
447 TDB_LOG((tdb, TDB_DEBUG_ERROR,
448 "Bad magic 0x%x at offset %u\n",
454 /* Now, hashes should all be empty: each record exists and is referred
455 * to by one other. */
456 for (h = 0; h < 1+tdb->hash_size; h++) {
458 for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
459 if (hashes[h][i] != 0) {
460 tdb->ecode = TDB_ERR_CORRUPT;
461 TDB_LOG((tdb, TDB_DEBUG_ERROR,
462 "Hashes do not match records\n"));
468 /* We must have found recovery area if there was one. */
469 if (recovery_start != 0 && !found_recovery) {
470 TDB_LOG((tdb, TDB_DEBUG_ERROR,
471 "Expected a recovery area at %u\n",
478 tdb_unlockall_read(tdb);
486 tdb_unlockall_read(tdb);