/*
   Trivial Database 2: hash handling
   Copyright (C) Rusty Russell 2010

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "private.h"
#include <ccan/hash/hash.h>
/* Default hash function: CCAN's stable Jenkins hash of the key bytes.
 *
 * key/length: the bytes to hash; seed: per-database hash seed.
 * unused: extra hash_data pointer (ignored by the default function,
 * present so this matches the ntdb hash-function signature). */
uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
			   void *unused)
{
	return hash_stable((const unsigned char *)key, length, seed);
}
28 uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
30 return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data);
33 static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
34 const struct ntdb_used_record *rec,
38 ntdb_bool_err ret = false;
41 if (rec_key_length(rec) != key->dsize) {
42 ntdb->stats.compare_wrong_keylen++;
46 rkey = ntdb_access_read(ntdb, off + sizeof(*rec), key->dsize, false);
47 if (NTDB_PTR_IS_ERR(rkey)) {
48 return (ntdb_bool_err)NTDB_PTR_ERR(rkey);
50 if (memcmp(rkey, key->dptr, key->dsize) == 0)
53 ntdb->stats.compare_wrong_keycmp++;
54 ntdb_access_release(ntdb, rkey);
58 /* Does entry match? */
59 static ntdb_bool_err match(struct ntdb_context *ntdb,
63 struct ntdb_used_record *rec,
64 const ntdb_off_t **mapped)
67 enum NTDB_ERROR ecode;
69 ntdb->stats.compares++;
71 /* Top bits of offset == next bits of hash. */
72 if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
73 != bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) {
74 ntdb->stats.compare_wrong_offsetbits++;
78 /* Unmap before we try to read actual record, which may cause expand */
80 ntdb_access_release(ntdb, *mapped);
84 off = val & NTDB_OFF_MASK;
85 ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
86 if (ecode != NTDB_SUCCESS) {
87 return (ntdb_bool_err)ecode;
90 return key_matches(ntdb, rec, off, key);
93 static bool is_chain(ntdb_off_t val)
95 return val & (1ULL << NTDB_OFF_CHAIN_BIT);
98 static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx)
100 return base + sizeof(struct ntdb_used_record)
101 + idx * sizeof(ntdb_off_t);
104 /* This is the core routine which searches the hashtable for an entry.
105 * On error, no locks are held and -ve is returned.
106 * Otherwise, hinfo is filled in.
107 * If not found, the return value is 0.
108 * If found, the return value is the offset, and *rec is the record. */
109 ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
113 struct ntdb_used_record *rec)
116 const ntdb_off_t *arr = NULL;
119 enum NTDB_ERROR ecode;
120 struct ntdb_used_record chdr;
123 h->h = ntdb_hash(ntdb, key.dptr, key.dsize);
125 h->table = NTDB_HASH_OFFSET;
126 h->table_size = 1 << ntdb->hash_bits;
127 h->bucket = bits_from(h->h, 0, ntdb->hash_bits);
130 ecode = ntdb_lock_hash(ntdb, h->bucket, ltype);
131 if (ecode != NTDB_SUCCESS) {
132 return NTDB_ERR_TO_OFF(ecode);
135 off = hbucket_off(h->table, h->bucket);
136 val = ntdb_read_off(ntdb, off);
137 if (NTDB_OFF_IS_ERR(val)) {
138 ecode = NTDB_OFF_TO_ERR(val);
142 /* Directly in hash table? */
143 if (!is_chain(val)) {
145 berr = match(ntdb, h->h, &key, val, rec, NULL);
147 ecode = NTDB_OFF_TO_ERR(berr);
151 return val & NTDB_OFF_MASK;
153 /* If you want to insert here, make a chain. */
159 /* Nope? Iterate through chain. */
160 h->table = val & NTDB_OFF_MASK;
162 ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
163 if (ecode != NTDB_SUCCESS) {
167 if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
168 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
171 " corrupt record %#x at %llu",
172 rec_magic(&chdr), (long long)off);
176 h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
179 for (i = 0; i < h->table_size; i++) {
180 /* Careful! match has to unmap this if we access a
181 * record (may cause mmap of database to move. */
183 arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
184 rec_data_length(&chdr), true);
185 if (NTDB_PTR_IS_ERR(arr)) {
186 ecode = NTDB_PTR_ERR(arr);
198 berr = match(ntdb, h->h, &key, val, rec, &arr);
200 ecode = NTDB_OFF_TO_ERR(berr);
202 ntdb_access_release(ntdb, arr);
209 off = val & NTDB_OFF_MASK;
211 ntdb_access_release(ntdb, arr);
218 /* Set to any non-zero value */
224 ntdb_access_release(ntdb, arr);
229 ntdb_unlock_hash(ntdb, h->bucket, ltype);
230 return NTDB_ERR_TO_OFF(ecode);
233 static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
234 ntdb_off_t new_off, uint32_t hash)
238 assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
239 assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
240 /* We pack extra hash bits into the upper bits of the offset. */
241 extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
242 extra <<= (64 - NTDB_OFF_UPPER_STEAL);
244 return new_off | extra;
247 /* Simply overwrite the hash entry we found before. */
248 enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
249 const struct hash_info *h,
252 return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket),
253 encode_offset(ntdb, new_off, h->h));
256 enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
257 const struct hash_info *h)
259 return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0);
263 enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
264 const struct hash_info *h,
267 enum NTDB_ERROR ecode;
269 struct ntdb_used_record chdr;
270 const ntdb_off_t *old;
273 /* We hit an empty bucket during search? That's where it goes. */
275 return replace_in_hash(ntdb, h, new_off);
278 /* Full at top-level? Create a 2-element chain. */
279 if (h->table == NTDB_HASH_OFFSET) {
282 /* One element is old value, the other is the new value. */
283 pair[0] = h->old_val;
284 pair[1] = encode_offset(ntdb, new_off, h->h);
286 chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true);
287 if (NTDB_OFF_IS_ERR(chain)) {
288 return NTDB_OFF_TO_ERR(chain);
290 ecode = ntdb_write_convert(ntdb,
292 + sizeof(struct ntdb_used_record),
294 if (ecode == NTDB_SUCCESS) {
295 ecode = ntdb_write_off(ntdb,
296 hbucket_off(h->table, h->bucket),
298 | (1ULL << NTDB_OFF_CHAIN_BIT));
303 /* Full bucket. Expand. */
304 ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
305 if (ecode != NTDB_SUCCESS) {
309 if (rec_extra_padding(&chdr) >= sizeof(new_off)) {
310 /* Expand in place. */
311 uint64_t dlen = rec_data_length(&chdr);
313 ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0,
314 dlen + sizeof(new_off),
315 dlen + rec_extra_padding(&chdr));
317 if (ecode != NTDB_SUCCESS) {
320 /* find_and_lock set up h to point to last bucket. */
321 ecode = replace_in_hash(ntdb, h, new_off);
322 if (ecode != NTDB_SUCCESS) {
325 ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr));
326 if (ecode != NTDB_SUCCESS) {
329 /* For futureproofing, we always make the first byte of padding
331 if (rec_extra_padding(&chdr)) {
332 ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr)
333 + dlen + sizeof(new_off),
339 /* We need to reallocate the chain. */
340 chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t),
341 NTDB_CHAIN_MAGIC, true);
342 if (NTDB_OFF_IS_ERR(chain)) {
343 return NTDB_OFF_TO_ERR(chain);
346 /* Map both and copy across old buckets. */
347 old = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
348 h->table_size*sizeof(ntdb_off_t), true);
349 if (NTDB_PTR_IS_ERR(old)) {
350 return NTDB_PTR_ERR(old);
352 new = ntdb_access_write(ntdb, hbucket_off(chain, 0),
353 (h->table_size + 1)*sizeof(ntdb_off_t), true);
354 if (NTDB_PTR_IS_ERR(new)) {
355 ntdb_access_release(ntdb, old);
356 return NTDB_PTR_ERR(new);
359 memcpy(new, old, h->bucket * sizeof(ntdb_off_t));
360 new[h->bucket] = encode_offset(ntdb, new_off, h->h);
361 ntdb_access_release(ntdb, old);
363 ecode = ntdb_access_commit(ntdb, new);
364 if (ecode != NTDB_SUCCESS) {
368 /* Free the old chain. */
369 ecode = add_free_record(ntdb, h->table,
370 sizeof(struct ntdb_used_record)
371 + rec_data_length(&chdr)
372 + rec_extra_padding(&chdr),
373 NTDB_LOCK_WAIT, true);
375 /* Replace top-level to point to new chain */
376 return ntdb_write_off(ntdb,
377 hbucket_off(NTDB_HASH_OFFSET,
378 bits_from(h->h, 0, ntdb->hash_bits)),
379 chain | (1ULL << NTDB_OFF_CHAIN_BIT));
382 /* Traverse support: returns offset of record, or 0 or -ve error. */
383 static ntdb_off_t iterate_chain(struct ntdb_context *ntdb,
388 enum NTDB_ERROR ecode;
389 struct ntdb_used_record chdr;
391 /* First load up chain header. */
392 h->table = val & NTDB_OFF_MASK;
393 ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
394 if (ecode != NTDB_SUCCESS) {
398 if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
399 return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
402 " corrupt record %#x at %llu",
404 (long long)h->table);
407 /* Chain length is implied by data length. */
408 h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
410 i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket,
412 if (NTDB_OFF_IS_ERR(i)) {
416 if (i != h->table_size) {
417 /* Return to next bucket. */
419 val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
420 if (NTDB_OFF_IS_ERR(val)) {
423 return val & NTDB_OFF_MASK;
426 /* Go back up to hash table. */
427 h->table = NTDB_HASH_OFFSET;
428 h->table_size = 1 << ntdb->hash_bits;
429 h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1;
433 /* Keeps hash locked unless returns 0 or error. */
434 static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb,
438 enum NTDB_ERROR ecode;
440 if (h->table != NTDB_HASH_OFFSET) {
441 /* We're in a chain. */
442 i = bits_from(h->h, 0, ntdb->hash_bits);
443 ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
444 if (ecode != NTDB_SUCCESS) {
445 return NTDB_ERR_TO_OFF(ecode);
448 /* We dropped lock, bucket might have moved! */
449 val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i));
450 if (NTDB_OFF_IS_ERR(val)) {
454 /* We don't remove chains: there should still be one there! */
455 if (!val || !is_chain(val)) {
456 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
459 " vanished hchain %llu at %llu",
462 val = NTDB_ERR_TO_OFF(ecode);
466 /* Find next bucket in the chain. */
467 val = iterate_chain(ntdb, val, h);
468 if (NTDB_OFF_IS_ERR(val)) {
474 ntdb_unlock_hash(ntdb, i, F_RDLCK);
476 /* OK, we've reset h back to top level. */
479 /* We do this unlocked, then re-check. */
480 for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
481 h->bucket, h->table_size);
483 i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
484 i+1, h->table_size)) {
485 ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
486 if (ecode != NTDB_SUCCESS) {
487 return NTDB_ERR_TO_OFF(ecode);
490 val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
491 if (NTDB_OFF_IS_ERR(val)) {
495 /* Lost race, and it's empty? */
497 ntdb->stats.traverse_val_vanished++;
498 ntdb_unlock_hash(ntdb, i, F_RDLCK);
502 if (!is_chain(val)) {
503 /* So caller knows what lock to free. */
505 /* Return to next bucket. */
507 val &= NTDB_OFF_MASK;
511 /* Start at beginning of chain */
515 val = iterate_chain(ntdb, val, h);
516 if (NTDB_OFF_IS_ERR(val)) {
523 /* Otherwise, bucket has been set to i+1 */
524 ntdb_unlock_hash(ntdb, i, F_RDLCK);
529 ntdb_unlock_hash(ntdb, i, F_RDLCK);
533 /* Return success if we find something, NTDB_ERR_NOEXIST if none. */
534 enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
536 NTDB_DATA *kbuf, size_t *dlen)
539 struct ntdb_used_record rec;
540 enum NTDB_ERROR ecode;
542 off = lock_and_iterate_hash(ntdb, h);
544 if (NTDB_OFF_IS_ERR(off)) {
545 return NTDB_OFF_TO_ERR(off);
546 } else if (off == 0) {
547 return NTDB_ERR_NOEXIST;
550 /* The hash for this key is still locked. */
551 ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
552 if (ecode != NTDB_SUCCESS) {
555 if (rec_magic(&rec) != NTDB_USED_MAGIC) {
556 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
559 " corrupt record at %llu",
564 kbuf->dsize = rec_key_length(&rec);
566 /* They want data as well? */
568 *dlen = rec_data_length(&rec);
569 kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
570 kbuf->dsize + *dlen);
572 kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
575 if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
576 ecode = NTDB_PTR_ERR(kbuf->dptr);
579 ecode = NTDB_SUCCESS;
582 ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK);
587 enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
589 NTDB_DATA *kbuf, size_t *dlen)
591 h->table = NTDB_HASH_OFFSET;
592 h->table_size = 1 << ntdb->hash_bits;
595 return next_in_hash(ntdb, h, kbuf, dlen);
598 /* Even if the entry isn't in this hash bucket, you'd have to lock this
599 * bucket to find it. */
600 static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb,
601 const NTDB_DATA *key, int ltype)
603 uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize);
605 return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype);
608 /* lock/unlock one hash chain. This is meant to be used to reduce
609 contention - it cannot guarantee how many records will be locked */
610 _PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
612 return chainlock(ntdb, &key, F_WRLCK);
615 _PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
617 uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
619 ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK);
622 _PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb,
625 return chainlock(ntdb, &key, F_RDLCK);
628 _PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
630 uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
632 ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK);