ntdb: remove hash table trees.
[kai/samba-autobuild/.git] / lib / ntdb / hash.c
1  /*
2    Trivial Database 2: hash handling
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/hash/hash.h>
20
21 /* Default hash function. */
22 uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
23                           void *unused)
24 {
25         return hash_stable((const unsigned char *)key, length, seed);
26 }
27
28 uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
29 {
30         return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data);
31 }
32
33 static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
34                                  const struct ntdb_used_record *rec,
35                                  ntdb_off_t off,
36                                  const NTDB_DATA *key)
37 {
38         ntdb_bool_err ret = false;
39         const char *rkey;
40
41         if (rec_key_length(rec) != key->dsize) {
42                 ntdb->stats.compare_wrong_keylen++;
43                 return ret;
44         }
45
46         rkey = ntdb_access_read(ntdb, off + sizeof(*rec), key->dsize, false);
47         if (NTDB_PTR_IS_ERR(rkey)) {
48                 return (ntdb_bool_err)NTDB_PTR_ERR(rkey);
49         }
50         if (memcmp(rkey, key->dptr, key->dsize) == 0)
51                 ret = true;
52         else
53                 ntdb->stats.compare_wrong_keycmp++;
54         ntdb_access_release(ntdb, rkey);
55         return ret;
56 }
57
58 /* Does entry match? */
59 static ntdb_bool_err match(struct ntdb_context *ntdb,
60                            uint32_t hash,
61                            const NTDB_DATA *key,
62                            ntdb_off_t val,
63                            struct ntdb_used_record *rec,
64                            const ntdb_off_t **mapped)
65 {
66         ntdb_off_t off;
67         enum NTDB_ERROR ecode;
68
69         ntdb->stats.compares++;
70
71         /* Top bits of offset == next bits of hash. */
72         if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
73             != bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) {
74                 ntdb->stats.compare_wrong_offsetbits++;
75                 return false;
76         }
77
78         /* Unmap before we try to read actual record, which may cause expand */
79         if (mapped) {
80                 ntdb_access_release(ntdb, *mapped);
81                 *mapped = NULL;
82         }
83
84         off = val & NTDB_OFF_MASK;
85         ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
86         if (ecode != NTDB_SUCCESS) {
87                 return (ntdb_bool_err)ecode;
88         }
89
90         return key_matches(ntdb, rec, off, key);
91 }
92
93 static bool is_chain(ntdb_off_t val)
94 {
95         return val & (1ULL << NTDB_OFF_CHAIN_BIT);
96 }
97
98 static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx)
99 {
100         return base + sizeof(struct ntdb_used_record)
101                 + idx * sizeof(ntdb_off_t);
102 }
103
104 /* This is the core routine which searches the hashtable for an entry.
105  * On error, no locks are held and -ve is returned.
106  * Otherwise, hinfo is filled in.
107  * If not found, the return value is 0.
108  * If found, the return value is the offset, and *rec is the record. */
109 ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
110                         NTDB_DATA key,
111                         int ltype,
112                         struct hash_info *h,
113                         struct ntdb_used_record *rec)
114 {
115         ntdb_off_t off, val;
116         const ntdb_off_t *arr = NULL;
117         ntdb_len_t i;
118         bool found_empty;
119         enum NTDB_ERROR ecode;
120         struct ntdb_used_record chdr;
121         ntdb_bool_err berr;
122
123         h->h = ntdb_hash(ntdb, key.dptr, key.dsize);
124
125         h->table = NTDB_HASH_OFFSET;
126         h->table_size = 1 << ntdb->hash_bits;
127         h->bucket = bits_from(h->h, 0, ntdb->hash_bits);
128         h->old_val = 0;
129
130         ecode = ntdb_lock_hash(ntdb, h->bucket, ltype);
131         if (ecode != NTDB_SUCCESS) {
132                 return NTDB_ERR_TO_OFF(ecode);
133         }
134
135         off = hbucket_off(h->table, h->bucket);
136         val = ntdb_read_off(ntdb, off);
137         if (NTDB_OFF_IS_ERR(val)) {
138                 ecode = NTDB_OFF_TO_ERR(val);
139                 goto fail;
140         }
141
142         /* Directly in hash table? */
143         if (!is_chain(val)) {
144                 if (val) {
145                         berr = match(ntdb, h->h, &key, val, rec, NULL);
146                         if (berr < 0) {
147                                 ecode = NTDB_OFF_TO_ERR(berr);
148                                 goto fail;
149                         }
150                         if (berr) {
151                                 return val & NTDB_OFF_MASK;
152                         }
153                         /* If you want to insert here, make a chain. */
154                         h->old_val = val;
155                 }
156                 return 0;
157         }
158
159         /* Nope?  Iterate through chain. */
160         h->table = val & NTDB_OFF_MASK;
161
162         ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
163         if (ecode != NTDB_SUCCESS) {
164                 goto fail;
165         }
166
167         if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
168                 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
169                                     NTDB_LOG_ERROR,
170                                     "find_and_lock:"
171                                     " corrupt record %#x at %llu",
172                                     rec_magic(&chdr), (long long)off);
173                 goto fail;
174         }
175
176         h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
177
178         found_empty = false;
179         for (i = 0; i < h->table_size; i++) {
180                 /* Careful!  match has to unmap this if we access a
181                  * record (may cause mmap of database to move. */
182                 if (!arr) {
183                         arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
184                                                rec_data_length(&chdr), true);
185                         if (NTDB_PTR_IS_ERR(arr)) {
186                                 ecode = NTDB_PTR_ERR(arr);
187                                 goto fail;
188                         }
189                 }
190
191                 val = arr[i];
192                 if (val == 0) {
193                         if (!found_empty) {
194                                 h->bucket = i;
195                                 found_empty = true;
196                         }
197                 } else {
198                         berr = match(ntdb, h->h, &key, val, rec, &arr);
199                         if (berr < 0) {
200                                 ecode = NTDB_OFF_TO_ERR(berr);
201                                 if (arr) {
202                                         ntdb_access_release(ntdb, arr);
203                                 }
204                                 goto fail;
205                         }
206                         if (berr) {
207                                 /* We found it! */
208                                 h->bucket = i;
209                                 off = val & NTDB_OFF_MASK;
210                                 if (arr) {
211                                         ntdb_access_release(ntdb, arr);
212                                 }
213                                 return off;
214                         }
215                 }
216         }
217         if (!found_empty) {
218                 /* Set to any non-zero value */
219                 h->old_val = 1;
220                 h->bucket = i;
221         }
222
223         if (arr) {
224                 ntdb_access_release(ntdb, arr);
225         }
226         return 0;
227
228 fail:
229         ntdb_unlock_hash(ntdb, h->bucket, ltype);
230         return NTDB_ERR_TO_OFF(ecode);
231 }
232
233 static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
234                                 ntdb_off_t new_off, uint32_t hash)
235 {
236         ntdb_off_t extra;
237
238         assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
239         assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
240         /* We pack extra hash bits into the upper bits of the offset. */
241         extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
242         extra <<= (64 - NTDB_OFF_UPPER_STEAL);
243
244         return new_off | extra;
245 }
246
247 /* Simply overwrite the hash entry we found before. */
248 enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
249                                 const struct hash_info *h,
250                                 ntdb_off_t new_off)
251 {
252         return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket),
253                               encode_offset(ntdb, new_off, h->h));
254 }
255
256 enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
257                                  const struct hash_info *h)
258 {
259         return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0);
260 }
261
262
263 enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
264                             const struct hash_info *h,
265                             ntdb_off_t new_off)
266 {
267         enum NTDB_ERROR ecode;
268         ntdb_off_t chain;
269         struct ntdb_used_record chdr;
270         const ntdb_off_t *old;
271         ntdb_off_t *new;
272
273         /* We hit an empty bucket during search?  That's where it goes. */
274         if (!h->old_val) {
275                 return replace_in_hash(ntdb, h, new_off);
276         }
277
278         /* Full at top-level?  Create a 2-element chain. */
279         if (h->table == NTDB_HASH_OFFSET) {
280                 ntdb_off_t pair[2];
281
282                 /* One element is old value, the other is the new value. */
283                 pair[0] = h->old_val;
284                 pair[1] = encode_offset(ntdb, new_off, h->h);
285
286                 chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true);
287                 if (NTDB_OFF_IS_ERR(chain)) {
288                         return NTDB_OFF_TO_ERR(chain);
289                 }
290                 ecode = ntdb_write_convert(ntdb,
291                                            chain
292                                            + sizeof(struct ntdb_used_record),
293                                            pair, sizeof(pair));
294                 if (ecode == NTDB_SUCCESS) {
295                         ecode = ntdb_write_off(ntdb,
296                                                hbucket_off(h->table, h->bucket),
297                                                chain
298                                                | (1ULL << NTDB_OFF_CHAIN_BIT));
299                 }
300                 return ecode;
301         }
302
303         /* Full bucket.  Expand. */
304         ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
305         if (ecode != NTDB_SUCCESS) {
306                 return ecode;
307         }
308
309         if (rec_extra_padding(&chdr) >= sizeof(new_off)) {
310                 /* Expand in place. */
311                 uint64_t dlen = rec_data_length(&chdr);
312
313                 ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0,
314                                    dlen + sizeof(new_off),
315                                    dlen + rec_extra_padding(&chdr));
316
317                 if (ecode != NTDB_SUCCESS) {
318                         return ecode;
319                 }
320                 /* find_and_lock set up h to point to last bucket. */
321                 ecode = replace_in_hash(ntdb, h, new_off);
322                 if (ecode != NTDB_SUCCESS) {
323                         return ecode;
324                 }
325                 ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr));
326                 if (ecode != NTDB_SUCCESS) {
327                         return ecode;
328                 }
329                 /* For futureproofing, we always make the first byte of padding
330                  * a zero. */
331                 if (rec_extra_padding(&chdr)) {
332                         ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr)
333                                                  + dlen + sizeof(new_off),
334                                                  "", 1);
335                 }
336                 return ecode;
337         }
338
339         /* We need to reallocate the chain. */
340         chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t),
341                       NTDB_CHAIN_MAGIC, true);
342         if (NTDB_OFF_IS_ERR(chain)) {
343                 return NTDB_OFF_TO_ERR(chain);
344         }
345
346         /* Map both and copy across old buckets. */
347         old = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
348                                h->table_size*sizeof(ntdb_off_t), true);
349         if (NTDB_PTR_IS_ERR(old)) {
350                 return NTDB_PTR_ERR(old);
351         }
352         new = ntdb_access_write(ntdb, hbucket_off(chain, 0),
353                                 (h->table_size + 1)*sizeof(ntdb_off_t), true);
354         if (NTDB_PTR_IS_ERR(new)) {
355                 ntdb_access_release(ntdb, old);
356                 return NTDB_PTR_ERR(new);
357         }
358
359         memcpy(new, old, h->bucket * sizeof(ntdb_off_t));
360         new[h->bucket] = encode_offset(ntdb, new_off, h->h);
361         ntdb_access_release(ntdb, old);
362
363         ecode = ntdb_access_commit(ntdb, new);
364         if (ecode != NTDB_SUCCESS) {
365                 return ecode;
366         }
367
368         /* Free the old chain. */
369         ecode = add_free_record(ntdb, h->table,
370                                 sizeof(struct ntdb_used_record)
371                                 + rec_data_length(&chdr)
372                                 + rec_extra_padding(&chdr),
373                                 NTDB_LOCK_WAIT, true);
374
375         /* Replace top-level to point to new chain */
376         return ntdb_write_off(ntdb,
377                               hbucket_off(NTDB_HASH_OFFSET,
378                                           bits_from(h->h, 0, ntdb->hash_bits)),
379                               chain | (1ULL << NTDB_OFF_CHAIN_BIT));
380 }
381
382 /* Traverse support: returns offset of record, or 0 or -ve error. */
383 static ntdb_off_t iterate_chain(struct ntdb_context *ntdb,
384                                 ntdb_off_t val,
385                                 struct hash_info *h)
386 {
387         ntdb_off_t i;
388         enum NTDB_ERROR ecode;
389         struct ntdb_used_record chdr;
390
391         /* First load up chain header. */
392         h->table = val & NTDB_OFF_MASK;
393         ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
394         if (ecode != NTDB_SUCCESS) {
395                 return ecode;
396         }
397
398         if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
399                 return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
400                                    NTDB_LOG_ERROR,
401                                    "get_table:"
402                                    " corrupt record %#x at %llu",
403                                    rec_magic(&chdr),
404                                    (long long)h->table);
405         }
406
407         /* Chain length is implied by data length. */
408         h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
409
410         i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket,
411                                   h->table_size);
412         if (NTDB_OFF_IS_ERR(i)) {
413                 return i;
414         }
415
416         if (i != h->table_size) {
417                 /* Return to next bucket. */
418                 h->bucket = i + 1;
419                 val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
420                 if (NTDB_OFF_IS_ERR(val)) {
421                         return val;
422                 }
423                 return val & NTDB_OFF_MASK;
424         }
425
426         /* Go back up to hash table. */
427         h->table = NTDB_HASH_OFFSET;
428         h->table_size = 1 << ntdb->hash_bits;
429         h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1;
430         return 0;
431 }
432
433 /* Keeps hash locked unless returns 0 or error. */
434 static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb,
435                                         struct hash_info *h)
436 {
437         ntdb_off_t val, i;
438         enum NTDB_ERROR ecode;
439
440         if (h->table != NTDB_HASH_OFFSET) {
441                 /* We're in a chain. */
442                 i = bits_from(h->h, 0, ntdb->hash_bits);
443                 ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
444                 if (ecode != NTDB_SUCCESS) {
445                         return NTDB_ERR_TO_OFF(ecode);
446                 }
447
448                 /* We dropped lock, bucket might have moved! */
449                 val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i));
450                 if (NTDB_OFF_IS_ERR(val)) {
451                         goto unlock;
452                 }
453
454                 /* We don't remove chains: there should still be one there! */
455                 if (!val || !is_chain(val)) {
456                         ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
457                                             NTDB_LOG_ERROR,
458                                             "iterate_hash:"
459                                             " vanished hchain %llu at %llu",
460                                             (long long)val,
461                                             (long long)i);
462                         val = NTDB_ERR_TO_OFF(ecode);
463                         goto unlock;
464                 }
465
466                 /* Find next bucket in the chain. */
467                 val = iterate_chain(ntdb, val, h);
468                 if (NTDB_OFF_IS_ERR(val)) {
469                         goto unlock;
470                 }
471                 if (val != 0) {
472                         return val;
473                 }
474                 ntdb_unlock_hash(ntdb, i, F_RDLCK);
475
476                 /* OK, we've reset h back to top level. */
477         }
478
479         /* We do this unlocked, then re-check. */
480         for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
481                                        h->bucket, h->table_size);
482              i != h->table_size;
483              i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
484                                        i+1, h->table_size)) {
485                 ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
486                 if (ecode != NTDB_SUCCESS) {
487                         return NTDB_ERR_TO_OFF(ecode);
488                 }
489
490                 val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
491                 if (NTDB_OFF_IS_ERR(val)) {
492                         goto unlock;
493                 }
494
495                 /* Lost race, and it's empty? */
496                 if (!val) {
497                         ntdb->stats.traverse_val_vanished++;
498                         ntdb_unlock_hash(ntdb, i, F_RDLCK);
499                         continue;
500                 }
501
502                 if (!is_chain(val)) {
503                         /* So caller knows what lock to free. */
504                         h->h = i;
505                         /* Return to next bucket. */
506                         h->bucket = i + 1;
507                         val &= NTDB_OFF_MASK;
508                         return val;
509                 }
510
511                 /* Start at beginning of chain */
512                 h->bucket = 0;
513                 h->h = i;
514
515                 val = iterate_chain(ntdb, val, h);
516                 if (NTDB_OFF_IS_ERR(val)) {
517                         goto unlock;
518                 }
519                 if (val != 0) {
520                         return val;
521                 }
522
523                 /* Otherwise, bucket has been set to i+1 */
524                 ntdb_unlock_hash(ntdb, i, F_RDLCK);
525         }
526         return 0;
527
528 unlock:
529         ntdb_unlock_hash(ntdb, i, F_RDLCK);
530         return val;
531 }
532
533 /* Return success if we find something, NTDB_ERR_NOEXIST if none. */
534 enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
535                              struct hash_info *h,
536                              NTDB_DATA *kbuf, size_t *dlen)
537 {
538         ntdb_off_t off;
539         struct ntdb_used_record rec;
540         enum NTDB_ERROR ecode;
541
542         off = lock_and_iterate_hash(ntdb, h);
543
544         if (NTDB_OFF_IS_ERR(off)) {
545                 return NTDB_OFF_TO_ERR(off);
546         } else if (off == 0) {
547                 return NTDB_ERR_NOEXIST;
548         }
549
550         /* The hash for this key is still locked. */
551         ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
552         if (ecode != NTDB_SUCCESS) {
553                 goto unlock;
554         }
555         if (rec_magic(&rec) != NTDB_USED_MAGIC) {
556                 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
557                                     NTDB_LOG_ERROR,
558                                     "next_in_hash:"
559                                     " corrupt record at %llu",
560                                     (long long)off);
561                 goto unlock;
562         }
563
564         kbuf->dsize = rec_key_length(&rec);
565
566         /* They want data as well? */
567         if (dlen) {
568                 *dlen = rec_data_length(&rec);
569                 kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
570                                              kbuf->dsize + *dlen);
571         } else {
572                 kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
573                                              kbuf->dsize);
574         }
575         if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
576                 ecode = NTDB_PTR_ERR(kbuf->dptr);
577                 goto unlock;
578         }
579         ecode = NTDB_SUCCESS;
580
581 unlock:
582         ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK);
583         return ecode;
584
585 }
586
587 enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
588                              struct hash_info *h,
589                              NTDB_DATA *kbuf, size_t *dlen)
590 {
591         h->table = NTDB_HASH_OFFSET;
592         h->table_size = 1 << ntdb->hash_bits;
593         h->bucket = 0;
594
595         return next_in_hash(ntdb, h, kbuf, dlen);
596 }
597
598 /* Even if the entry isn't in this hash bucket, you'd have to lock this
599  * bucket to find it. */
600 static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb,
601                                  const NTDB_DATA *key, int ltype)
602 {
603         uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize);
604
605         return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype);
606 }
607
608 /* lock/unlock one hash chain. This is meant to be used to reduce
609    contention - it cannot guarantee how many records will be locked */
610 _PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
611 {
612         return chainlock(ntdb, &key, F_WRLCK);
613 }
614
615 _PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
616 {
617         uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
618
619         ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK);
620 }
621
622 _PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb,
623                                              NTDB_DATA key)
624 {
625         return chainlock(ntdb, &key, F_RDLCK);
626 }
627
628 _PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
629 {
630         uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
631
632         ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK);
633 }