lib/tdb/common/check.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Rusty Russell             2009
   7
   8      ** NOTE! The following LGPL license applies to the tdb
   9      ** library. This does NOT imply that all of Samba is released
  10      ** under the LGPL
  11
  12    This library is free software; you can redistribute it and/or
  13    modify it under the terms of the GNU Lesser General Public
  14    License as published by the Free Software Foundation; either
  15    version 3 of the License, or (at your option) any later version.
  16
  17    This library is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20    Lesser General Public License for more details.
  21
  22    You should have received a copy of the GNU Lesser General Public
  23    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  24 */
  25 #include "tdb_private.h"
  26
  27 /* Since we opened it, these shouldn't fail unless it's recent corruption. */
  28 static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery)
  29 {
  30         struct tdb_header hdr;
  31
  32         if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), DOCONV()) == -1)
  33                 return false;
  34         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
  35                 goto corrupt;
  36
  37         CONVERT(hdr);
  38         if (hdr.version != TDB_VERSION)
  39                 goto corrupt;
  40
  41         if (hdr.rwlocks != 0)
  42                 goto corrupt;
  43
  44         if (hdr.hash_size == 0)
  45                 goto corrupt;
  46
  47         if (hdr.hash_size != tdb->header.hash_size)
  48                 goto corrupt;
  49
  50         if (hdr.recovery_start != 0 &&
  51             hdr.recovery_start < TDB_DATA_START(tdb->header.hash_size))
  52                 goto corrupt;
  53
  54         *recovery = hdr.recovery_start;
  55         return true;
  56
  57 corrupt:
  58         tdb->ecode = TDB_ERR_CORRUPT;
  59         TDB_LOG((tdb, TDB_DEBUG_ERROR, "Header is corrupt\n"));
  60         return false;
  61 }
  62
  63 /* Generic record header check. */
  64 static bool tdb_check_record(struct tdb_context *tdb,
  65                              tdb_off_t off,
  66                              const struct tdb_record *rec)
  67 {
  68         tdb_off_t tailer;
  69
  70         /* Check rec->next: 0 or points to record offset, aligned. */
  71         if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->header.hash_size)){
  72                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
  73                          "Record offset %d too small next %d\n",
  74                          off, rec->next));
  75                 goto corrupt;
  76         }
  77         if (rec->next + sizeof(*rec) < rec->next) {
  78                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
  79                          "Record offset %d too large next %d\n",
  80                          off, rec->next));
  81                 goto corrupt;
  82         }
  83         if ((rec->next % TDB_ALIGNMENT) != 0) {
  84                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
  85                          "Record offset %d misaligned next %d\n",
  86                          off, rec->next));
  87                 goto corrupt;
  88         }
  89         if (tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0))
  90                 goto corrupt;
  91
  92         /* Check rec_len: similar to rec->next, implies next record. */
  93         if ((rec->rec_len % TDB_ALIGNMENT) != 0) {
  94                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
  95                          "Record offset %d misaligned length %d\n",
  96                          off, rec->rec_len));
  97                 goto corrupt;
  98         }
  99         /* Must fit tailer. */
 100         if (rec->rec_len < sizeof(tailer)) {
 101                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 102                          "Record offset %d too short length %d\n",
 103                          off, rec->rec_len));
 104                 goto corrupt;
 105         }
 106         /* OOB allows "right at the end" access, so this works for last rec. */
 107         if (tdb->methods->tdb_oob(tdb, off+sizeof(*rec)+rec->rec_len, 0))
 108                 goto corrupt;
 109
 110         /* Check tailer. */
 111         if (tdb_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
 112                          &tailer) == -1)
 113                 goto corrupt;
 114         if (tailer != sizeof(*rec) + rec->rec_len) {
 115                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 116                          "Record offset %d invalid tailer\n", off));
 117                 goto corrupt;
 118         }
 119
 120         return true;
 121
 122 corrupt:
 123         tdb->ecode = TDB_ERR_CORRUPT;
 124         return false;
 125 }
 126
 127 /* Grab some bytes: may copy if can't use mmap.
 128    Caller has already done bounds check. */
 129 static TDB_DATA get_bytes(struct tdb_context *tdb,
 130                           tdb_off_t off, tdb_len_t len)
 131 {
 132         TDB_DATA d;
 133
 134         d.dsize = len;
 135
 136         if (tdb->transaction == NULL && tdb->map_ptr != NULL)
 137                 d.dptr = (unsigned char *)tdb->map_ptr + off;
 138         else
 139                 d.dptr = tdb_alloc_read(tdb, off, d.dsize);
 140         return d;
 141 }
 142
 143 /* Frees data if we're not able to simply use mmap. */
 144 static void put_bytes(struct tdb_context *tdb, TDB_DATA d)
 145 {
 146         if (tdb->transaction == NULL && tdb->map_ptr != NULL)
 147                 return;
 148         free(d.dptr);
 149 }
 150
 151 /* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
 152  * See: http://burtleburtle.net/bob/c/lookup3.c
 153  */
 154 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
 155 static void hash(uint32_t key, uint32_t *pc, uint32_t *pb)
 156 {
 157         uint32_t a,b,c;
 158
 159         /* Set up the internal state */
 160         a = b = c = 0xdeadbeef + *pc;
 161         c += *pb;
 162         a += key;
 163         c ^= b; c -= rot(b,14);
 164         a ^= c; a -= rot(c,11);
 165         b ^= a; b -= rot(a,25);
 166         c ^= b; c -= rot(b,16);
 167         a ^= c; a -= rot(c,4);
 168         b ^= a; b -= rot(a,14);
 169         c ^= b; c -= rot(b,24);
 170         *pc=c; *pb=b;
 171 }
 172
 173 /*
 174   We want to check that all free records are in the free list
 175   (only once), and all free list entries are free records.  Similarly
 176   for each hash chain of used records.
 177
 178   Doing that naively (without walking hash chains, since we want to be
 179   linear) means keeping a list of records which have been seen in each
 180   hash chain, and another of records pointed to (ie. next pointers
 181   from records and the initial hash chain heads).  These two lists
 182   should be equal.  This will take 8 bytes per record, and require
 183   sorting at the end.
 184
 185   So instead, we record each offset in a bitmap such a way that
 186   recording it twice will cancel out.  Since each offset should appear
 187   exactly twice, the bitmap should be zero at the end.
 188
 189   The approach was inspired by Bloom Filters (see Wikipedia).  For
 190   each value, we flip K bits in a bitmap of size N.  The number of
 191   distinct arrangements is:
 192
 193         N! / (K! * (N-K)!)
 194
 195   Of course, not all arrangements are actually distinct, but testing
 196   shows this formula to be close enough.
 197
 198   So, if K == 8 and N == 256, the probability of two things flipping the same
 199   bits is 1 in 409,663,695,276,000.
 200
 201   Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
 202   (320k) seems reasonable.
 203 */
 204 #define NUM_HASHES 8
 205 #define BITMAP_BITS 256
 206
 207 static void bit_flip(unsigned char bits[], unsigned int idx)
 208 {
 209         bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));
 210 }
 211
 212 /* We record offsets in a bitmap for the particular chain it should be in.  */
 213 static void record_offset(unsigned char bits[], tdb_off_t off)
 214 {
 215         uint32_t h1 = off, h2 = 0;
 216         unsigned int i;
 217
 218         /* We get two good hash values out of jhash2, so we use both.  Then
 219          * we keep going to produce further hash values. */
 220         for (i = 0; i < NUM_HASHES / 2; i++) {
 221                 hash(off, &h1, &h2);
 222                 bit_flip(bits, h1 % BITMAP_BITS);
 223                 bit_flip(bits, h2 % BITMAP_BITS);
 224                 h2++;
 225         }
 226 }
 227
 228 /* Check that an in-use record is valid. */
 229 static bool tdb_check_used_record(struct tdb_context *tdb,
 230                                   tdb_off_t off,
 231                                   const struct tdb_record *rec,
 232                                   unsigned char **hashes,
 233                                   int (*check)(TDB_DATA, TDB_DATA, void *),
 234                                   void *private_data)
 235 {
 236         TDB_DATA key, data;
 237
 238         if (!tdb_check_record(tdb, off, rec))
 239                 return false;
 240
 241         /* key + data + tailer must fit in record */
 242         if (rec->key_len + rec->data_len + sizeof(tdb_off_t) > rec->rec_len) {
 243                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 244                          "Record offset %d too short for contents\n", off));
 245                 return false;
 246         }
 247
 248         key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);
 249         if (!key.dptr)
 250                 return false;
 251
 252         if (tdb->hash_fn(&key) != rec->full_hash) {
 253                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 254                          "Record offset %d has incorrect hash\n", off));
 255                 goto fail_put_key;
 256         }
 257
 258         /* Mark this offset as a known value for this hash bucket. */
 259         record_offset(hashes[BUCKET(rec->full_hash)+1], off);
 260         /* And similarly if the next pointer is valid. */
 261         if (rec->next)
 262                 record_offset(hashes[BUCKET(rec->full_hash)+1], rec->next);
 263
 264         /* If they supply a check function and this record isn't dead,
 265            get data and feed it. */
 266         if (check && rec->magic != TDB_DEAD_MAGIC) {
 267                 data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
 268                                  rec->data_len);
 269                 if (!data.dptr)
 270                         goto fail_put_key;
 271
 272                 if (check(key, data, private_data) == -1)
 273                         goto fail_put_data;
 274                 put_bytes(tdb, data);
 275         }
 276
 277         put_bytes(tdb, key);
 278         return true;
 279
 280 fail_put_data:
 281         put_bytes(tdb, data);
 282 fail_put_key:
 283         put_bytes(tdb, key);
 284         return false;
 285 }
 286
 287 /* Check that an unused record is valid. */
 288 static bool tdb_check_free_record(struct tdb_context *tdb,
 289                                   tdb_off_t off,
 290                                   const struct tdb_record *rec,
 291                                   unsigned char **hashes)
 292 {
 293         if (!tdb_check_record(tdb, off, rec))
 294                 return false;
 295
 296         /* Mark this offset as a known value for the free list. */
 297         record_offset(hashes[0], off);
 298         /* And similarly if the next pointer is valid. */
 299         if (rec->next)
 300                 record_offset(hashes[0], rec->next);
 301         return true;
 302 }
 303
 304 /* Slow, but should be very rare. */
 305 static size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
 306 {
 307         size_t len;
 308
 309         for (len = 0; off + len < tdb->map_size; len++) {
 310                 char c;
 311                 if (tdb->methods->tdb_read(tdb, off, &c, 1, 0))
 312                         return 0;
 313                 if (c != 0 && c != 0x42)
 314                         break;
 315         }
 316         return len;
 317 }
 318
 319 int tdb_check(struct tdb_context *tdb,
 320               int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
 321               void *private_data)
 322 {
 323         unsigned int h;
 324         unsigned char **hashes;
 325         tdb_off_t off, recovery_start;
 326         struct tdb_record rec;
 327         bool found_recovery = false;
 328         tdb_len_t dead;
 329
 330         if (tdb_lockall_read(tdb) == -1)
 331                 return -1;
 332
 333         /* Make sure we know true size of the underlying file. */
 334         tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
 335
 336         /* Header must be OK: also gets us the recovery ptr, if any. */
 337         if (!tdb_check_header(tdb, &recovery_start))
 338                 goto unlock;
 339
 340         /* We should have the whole header, too. */
 341         if (tdb->map_size < TDB_DATA_START(tdb->header.hash_size)) {
 342                 tdb->ecode = TDB_ERR_CORRUPT;
 343                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "File too short for hashes\n"));
 344                 goto unlock;
 345         }
 346
 347         /* One big malloc: pointers then bit arrays. */
 348         hashes = (unsigned char **)calloc(
 349                         1, sizeof(hashes[0]) * (1+tdb->header.hash_size)
 350                         + BITMAP_BITS / CHAR_BIT * (1+tdb->header.hash_size));
 351         if (!hashes) {
 352                 tdb->ecode = TDB_ERR_OOM;
 353                 goto unlock;
 354         }
 355
 356         /* Initialize pointers */
 357         hashes[0] = (unsigned char *)(&hashes[1+tdb->header.hash_size]);
 358         for (h = 1; h < 1+tdb->header.hash_size; h++)
 359                 hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;
 360
 361         /* Freelist and hash headers are all in a row: read them. */
 362         for (h = 0; h < 1+tdb->header.hash_size; h++) {
 363                 if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
 364                                  &off) == -1)
 365                         goto free;
 366                 if (off)
 367                         record_offset(hashes[h], off);
 368         }
 369
 370         /* For each record, read it in and check it's ok. */
 371         for (off = TDB_DATA_START(tdb->header.hash_size);
 372              off < tdb->map_size;
 373              off += sizeof(rec) + rec.rec_len) {
 374                 if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
 375                                            DOCONV()) == -1)
 376                         goto free;
 377                 switch (rec.magic) {
 378                 case TDB_MAGIC:
 379                 case TDB_DEAD_MAGIC:
 380                         if (!tdb_check_used_record(tdb, off, &rec, hashes,
 381                                                    check, private_data))
 382                                 goto free;
 383                         break;
 384                 case TDB_FREE_MAGIC:
 385                         if (!tdb_check_free_record(tdb, off, &rec, hashes))
 386                                 goto free;
 387                         break;
 388                 /* If we crash after ftruncate, we can get zeroes or fill. */
 389                 case TDB_RECOVERY_INVALID_MAGIC:
 390                 case 0x42424242:
 391                         if (recovery_start == off) {
 392                                 found_recovery = true;
 393                                 break;
 394                         }
 395                         dead = dead_space(tdb, off);
 396                         if (dead < sizeof(rec))
 397                                 goto corrupt;
 398
 399                         TDB_LOG((tdb, TDB_DEBUG_ERROR,
 400                                  "Dead space at %d-%d (of %u)\n",
 401                                  off, off + dead, tdb->map_size));
 402                         rec.rec_len = dead - sizeof(rec);
 403                         break;
 404                 case TDB_RECOVERY_MAGIC:
 405                         if (recovery_start != off) {
 406                                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 407                                          "Unexpected recovery record at offset %d\n",
 408                                          off));
 409                                 goto free;
 410                         }
 411                         found_recovery = true;
 412                         break;
 413                 default: ;
 414                 corrupt:
 415                         tdb->ecode = TDB_ERR_CORRUPT;
 416                         TDB_LOG((tdb, TDB_DEBUG_ERROR,
 417                                  "Bad magic 0x%x at offset %d\n",
 418                                  rec.magic, off));
 419                         goto free;
 420                 }
 421         }
 422
 423         /* Now, hashes should all be empty: each record exists and is referred
 424          * to by one other. */
 425         for (h = 0; h < 1+tdb->header.hash_size; h++) {
 426                 unsigned int i;
 427                 for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
 428                         if (hashes[h][i] != 0) {
 429                                 tdb->ecode = TDB_ERR_CORRUPT;
 430                                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 431                                          "Hashes do not match records\n"));
 432                                 goto free;
 433                         }
 434                 }
 435         }
 436
 437         /* We must have found recovery area if there was one. */
 438         if (recovery_start != 0 && !found_recovery) {
 439                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 440                          "Expected a recovery area at %u\n",
 441                          recovery_start));
 442                 goto free;
 443         }
 444
 445         free(hashes);
 446         tdb_unlockall_read(tdb);
 447         return 0;
 448
 449 free:
 450         free(hashes);
 451 unlock:
 452         tdb_unlockall_read(tdb);
 453         return -1;
 454 }