tdb: add tdb_rescue()
[samba.git] / lib / tdb / common / rescue.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library, rescue attempt code.
5
6    Copyright (C) Rusty Russell             2012
7
8      ** NOTE! The following LGPL license applies to the tdb
9      ** library. This does NOT imply that all of Samba is released
10      ** under the LGPL
11
12    This library is free software; you can redistribute it and/or
13    modify it under the terms of the GNU Lesser General Public
14    License as published by the Free Software Foundation; either
15    version 3 of the License, or (at your option) any later version.
16
17    This library is distributed in the hope that it will be useful,
18    but WITHOUT ANY WARRANTY; without even the implied warranty of
19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20    Lesser General Public License for more details.
21
22    You should have received a copy of the GNU Lesser General Public
23    License along with this library; if not, see <http://www.gnu.org/licenses/>.
24 */
25 #include "tdb_private.h"
26 #include <assert.h>
27
28
29 struct found {
30         tdb_off_t head; /* 0 -> invalid. */
31         struct tdb_record rec;
32         TDB_DATA key;
33         bool in_hash;
34         bool in_free;
35 };
36
37 struct found_table {
38         /* As an ordered array (by head offset). */
39         struct found *arr;
40         unsigned int num, max;
41 };
42
43 static bool looks_like_valid_record(struct tdb_context *tdb,
44                                     tdb_off_t off,
45                                     const struct tdb_record *rec,
46                                     TDB_DATA *key)
47 {
48         unsigned int hval;
49
50         if (rec->magic != TDB_MAGIC)
51                 return false;
52
53         if (rec->key_len + rec->data_len > rec->rec_len)
54                 return false;
55
56         if (rec->rec_len % TDB_ALIGNMENT)
57                 return false;
58
59         /* Next pointer must make some sense. */
60         if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->header.hash_size))
61                 return false;
62
63         if (tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 1))
64                 return false;
65
66         key->dsize = rec->key_len;
67         key->dptr = tdb_alloc_read(tdb, off + sizeof(*rec), key->dsize);
68         if (!key->dptr)
69                 return false;
70
71         hval = tdb->hash_fn(key);
72         if (hval != rec->full_hash) {
73                 free(key->dptr);
74                 return false;
75         }
76
77         /* Caller frees up key->dptr */
78         return true;
79 }
80
81 static bool add_to_table(struct found_table *found,
82                          tdb_off_t off,
83                          struct tdb_record *rec,
84                          TDB_DATA key)
85 {
86         if (found->num + 1 > found->max) {
87                 struct found *new;
88                 found->max = (found->max ? found->max * 2 : 128);
89                 new = realloc(found->arr, found->max * sizeof(found->arr[0]));
90                 if (!new)
91                         return false;
92                 found->arr = new;
93         }
94
95         found->arr[found->num].head = off;
96         found->arr[found->num].rec = *rec;
97         found->arr[found->num].key = key;
98         found->arr[found->num].in_hash = false;
99         found->arr[found->num].in_free = false;
100
101         found->num++;
102         return true;
103 }
104
105 static bool walk_record(struct tdb_context *tdb,
106                         const struct found *f,
107                         void (*walk)(TDB_DATA, TDB_DATA, void *private_data),
108                         void *private_data)
109 {
110         TDB_DATA data;
111
112         data.dsize = f->rec.data_len;
113         data.dptr = tdb_alloc_read(tdb,
114                                    f->head + sizeof(f->rec) + f->rec.key_len,
115                                    data.dsize);
116         if (!data.dptr) {
117                 if (tdb->ecode == TDB_ERR_OOM)
118                         return false;
119                 /* I/O errors are expected. */
120                 return true;
121         }
122
123         walk(f->key, data, private_data);
124         free(data.dptr);
125         return true;
126 }
127
128 /* First entry which has offset >= this one. */
129 static unsigned int find_entry(struct found_table *found, tdb_off_t off)
130 {
131         unsigned int start = 0, end = found->num;
132
133         while (start < end) {
134                 /* We can't overflow here. */
135                 unsigned int mid = (start + end) / 2;
136
137                 if (off < found->arr[mid].head) {
138                         end = mid;
139                 } else if (off > found->arr[mid].head) {
140                         start = mid + 1;
141                 } else {
142                         return mid;
143                 }
144         }
145
146         assert(start == end);
147         return end;
148 }
149
150 static void found_in_hashchain(struct found_table *found, tdb_off_t head)
151 {
152         unsigned int match;
153
154         match = find_entry(found, head);
155         if (match < found->num && found->arr[match].head == head) {
156                 found->arr[match].in_hash = true;
157         }
158 }
159
160 static void mark_free_area(struct found_table *found, tdb_off_t head,
161                            tdb_len_t len)
162 {
163         unsigned int match;
164
165         match = find_entry(found, head);
166         /* Mark everything within this free entry. */
167         while (match < found->num) {
168                 if (found->arr[match].head >= head + len) {
169                         break;
170                 }
171                 found->arr[match].in_free = true;
172                 match++;
173         }
174 }
175
176 static int cmp_key(const void *a, const void *b)
177 {
178         const struct found *fa = a, *fb = b;
179
180         if (fa->key.dsize < fb->key.dsize) {
181                 return -1;
182         } else if (fa->key.dsize > fb->key.dsize) {
183                 return 1;
184         }
185         return memcmp(fa->key.dptr, fb->key.dptr, fa->key.dsize);
186 }
187
188 static bool key_eq(TDB_DATA a, TDB_DATA b)
189 {
190         return a.dsize == b.dsize
191                 && memcmp(a.dptr, b.dptr, a.dsize) == 0;
192 }
193
194 static void free_table(struct found_table *found)
195 {
196         unsigned int i;
197
198         for (i = 0; i < found->num; i++) {
199                 free(found->arr[i].key.dptr);
200         }
201         free(found->arr);
202 }
203
204 static void logging_suppressed(struct tdb_context *tdb,
205                                enum tdb_debug_level level, const char *fmt, ...)
206 {
207 }
208
209 _PUBLIC_ int tdb_rescue(struct tdb_context *tdb,
210                         void (*walk)(TDB_DATA, TDB_DATA, void *private_data),
211                         void *private_data)
212 {
213         struct found_table found = { NULL, 0, 0 };
214         tdb_off_t h, off, i;
215         tdb_log_func oldlog = tdb->log.log_fn;
216         struct tdb_record rec;
217         TDB_DATA key;
218         bool locked;
219
220         /* Read-only databases use no locking at all: it's best-effort.
221          * We may have a write lock already, so skip that case too. */
222         if (tdb->read_only || tdb->allrecord_lock.count != 0) {
223                 locked = false;
224         } else {
225                 if (tdb_lockall_read(tdb) == -1)
226                         return -1;
227                 locked = true;
228         }
229
230         /* Make sure we know true size of the underlying file. */
231         tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
232
233         /* Suppress logging, since we anticipate errors. */
234         tdb->log.log_fn = logging_suppressed;
235
236         /* Now walk entire db looking for records. */
237         for (off = TDB_DATA_START(tdb->header.hash_size);
238              off < tdb->map_size;
239              off += TDB_ALIGNMENT) {
240                 if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
241                                            DOCONV()) == -1)
242                         continue;
243
244                 if (looks_like_valid_record(tdb, off, &rec, &key)) {
245                         if (!add_to_table(&found, off, &rec, key)) {
246                                 goto oom;
247                         }
248                 }
249         }
250
251         /* Walk hash chains to positive vet. */
252         for (h = 0; h < 1+tdb->header.hash_size; h++) {
253                 bool slow_chase = false;
254                 tdb_off_t slow_off = FREELIST_TOP + h*sizeof(tdb_off_t);
255
256                 if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
257                                  &off) == -1)
258                         continue;
259
260                 while (off && off != slow_off) {
261                         if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
262                                                    DOCONV()) != 0) {
263                                 break;
264                         }
265
266                         /* 0 is the free list, rest are hash chains. */
267                         if (h == 0) {
268                                 /* Don't mark garbage as free. */
269                                 if (rec.magic != TDB_FREE_MAGIC) {
270                                         break;
271                                 }
272                                 mark_free_area(&found, off,
273                                                sizeof(rec) + rec.rec_len);
274                         } else {
275                                 found_in_hashchain(&found, off);
276                         }
277
278                         off = rec.next;
279
280                         /* Loop detection using second pointer at half-speed */
281                         if (slow_chase) {
282                                 /* First entry happens to be next ptr */
283                                 tdb_ofs_read(tdb, slow_off, &slow_off);
284                         }
285                         slow_chase = !slow_chase;
286                 }
287         }
288
289         /* Recovery area: must be marked as free, since it often has old
290          * records in there! */
291         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &off) == 0 && off != 0) {
292                 if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
293                                            DOCONV()) == 0) {
294                         mark_free_area(&found, off, sizeof(rec) + rec.rec_len);
295                 }
296         }
297
298         /* Now sort by key! */
299         qsort(found.arr, found.num, sizeof(found.arr[0]), cmp_key);
300
301         for (i = 0; i < found.num; ) {
302                 unsigned int num, num_in_hash = 0;
303
304                 /* How many are identical? */
305                 for (num = 0; num < found.num - i; num++) {
306                         if (!key_eq(found.arr[i].key, found.arr[i+num].key)) {
307                                 break;
308                         }
309                         if (found.arr[i+num].in_hash) {
310                                 if (!walk_record(tdb, &found.arr[i+num],
311                                                  walk, private_data))
312                                         goto oom;
313                                 num_in_hash++;
314                         }
315                 }
316                 assert(num);
317
318                 /* If none were in the hash, we print any not in free list. */
319                 if (num_in_hash == 0) {
320                         unsigned int j;
321
322                         for (j = i; j < i + num; j++) {
323                                 if (!found.arr[j].in_free) {
324                                         if (!walk_record(tdb, &found.arr[j],
325                                                          walk, private_data))
326                                                 goto oom;
327                                 }
328                         }
329                 }
330
331                 i += num;
332         }
333
334         tdb->log.log_fn = oldlog;
335         if (locked) {
336                 tdb_unlockall_read(tdb);
337         }
338         return 0;
339
340 oom:
341         tdb->log.log_fn = oldlog;
342         tdb->ecode = TDB_ERR_OOM;
343         TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_rescue: failed allocating\n"));
344         free_table(&found);
345         if (locked) {
346                 tdb_unlockall_read(tdb);
347         }
348         return -1;
349 }