tdb: introduce tdb->hdr_ofs
[samba.git] / lib / tdb / common / io.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9
10      ** NOTE! The following LGPL license applies to the tdb
11      ** library. This does NOT imply that all of Samba is released
12      ** under the LGPL
13
14    This library is free software; you can redistribute it and/or
15    modify it under the terms of the GNU Lesser General Public
16    License as published by the Free Software Foundation; either
17    version 3 of the License, or (at your option) any later version.
18
19    This library is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22    Lesser General Public License for more details.
23
24    You should have received a copy of the GNU Lesser General Public
25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
26 */
27
28
29 #include "tdb_private.h"
30
31 /*
32  * tdb->hdr_ofs is 0 for now.
33  *
34  * Note: that we only have the 4GB limit of tdb_off_t for
35  * tdb->map_size. The file size on disk can be 4GB + tdb->hdr_ofs!
36  */
37
38 static bool tdb_adjust_offset(struct tdb_context *tdb, off_t *off)
39 {
40         off_t tmp = tdb->hdr_ofs + *off;
41
42         if ((tmp < tdb->hdr_ofs) || (tmp < *off)) {
43                 errno = EIO;
44                 return false;
45         }
46
47         *off = tmp;
48         return true;
49 }
50
51 static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf,
52                           size_t count, off_t offset)
53 {
54         if (!tdb_adjust_offset(tdb, &offset)) {
55                 return -1;
56         }
57         return pwrite(tdb->fd, buf, count, offset);
58 }
59
60 static ssize_t tdb_pread(struct tdb_context *tdb, void *buf,
61                          size_t count, off_t offset)
62 {
63         if (!tdb_adjust_offset(tdb, &offset)) {
64                 return -1;
65         }
66         return pread(tdb->fd, buf, count, offset);
67 }
68
69 static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
70 {
71         if (!tdb_adjust_offset(tdb, &length)) {
72                 return -1;
73         }
74         return ftruncate(tdb->fd, length);
75 }
76
77 static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
78 {
79         int ret;
80
81         ret = fstat(tdb->fd, buf);
82         if (ret == -1) {
83                 return -1;
84         }
85
86         if (buf->st_size < tdb->hdr_ofs) {
87                 errno = EIO;
88                 return -1;
89         }
90         buf->st_size -= tdb->hdr_ofs;
91
92         return ret;
93 }
94
95 /* check for an out of bounds access - if it is out of bounds then
96    see if the database has been expanded by someone else and expand
97    if necessary
98 */
99 static int tdb_oob(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len,
100                    int probe)
101 {
102         struct stat st;
103         if (len + off < len) {
104                 if (!probe) {
105                         /* Ensure ecode is set for log fn. */
106                         tdb->ecode = TDB_ERR_IO;
107                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob off %u len %u wrap\n",
108                                  off, len));
109                 }
110                 return -1;
111         }
112
113         if (off + len <= tdb->map_size)
114                 return 0;
115         if (tdb->flags & TDB_INTERNAL) {
116                 if (!probe) {
117                         /* Ensure ecode is set for log fn. */
118                         tdb->ecode = TDB_ERR_IO;
119                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond internal malloc size %u\n",
120                                  (int)(off + len), (int)tdb->map_size));
121                 }
122                 return -1;
123         }
124
125         if (tdb_fstat(tdb, &st) == -1) {
126                 tdb->ecode = TDB_ERR_IO;
127                 return -1;
128         }
129
130         /* Beware >4G files! */
131         if ((tdb_off_t)st.st_size != st.st_size) {
132                 /* Ensure ecode is set for log fn. */
133                 tdb->ecode = TDB_ERR_IO;
134                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_oob len %llu too large!\n",
135                          (long long)st.st_size));
136                 return -1;
137         }
138
139         /* Unmap, update size, remap.  We do this unconditionally, to handle
140          * the unusual case where the db is truncated.
141          *
142          * This can happen to a child using tdb_reopen_all(true) on a
143          * TDB_CLEAR_IF_FIRST tdb whose parent crashes: the next
144          * opener will truncate the database. */
145         if (tdb_munmap(tdb) == -1) {
146                 tdb->ecode = TDB_ERR_IO;
147                 return -1;
148         }
149         tdb->map_size = st.st_size;
150         if (tdb_mmap(tdb) != 0) {
151                 return -1;
152         }
153
154         if (st.st_size < (size_t)off + len) {
155                 if (!probe) {
156                         /* Ensure ecode is set for log fn. */
157                         tdb->ecode = TDB_ERR_IO;
158                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond eof at %u\n",
159                                  (int)(off + len), (int)st.st_size));
160                 }
161                 return -1;
162         }
163         return 0;
164 }
165
166 /* write a lump of data at a specified offset */
167 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
168                      const void *buf, tdb_len_t len)
169 {
170         if (len == 0) {
171                 return 0;
172         }
173
174         if (tdb->read_only || tdb->traverse_read) {
175                 tdb->ecode = TDB_ERR_RDONLY;
176                 return -1;
177         }
178
179         if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0)
180                 return -1;
181
182         if (tdb->map_ptr) {
183                 memcpy(off + (char *)tdb->map_ptr, buf, len);
184         } else {
185 #ifdef HAVE_INCOHERENT_MMAP
186                 tdb->ecode = TDB_ERR_IO;
187                 return -1;
188 #else
189                 ssize_t written;
190
191                 written = tdb_pwrite(tdb, buf, len, off);
192
193                 if ((written != (ssize_t)len) && (written != -1)) {
194                         /* try once more */
195                         tdb->ecode = TDB_ERR_IO;
196                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
197                                  "%zi of %u bytes at %u, trying once more\n",
198                                  written, len, off));
199                         written = tdb_pwrite(tdb, (const char *)buf+written,
200                                              len-written, off+written);
201                 }
202                 if (written == -1) {
203                         /* Ensure ecode is set for log fn. */
204                         tdb->ecode = TDB_ERR_IO;
205                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %u "
206                                  "len=%u (%s)\n", off, len, strerror(errno)));
207                         return -1;
208                 } else if (written != (ssize_t)len) {
209                         tdb->ecode = TDB_ERR_IO;
210                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
211                                  "write %u bytes at %u in two attempts\n",
212                                  len, off));
213                         return -1;
214                 }
215 #endif
216         }
217         return 0;
218 }
219
/*
 * Endian conversion: we only ever deal with 4 byte quantities.
 *
 * Applies TDB_BYTEREV in place to each of the size/4 32-bit words
 * starting at buf (any trailing bytes beyond a multiple of four are
 * left alone) and returns buf.
 */
void *tdb_convert(void *buf, uint32_t size)
{
	uint32_t *word = (uint32_t *)buf;
	uint32_t remaining = size / 4;

	while (remaining > 0) {
		*word = TDB_BYTEREV(*word);
		word++;
		remaining--;
	}
	return buf;
}
228
229
230 /* read a lump of data at a specified offset, maybe convert */
231 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
232                     tdb_len_t len, int cv)
233 {
234         if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0) {
235                 return -1;
236         }
237
238         if (tdb->map_ptr) {
239                 memcpy(buf, off + (char *)tdb->map_ptr, len);
240         } else {
241 #ifdef HAVE_INCOHERENT_MMAP
242                 tdb->ecode = TDB_ERR_IO;
243                 return -1;
244 #else
245                 ssize_t ret;
246
247                 ret = tdb_pread(tdb, buf, len, off);
248                 if (ret != (ssize_t)len) {
249                         /* Ensure ecode is set for log fn. */
250                         tdb->ecode = TDB_ERR_IO;
251                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %u "
252                                  "len=%u ret=%zi (%s) map_size=%u\n",
253                                  off, len, ret, strerror(errno),
254                                  tdb->map_size));
255                         return -1;
256                 }
257 #endif
258         }
259         if (cv) {
260                 tdb_convert(buf, len);
261         }
262         return 0;
263 }
264
265
266
267 /*
268   do an unlocked scan of the hash table heads to find the next non-zero head. The value
269   will then be confirmed with the lock held
270 */
271 static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
272 {
273         uint32_t h = *chain;
274         if (tdb->map_ptr) {
275                 for (;h < tdb->hash_size;h++) {
276                         if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
277                                 break;
278                         }
279                 }
280         } else {
281                 uint32_t off=0;
282                 for (;h < tdb->hash_size;h++) {
283                         if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
284                                 break;
285                         }
286                 }
287         }
288         (*chain) = h;
289 }
290
291
292 int tdb_munmap(struct tdb_context *tdb)
293 {
294         if (tdb->flags & TDB_INTERNAL)
295                 return 0;
296
297 #ifdef HAVE_MMAP
298         if (tdb->map_ptr) {
299                 int ret;
300
301                 ret = munmap(tdb->map_ptr, tdb->map_size);
302                 if (ret != 0)
303                         return ret;
304         }
305 #endif
306         tdb->map_ptr = NULL;
307         return 0;
308 }
309
310 /* If mmap isn't coherent, *everyone* must always mmap. */
311 static bool should_mmap(const struct tdb_context *tdb)
312 {
313 #ifdef HAVE_INCOHERENT_MMAP
314         return true;
315 #else
316         return !(tdb->flags & TDB_NOMMAP);
317 #endif
318 }
319
320 int tdb_mmap(struct tdb_context *tdb)
321 {
322         if (tdb->flags & TDB_INTERNAL)
323                 return 0;
324
325 #ifdef HAVE_MMAP
326         if (should_mmap(tdb)) {
327                 tdb->map_ptr = mmap(NULL, tdb->map_size,
328                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
329                                     MAP_SHARED|MAP_FILE, tdb->fd,
330                                     tdb->hdr_ofs);
331
332                 /*
333                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
334                  */
335
336                 if (tdb->map_ptr == MAP_FAILED) {
337                         tdb->map_ptr = NULL;
338                         TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %u (%s)\n",
339                                  tdb->map_size, strerror(errno)));
340 #ifdef HAVE_INCOHERENT_MMAP
341                         tdb->ecode = TDB_ERR_IO;
342                         return -1;
343 #endif
344                 }
345         } else {
346                 tdb->map_ptr = NULL;
347         }
348 #else
349         tdb->map_ptr = NULL;
350 #endif
351         return 0;
352 }
353
354 /* expand a file.  we prefer to use ftruncate, as that is what posix
355   says to use for mmap expansion */
356 static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition)
357 {
358         char buf[8192];
359         tdb_off_t new_size;
360
361         if (tdb->read_only || tdb->traverse_read) {
362                 tdb->ecode = TDB_ERR_RDONLY;
363                 return -1;
364         }
365
366         if (!tdb_add_off_t(size, addition, &new_size)) {
367                 tdb->ecode = TDB_ERR_OOM;
368                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
369                         "overflow detected current size[%u] addition[%u]!\n",
370                         (unsigned)size, (unsigned)addition));
371                 errno = ENOSPC;
372                 return -1;
373         }
374
375         if (tdb_ftruncate(tdb, new_size) == -1) {
376                 char b = 0;
377                 ssize_t written = tdb_pwrite(tdb, &b, 1, new_size - 1);
378                 if (written == 0) {
379                         /* try once more, potentially revealing errno */
380                         written = tdb_pwrite(tdb, &b, 1, new_size - 1);
381                 }
382                 if (written == 0) {
383                         /* again - give up, guessing errno */
384                         errno = ENOSPC;
385                 }
386                 if (written != 1) {
387                         tdb->ecode = TDB_ERR_OOM;
388                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %u failed (%s)\n",
389                                  (unsigned)new_size, strerror(errno)));
390                         return -1;
391                 }
392         }
393
394         /* now fill the file with something. This ensures that the
395            file isn't sparse, which would be very bad if we ran out of
396            disk. This must be done with write, not via mmap */
397         memset(buf, TDB_PAD_BYTE, sizeof(buf));
398         while (addition) {
399                 size_t n = addition>sizeof(buf)?sizeof(buf):addition;
400                 ssize_t written = tdb_pwrite(tdb, buf, n, size);
401                 if (written == 0) {
402                         /* prevent infinite loops: try _once_ more */
403                         written = tdb_pwrite(tdb, buf, n, size);
404                 }
405                 if (written == 0) {
406                         /* give up, trying to provide a useful errno */
407                         tdb->ecode = TDB_ERR_OOM;
408                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
409                                 "returned 0 twice: giving up!\n"));
410                         errno = ENOSPC;
411                         return -1;
412                 }
413                 if (written == -1) {
414                         tdb->ecode = TDB_ERR_OOM;
415                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of "
416                                  "%u bytes failed (%s)\n", (int)n,
417                                  strerror(errno)));
418                         return -1;
419                 }
420                 if (written != n) {
421                         TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: wrote "
422                                  "only %zu of %zi bytes - retrying\n", written,
423                                  n));
424                 }
425                 addition -= written;
426                 size += written;
427         }
428         return 0;
429 }
430
431
432 /* You need 'size', this tells you how much you should expand by. */
433 tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size, int page_size)
434 {
435         tdb_off_t new_size, top_size, increment;
436         tdb_off_t max_size = UINT32_MAX - map_size;
437
438         if (size > max_size) {
439                 /*
440                  * We can't round up anymore, just give back
441                  * what we're asked for.
442                  *
443                  * The caller has to take care of the ENOSPC handling.
444                  */
445                 return size;
446         }
447
448         /* limit size in order to avoid using up huge amounts of memory for
449          * in memory tdbs if an oddball huge record creeps in */
450         if (size > 100 * 1024) {
451                 increment = size * 2;
452         } else {
453                 increment = size * 100;
454         }
455         if (increment < size) {
456                 goto overflow;
457         }
458
459         if (!tdb_add_off_t(map_size, increment, &top_size)) {
460                 goto overflow;
461         }
462
463         /* always make room for at least top_size more records, and at
464            least 25% more space. if the DB is smaller than 100MiB,
465            otherwise grow it by 10% only. */
466         if (map_size > 100 * 1024 * 1024) {
467                 new_size = map_size * 1.10;
468         } else {
469                 new_size = map_size * 1.25;
470         }
471         if (new_size < map_size) {
472                 goto overflow;
473         }
474
475         /* Round the database up to a multiple of the page size */
476         new_size = MAX(top_size, new_size);
477
478         if (new_size + page_size < new_size) {
479                 /* There's a "+" in TDB_ALIGN that might overflow... */
480                 goto overflow;
481         }
482
483         return TDB_ALIGN(new_size, page_size) - map_size;
484
485 overflow:
486         /*
487          * Somewhere in between we went over 4GB. Make one big jump to
488          * exactly 4GB database size.
489          */
490         return max_size;
491 }
492
493 /* expand the database at least size bytes by expanding the underlying
494    file and doing the mmap again if necessary */
495 int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
496 {
497         struct tdb_record rec;
498         tdb_off_t offset;
499         tdb_off_t new_size;
500
501         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
502                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
503                 return -1;
504         }
505
506         /* must know about any previous expansions by another process */
507         tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
508
509         /*
510          * Note: that we don't care about tdb->hdr_ofs != 0 here
511          *
512          * The 4GB limitation is just related to tdb->map_size
513          * and the offset calculation in the records.
514          *
515          * The file on disk can be up to 4GB + tdb->hdr_ofs
516          */
517         size = tdb_expand_adjust(tdb->map_size, size, tdb->page_size);
518
519         if (!tdb_add_off_t(tdb->map_size, size, &new_size)) {
520                 tdb->ecode = TDB_ERR_OOM;
521                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_expand "
522                         "overflow detected current map_size[%u] size[%u]!\n",
523                         (unsigned)tdb->map_size, (unsigned)size));
524                 goto fail;
525         }
526
527         /* form a new freelist record */
528         offset = tdb->map_size;
529         memset(&rec,'\0',sizeof(rec));
530         rec.rec_len = size - sizeof(rec);
531
532         if (tdb->flags & TDB_INTERNAL) {
533                 char *new_map_ptr;
534
535                 new_map_ptr = (char *)realloc(tdb->map_ptr, new_size);
536                 if (!new_map_ptr) {
537                         tdb->ecode = TDB_ERR_OOM;
538                         goto fail;
539                 }
540                 tdb->map_ptr = new_map_ptr;
541                 tdb->map_size = new_size;
542         } else {
543                 int ret;
544
545                 /*
546                  * expand the file itself
547                  */
548                 ret = tdb->methods->tdb_expand_file(tdb, tdb->map_size, size);
549                 if (ret != 0) {
550                         goto fail;
551                 }
552
553                 /* Explicitly remap: if we're in a transaction, this won't
554                  * happen automatically! */
555                 tdb_munmap(tdb);
556                 tdb->map_size = new_size;
557                 if (tdb_mmap(tdb) != 0) {
558                         goto fail;
559                 }
560         }
561
562         /* link it into the free list */
563         if (tdb_free(tdb, offset, &rec) == -1)
564                 goto fail;
565
566         tdb_unlock(tdb, -1, F_WRLCK);
567         return 0;
568  fail:
569         tdb_unlock(tdb, -1, F_WRLCK);
570         return -1;
571 }
572
573 /* read/write a tdb_off_t */
574 int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
575 {
576         return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
577 }
578
579 int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
580 {
581         tdb_off_t off = *d;
582         return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
583 }
584
585
586 /* read a lump of data, allocating the space for it */
587 unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
588 {
589         unsigned char *buf;
590
591         /* some systems don't like zero length malloc */
592
593         if (!(buf = (unsigned char *)malloc(len ? len : 1))) {
594                 /* Ensure ecode is set for log fn. */
595                 tdb->ecode = TDB_ERR_OOM;
596                 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%u (%s)\n",
597                            len, strerror(errno)));
598                 return NULL;
599         }
600         if (tdb->methods->tdb_read(tdb, offset, buf, len, 0) == -1) {
601                 SAFE_FREE(buf);
602                 return NULL;
603         }
604         return buf;
605 }
606
607 /* Give a piece of tdb data to a parser */
608
609 int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
610                    tdb_off_t offset, tdb_len_t len,
611                    int (*parser)(TDB_DATA key, TDB_DATA data,
612                                  void *private_data),
613                    void *private_data)
614 {
615         TDB_DATA data;
616         int result;
617
618         data.dsize = len;
619
620         if ((tdb->transaction == NULL) && (tdb->map_ptr != NULL)) {
621                 /*
622                  * Optimize by avoiding the malloc/memcpy/free, point the
623                  * parser directly at the mmap area.
624                  */
625                 if (tdb->methods->tdb_oob(tdb, offset, len, 0) != 0) {
626                         return -1;
627                 }
628                 data.dptr = offset + (unsigned char *)tdb->map_ptr;
629                 return parser(key, data, private_data);
630         }
631
632         if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
633                 return -1;
634         }
635
636         result = parser(key, data, private_data);
637         free(data.dptr);
638         return result;
639 }
640
641 /* read/write a record */
642 int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
643 {
644         if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
645                 return -1;
646         if (TDB_BAD_MAGIC(rec)) {
647                 /* Ensure ecode is set for log fn. */
648                 tdb->ecode = TDB_ERR_CORRUPT;
649                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%u\n", rec->magic, offset));
650                 return -1;
651         }
652         return tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 0);
653 }
654
655 int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
656 {
657         struct tdb_record r = *rec;
658         return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
659 }
660
661 static const struct tdb_methods io_methods = {
662         tdb_read,
663         tdb_write,
664         tdb_next_hash_chain,
665         tdb_oob,
666         tdb_expand_file,
667 };
668
669 /*
670   initialise the default methods table
671 */
672 void tdb_io_init(struct tdb_context *tdb)
673 {
674         tdb->methods = &io_methods;
675 }