tdb: avoid many fcntl calls when incrementing seqnum
[samba.git] / lib / tdb / common / io.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9
10      ** NOTE! The following LGPL license applies to the tdb
11      ** library. This does NOT imply that all of Samba is released
12      ** under the LGPL
13
14    This library is free software; you can redistribute it and/or
15    modify it under the terms of the GNU Lesser General Public
16    License as published by the Free Software Foundation; either
17    version 3 of the License, or (at your option) any later version.
18
19    This library is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22    Lesser General Public License for more details.
23
24    You should have received a copy of the GNU Lesser General Public
25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
26 */
27
28
29 #include "tdb_private.h"
30
31 /*
32  * We prepend the mutex area, so fixup offsets. See mutex.c for details.
33  * tdb->hdr_ofs is 0 or header.mutex_size.
34  *
 * Note that the 4GB limit of tdb_off_t applies only to
 * tdb->map_size. The file size on disk can be 4GB + tdb->hdr_ofs!
37  */
38
39 static bool tdb_adjust_offset(struct tdb_context *tdb, off_t *off)
40 {
41         off_t tmp = tdb->hdr_ofs + *off;
42
43         if ((tmp < tdb->hdr_ofs) || (tmp < *off)) {
44                 errno = EIO;
45                 return false;
46         }
47
48         *off = tmp;
49         return true;
50 }
51
52 static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf,
53                           size_t count, off_t offset)
54 {
55         if (!tdb_adjust_offset(tdb, &offset)) {
56                 return -1;
57         }
58         return pwrite(tdb->fd, buf, count, offset);
59 }
60
61 static ssize_t tdb_pread(struct tdb_context *tdb, void *buf,
62                          size_t count, off_t offset)
63 {
64         if (!tdb_adjust_offset(tdb, &offset)) {
65                 return -1;
66         }
67         return pread(tdb->fd, buf, count, offset);
68 }
69
70 static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
71 {
72         if (!tdb_adjust_offset(tdb, &length)) {
73                 return -1;
74         }
75         return ftruncate(tdb->fd, length);
76 }
77
78 static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
79 {
80         int ret;
81
82         ret = fstat(tdb->fd, buf);
83         if (ret == -1) {
84                 return -1;
85         }
86
87         if (buf->st_size < tdb->hdr_ofs) {
88                 errno = EIO;
89                 return -1;
90         }
91         buf->st_size -= tdb->hdr_ofs;
92
93         return ret;
94 }
95
96 /* check for an out of bounds access - if it is out of bounds then
97    see if the database has been expanded by someone else and expand
98    if necessary
99 */
100 static int tdb_oob(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len,
101                    int probe)
102 {
103         struct stat st;
104         if (len + off < len) {
105                 if (!probe) {
106                         /* Ensure ecode is set for log fn. */
107                         tdb->ecode = TDB_ERR_IO;
108                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob off %u len %u wrap\n",
109                                  off, len));
110                 }
111                 return -1;
112         }
113
114         if (off + len <= tdb->map_size)
115                 return 0;
116         if (tdb->flags & TDB_INTERNAL) {
117                 if (!probe) {
118                         /* Ensure ecode is set for log fn. */
119                         tdb->ecode = TDB_ERR_IO;
120                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond internal malloc size %u\n",
121                                  (int)(off + len), (int)tdb->map_size));
122                 }
123                 return -1;
124         }
125
126         if (tdb_fstat(tdb, &st) == -1) {
127                 tdb->ecode = TDB_ERR_IO;
128                 return -1;
129         }
130
131         /* Beware >4G files! */
132         if ((tdb_off_t)st.st_size != st.st_size) {
133                 /* Ensure ecode is set for log fn. */
134                 tdb->ecode = TDB_ERR_IO;
135                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_oob len %llu too large!\n",
136                          (long long)st.st_size));
137                 return -1;
138         }
139
140         /* Unmap, update size, remap.  We do this unconditionally, to handle
141          * the unusual case where the db is truncated.
142          *
143          * This can happen to a child using tdb_reopen_all(true) on a
144          * TDB_CLEAR_IF_FIRST tdb whose parent crashes: the next
145          * opener will truncate the database. */
146         if (tdb_munmap(tdb) == -1) {
147                 tdb->ecode = TDB_ERR_IO;
148                 return -1;
149         }
150         tdb->map_size = st.st_size;
151         if (tdb_mmap(tdb) != 0) {
152                 return -1;
153         }
154
155         if (st.st_size < (size_t)off + len) {
156                 if (!probe) {
157                         /* Ensure ecode is set for log fn. */
158                         tdb->ecode = TDB_ERR_IO;
159                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond eof at %u\n",
160                                  (int)(off + len), (int)st.st_size));
161                 }
162                 return -1;
163         }
164         return 0;
165 }
166
167 /* write a lump of data at a specified offset */
168 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
169                      const void *buf, tdb_len_t len)
170 {
171         if (len == 0) {
172                 return 0;
173         }
174
175         if (tdb->read_only || tdb->traverse_read) {
176                 tdb->ecode = TDB_ERR_RDONLY;
177                 return -1;
178         }
179
180         if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0)
181                 return -1;
182
183         if (tdb->map_ptr) {
184                 memcpy(off + (char *)tdb->map_ptr, buf, len);
185         } else {
186 #ifdef HAVE_INCOHERENT_MMAP
187                 tdb->ecode = TDB_ERR_IO;
188                 return -1;
189 #else
190                 ssize_t written;
191
192                 written = tdb_pwrite(tdb, buf, len, off);
193
194                 if ((written != (ssize_t)len) && (written != -1)) {
195                         /* try once more */
196                         tdb->ecode = TDB_ERR_IO;
197                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
198                                  "%zi of %u bytes at %u, trying once more\n",
199                                  written, len, off));
200                         written = tdb_pwrite(tdb, (const char *)buf+written,
201                                              len-written, off+written);
202                 }
203                 if (written == -1) {
204                         /* Ensure ecode is set for log fn. */
205                         tdb->ecode = TDB_ERR_IO;
206                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %u "
207                                  "len=%u (%s)\n", off, len, strerror(errno)));
208                         return -1;
209                 } else if (written != (ssize_t)len) {
210                         tdb->ecode = TDB_ERR_IO;
211                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
212                                  "write %u bytes at %u in two attempts\n",
213                                  len, off));
214                         return -1;
215                 }
216 #endif
217         }
218         return 0;
219 }
220
/*
 * Endian conversion: apply TDB_BYTEREV in place to each complete 4 byte
 * word of buf (size / 4 words; trailing bytes are left alone).
 * Returns buf.
 */
void *tdb_convert(void *buf, uint32_t size)
{
        uint32_t *words = (uint32_t *)buf;
        const uint32_t nwords = size / 4;
        uint32_t idx = 0;

        while (idx < nwords) {
                words[idx] = TDB_BYTEREV(words[idx]);
                idx++;
        }
        return buf;
}
229
230
231 /* read a lump of data at a specified offset, maybe convert */
232 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
233                     tdb_len_t len, int cv)
234 {
235         if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0) {
236                 return -1;
237         }
238
239         if (tdb->map_ptr) {
240                 memcpy(buf, off + (char *)tdb->map_ptr, len);
241         } else {
242 #ifdef HAVE_INCOHERENT_MMAP
243                 tdb->ecode = TDB_ERR_IO;
244                 return -1;
245 #else
246                 ssize_t ret;
247
248                 ret = tdb_pread(tdb, buf, len, off);
249                 if (ret != (ssize_t)len) {
250                         /* Ensure ecode is set for log fn. */
251                         tdb->ecode = TDB_ERR_IO;
252                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %u "
253                                  "len=%u ret=%zi (%s) map_size=%u\n",
254                                  off, len, ret, strerror(errno),
255                                  tdb->map_size));
256                         return -1;
257                 }
258 #endif
259         }
260         if (cv) {
261                 tdb_convert(buf, len);
262         }
263         return 0;
264 }
265
266
267
268 /*
269   do an unlocked scan of the hash table heads to find the next non-zero head. The value
270   will then be confirmed with the lock held
271 */
272 static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
273 {
274         uint32_t h = *chain;
275         if (tdb->map_ptr) {
276                 for (;h < tdb->hash_size;h++) {
277                         if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
278                                 break;
279                         }
280                 }
281         } else {
282                 uint32_t off=0;
283                 for (;h < tdb->hash_size;h++) {
284                         if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
285                                 break;
286                         }
287                 }
288         }
289         (*chain) = h;
290 }
291
292
293 int tdb_munmap(struct tdb_context *tdb)
294 {
295         if (tdb->flags & TDB_INTERNAL)
296                 return 0;
297
298 #ifdef HAVE_MMAP
299         if (tdb->map_ptr) {
300                 int ret;
301
302                 ret = munmap(tdb->map_ptr, tdb->map_size);
303                 if (ret != 0)
304                         return ret;
305         }
306 #endif
307         tdb->map_ptr = NULL;
308         return 0;
309 }
310
311 /* If mmap isn't coherent, *everyone* must always mmap. */
312 static bool should_mmap(const struct tdb_context *tdb)
313 {
314 #ifdef HAVE_INCOHERENT_MMAP
315         return true;
316 #else
317         return !(tdb->flags & TDB_NOMMAP);
318 #endif
319 }
320
321 int tdb_mmap(struct tdb_context *tdb)
322 {
323         if (tdb->flags & TDB_INTERNAL)
324                 return 0;
325
326 #ifdef HAVE_MMAP
327         if (should_mmap(tdb)) {
328                 tdb->map_ptr = mmap(NULL, tdb->map_size,
329                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
330                                     MAP_SHARED|MAP_FILE, tdb->fd,
331                                     tdb->hdr_ofs);
332
333                 /*
334                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
335                  */
336
337                 if (tdb->map_ptr == MAP_FAILED) {
338                         tdb->map_ptr = NULL;
339                         TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %u (%s)\n",
340                                  tdb->map_size, strerror(errno)));
341 #ifdef HAVE_INCOHERENT_MMAP
342                         tdb->ecode = TDB_ERR_IO;
343                         return -1;
344 #endif
345                 }
346         } else {
347                 tdb->map_ptr = NULL;
348         }
349 #else
350         tdb->map_ptr = NULL;
351 #endif
352         return 0;
353 }
354
355 /* expand a file.  we prefer to use ftruncate, as that is what posix
356   says to use for mmap expansion */
357 static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition)
358 {
359         char buf[8192];
360         tdb_off_t new_size;
361
362         if (tdb->read_only || tdb->traverse_read) {
363                 tdb->ecode = TDB_ERR_RDONLY;
364                 return -1;
365         }
366
367         if (!tdb_add_off_t(size, addition, &new_size)) {
368                 tdb->ecode = TDB_ERR_OOM;
369                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
370                         "overflow detected current size[%u] addition[%u]!\n",
371                         (unsigned)size, (unsigned)addition));
372                 errno = ENOSPC;
373                 return -1;
374         }
375
376         if (tdb_ftruncate(tdb, new_size) == -1) {
377                 char b = 0;
378                 ssize_t written = tdb_pwrite(tdb, &b, 1, new_size - 1);
379                 if (written == 0) {
380                         /* try once more, potentially revealing errno */
381                         written = tdb_pwrite(tdb, &b, 1, new_size - 1);
382                 }
383                 if (written == 0) {
384                         /* again - give up, guessing errno */
385                         errno = ENOSPC;
386                 }
387                 if (written != 1) {
388                         tdb->ecode = TDB_ERR_OOM;
389                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %u failed (%s)\n",
390                                  (unsigned)new_size, strerror(errno)));
391                         return -1;
392                 }
393         }
394
395         /* now fill the file with something. This ensures that the
396            file isn't sparse, which would be very bad if we ran out of
397            disk. This must be done with write, not via mmap */
398         memset(buf, TDB_PAD_BYTE, sizeof(buf));
399         while (addition) {
400                 size_t n = addition>sizeof(buf)?sizeof(buf):addition;
401                 ssize_t written = tdb_pwrite(tdb, buf, n, size);
402                 if (written == 0) {
403                         /* prevent infinite loops: try _once_ more */
404                         written = tdb_pwrite(tdb, buf, n, size);
405                 }
406                 if (written == 0) {
407                         /* give up, trying to provide a useful errno */
408                         tdb->ecode = TDB_ERR_OOM;
409                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
410                                 "returned 0 twice: giving up!\n"));
411                         errno = ENOSPC;
412                         return -1;
413                 }
414                 if (written == -1) {
415                         tdb->ecode = TDB_ERR_OOM;
416                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of "
417                                  "%u bytes failed (%s)\n", (int)n,
418                                  strerror(errno)));
419                         return -1;
420                 }
421                 if (written != n) {
422                         TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: wrote "
423                                  "only %zu of %zi bytes - retrying\n", written,
424                                  n));
425                 }
426                 addition -= written;
427                 size += written;
428         }
429         return 0;
430 }
431
432
433 /* You need 'size', this tells you how much you should expand by. */
434 tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size, int page_size)
435 {
436         tdb_off_t new_size, top_size, increment;
437         tdb_off_t max_size = UINT32_MAX - map_size;
438
439         if (size > max_size) {
440                 /*
441                  * We can't round up anymore, just give back
442                  * what we're asked for.
443                  *
444                  * The caller has to take care of the ENOSPC handling.
445                  */
446                 return size;
447         }
448
449         /* limit size in order to avoid using up huge amounts of memory for
450          * in memory tdbs if an oddball huge record creeps in */
451         if (size > 100 * 1024) {
452                 increment = size * 2;
453         } else {
454                 increment = size * 100;
455         }
456         if (increment < size) {
457                 goto overflow;
458         }
459
460         if (!tdb_add_off_t(map_size, increment, &top_size)) {
461                 goto overflow;
462         }
463
464         /* always make room for at least top_size more records, and at
465            least 25% more space. if the DB is smaller than 100MiB,
466            otherwise grow it by 10% only. */
467         if (map_size > 100 * 1024 * 1024) {
468                 new_size = map_size * 1.10;
469         } else {
470                 new_size = map_size * 1.25;
471         }
472         if (new_size < map_size) {
473                 goto overflow;
474         }
475
476         /* Round the database up to a multiple of the page size */
477         new_size = MAX(top_size, new_size);
478
479         if (new_size + page_size < new_size) {
480                 /* There's a "+" in TDB_ALIGN that might overflow... */
481                 goto overflow;
482         }
483
484         return TDB_ALIGN(new_size, page_size) - map_size;
485
486 overflow:
487         /*
488          * Somewhere in between we went over 4GB. Make one big jump to
489          * exactly 4GB database size.
490          */
491         return max_size;
492 }
493
494 /* expand the database at least size bytes by expanding the underlying
495    file and doing the mmap again if necessary */
496 int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
497 {
498         struct tdb_record rec;
499         tdb_off_t offset;
500         tdb_off_t new_size;
501
502         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
503                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
504                 return -1;
505         }
506
507         /* must know about any previous expansions by another process */
508         tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
509
510         /*
511          * Note: that we don't care about tdb->hdr_ofs != 0 here
512          *
513          * The 4GB limitation is just related to tdb->map_size
514          * and the offset calculation in the records.
515          *
516          * The file on disk can be up to 4GB + tdb->hdr_ofs
517          */
518         size = tdb_expand_adjust(tdb->map_size, size, tdb->page_size);
519
520         if (!tdb_add_off_t(tdb->map_size, size, &new_size)) {
521                 tdb->ecode = TDB_ERR_OOM;
522                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_expand "
523                         "overflow detected current map_size[%u] size[%u]!\n",
524                         (unsigned)tdb->map_size, (unsigned)size));
525                 goto fail;
526         }
527
528         /* form a new freelist record */
529         offset = tdb->map_size;
530         memset(&rec,'\0',sizeof(rec));
531         rec.rec_len = size - sizeof(rec);
532
533         if (tdb->flags & TDB_INTERNAL) {
534                 char *new_map_ptr;
535
536                 new_map_ptr = (char *)realloc(tdb->map_ptr, new_size);
537                 if (!new_map_ptr) {
538                         tdb->ecode = TDB_ERR_OOM;
539                         goto fail;
540                 }
541                 tdb->map_ptr = new_map_ptr;
542                 tdb->map_size = new_size;
543         } else {
544                 int ret;
545
546                 /*
547                  * expand the file itself
548                  */
549                 ret = tdb->methods->tdb_expand_file(tdb, tdb->map_size, size);
550                 if (ret != 0) {
551                         goto fail;
552                 }
553
554                 /* Explicitly remap: if we're in a transaction, this won't
555                  * happen automatically! */
556                 tdb_munmap(tdb);
557                 tdb->map_size = new_size;
558                 if (tdb_mmap(tdb) != 0) {
559                         goto fail;
560                 }
561         }
562
563         /* link it into the free list */
564         if (tdb_free(tdb, offset, &rec) == -1)
565                 goto fail;
566
567         tdb_unlock(tdb, -1, F_WRLCK);
568         return 0;
569  fail:
570         tdb_unlock(tdb, -1, F_WRLCK);
571         return -1;
572 }
573
574 /* read/write a tdb_off_t */
575 int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
576 {
577         return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
578 }
579
580 int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
581 {
582         tdb_off_t off = *d;
583         return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
584 }
585
586
587 /* read a lump of data, allocating the space for it */
588 unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
589 {
590         unsigned char *buf;
591
592         /* some systems don't like zero length malloc */
593
594         if (!(buf = (unsigned char *)malloc(len ? len : 1))) {
595                 /* Ensure ecode is set for log fn. */
596                 tdb->ecode = TDB_ERR_OOM;
597                 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%u (%s)\n",
598                            len, strerror(errno)));
599                 return NULL;
600         }
601         if (tdb->methods->tdb_read(tdb, offset, buf, len, 0) == -1) {
602                 SAFE_FREE(buf);
603                 return NULL;
604         }
605         return buf;
606 }
607
608 /* Give a piece of tdb data to a parser */
609
610 int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
611                    tdb_off_t offset, tdb_len_t len,
612                    int (*parser)(TDB_DATA key, TDB_DATA data,
613                                  void *private_data),
614                    void *private_data)
615 {
616         TDB_DATA data;
617         int result;
618
619         data.dsize = len;
620
621         if ((tdb->transaction == NULL) && (tdb->map_ptr != NULL)) {
622                 /*
623                  * Optimize by avoiding the malloc/memcpy/free, point the
624                  * parser directly at the mmap area.
625                  */
626                 if (tdb->methods->tdb_oob(tdb, offset, len, 0) != 0) {
627                         return -1;
628                 }
629                 data.dptr = offset + (unsigned char *)tdb->map_ptr;
630                 return parser(key, data, private_data);
631         }
632
633         if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
634                 return -1;
635         }
636
637         result = parser(key, data, private_data);
638         free(data.dptr);
639         return result;
640 }
641
642 /* read/write a record */
643 int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
644 {
645         if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
646                 return -1;
647         if (TDB_BAD_MAGIC(rec)) {
648                 /* Ensure ecode is set for log fn. */
649                 tdb->ecode = TDB_ERR_CORRUPT;
650                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%u\n", rec->magic, offset));
651                 return -1;
652         }
653         return tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 0);
654 }
655
656 int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
657 {
658         struct tdb_record r = *rec;
659         return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
660 }
661
/*
 * Method table built from the I/O functions of this file.
 * tdb_io_init() installs it as tdb->methods. The initializers are
 * positional, so their order has to follow the member order of
 * struct tdb_methods.
 */
static const struct tdb_methods io_methods = {
        tdb_read,
        tdb_write,
        tdb_next_hash_chain,
        tdb_oob,
        tdb_expand_file,
};
669
670 /*
671   initialise the default methods table
672 */
673 void tdb_io_init(struct tdb_context *tdb)
674 {
675         tdb->methods = &io_methods;
676 }