tdb: Protect against EINTR
[sfrench/samba-autobuild/.git] / lib / tdb / common / io.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9
10      ** NOTE! The following LGPL license applies to the tdb
11      ** library. This does NOT imply that all of Samba is released
12      ** under the LGPL
13
14    This library is free software; you can redistribute it and/or
15    modify it under the terms of the GNU Lesser General Public
16    License as published by the Free Software Foundation; either
17    version 3 of the License, or (at your option) any later version.
18
19    This library is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22    Lesser General Public License for more details.
23
24    You should have received a copy of the GNU Lesser General Public
25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
26 */
27
28
29 #include "tdb_private.h"
30
31 /*
32  * We prepend the mutex area, so fixup offsets. See mutex.c for details.
33  * tdb->hdr_ofs is 0 or header.mutex_size.
34  *
35  * Note: that we only have the 4GB limit of tdb_off_t for
36  * tdb->map_size. The file size on disk can be 4GB + tdb->hdr_ofs!
37  */
38
39 static bool tdb_adjust_offset(struct tdb_context *tdb, off_t *off)
40 {
41         off_t tmp = tdb->hdr_ofs + *off;
42
43         if ((tmp < tdb->hdr_ofs) || (tmp < *off)) {
44                 errno = EIO;
45                 return false;
46         }
47
48         *off = tmp;
49         return true;
50 }
51
52 static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf,
53                           size_t count, off_t offset)
54 {
55         ssize_t ret;
56
57         if (!tdb_adjust_offset(tdb, &offset)) {
58                 return -1;
59         }
60
61         do {
62                 ret = pwrite(tdb->fd, buf, count, offset);
63         } while ((ret == -1) && (errno == EINTR));
64
65         return ret;
66 }
67
68 static ssize_t tdb_pread(struct tdb_context *tdb, void *buf,
69                          size_t count, off_t offset)
70 {
71         ssize_t ret;
72
73         if (!tdb_adjust_offset(tdb, &offset)) {
74                 return -1;
75         }
76
77         do {
78                 ret = pread(tdb->fd, buf, count, offset);
79         } while ((ret == -1) && (errno == EINTR));
80
81         return ret;
82 }
83
84 static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
85 {
86         ssize_t ret;
87
88         if (!tdb_adjust_offset(tdb, &length)) {
89                 return -1;
90         }
91
92         do {
93                 ret = ftruncate(tdb->fd, length);
94         } while ((ret == -1) && (errno == EINTR));
95
96         return ret;
97 }
98
99 static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
100 {
101         int ret;
102
103         ret = fstat(tdb->fd, buf);
104         if (ret == -1) {
105                 return -1;
106         }
107
108         if (buf->st_size < tdb->hdr_ofs) {
109                 errno = EIO;
110                 return -1;
111         }
112         buf->st_size -= tdb->hdr_ofs;
113
114         return ret;
115 }
116
117 /* check for an out of bounds access - if it is out of bounds then
118    see if the database has been expanded by someone else and expand
119    if necessary
120 */
121 static int tdb_oob(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len,
122                    int probe)
123 {
124         struct stat st;
125         if (len + off < len) {
126                 if (!probe) {
127                         /* Ensure ecode is set for log fn. */
128                         tdb->ecode = TDB_ERR_IO;
129                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob off %u len %u wrap\n",
130                                  off, len));
131                 }
132                 return -1;
133         }
134
135         if (off + len <= tdb->map_size)
136                 return 0;
137         if (tdb->flags & TDB_INTERNAL) {
138                 if (!probe) {
139                         /* Ensure ecode is set for log fn. */
140                         tdb->ecode = TDB_ERR_IO;
141                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond internal malloc size %u\n",
142                                  (int)(off + len), (int)tdb->map_size));
143                 }
144                 return -1;
145         }
146
147         if (tdb_fstat(tdb, &st) == -1) {
148                 tdb->ecode = TDB_ERR_IO;
149                 return -1;
150         }
151
152         /* Beware >4G files! */
153         if ((tdb_off_t)st.st_size != st.st_size) {
154                 /* Ensure ecode is set for log fn. */
155                 tdb->ecode = TDB_ERR_IO;
156                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_oob len %llu too large!\n",
157                          (long long)st.st_size));
158                 return -1;
159         }
160
161         /* Unmap, update size, remap.  We do this unconditionally, to handle
162          * the unusual case where the db is truncated.
163          *
164          * This can happen to a child using tdb_reopen_all(true) on a
165          * TDB_CLEAR_IF_FIRST tdb whose parent crashes: the next
166          * opener will truncate the database. */
167         if (tdb_munmap(tdb) == -1) {
168                 tdb->ecode = TDB_ERR_IO;
169                 return -1;
170         }
171         tdb->map_size = st.st_size;
172         if (tdb_mmap(tdb) != 0) {
173                 return -1;
174         }
175
176         if (st.st_size < (size_t)off + len) {
177                 if (!probe) {
178                         /* Ensure ecode is set for log fn. */
179                         tdb->ecode = TDB_ERR_IO;
180                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond eof at %u\n",
181                                  (int)(off + len), (int)st.st_size));
182                 }
183                 return -1;
184         }
185         return 0;
186 }
187
188 /* write a lump of data at a specified offset */
189 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
190                      const void *buf, tdb_len_t len)
191 {
192         if (len == 0) {
193                 return 0;
194         }
195
196         if (tdb->read_only || tdb->traverse_read) {
197                 tdb->ecode = TDB_ERR_RDONLY;
198                 return -1;
199         }
200
201         if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0)
202                 return -1;
203
204         if (tdb->map_ptr) {
205                 memcpy(off + (char *)tdb->map_ptr, buf, len);
206         } else {
207 #ifdef HAVE_INCOHERENT_MMAP
208                 tdb->ecode = TDB_ERR_IO;
209                 return -1;
210 #else
211                 ssize_t written;
212
213                 written = tdb_pwrite(tdb, buf, len, off);
214
215                 if ((written != (ssize_t)len) && (written != -1)) {
216                         /* try once more */
217                         tdb->ecode = TDB_ERR_IO;
218                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
219                                  "%zi of %u bytes at %u, trying once more\n",
220                                  written, len, off));
221                         written = tdb_pwrite(tdb, (const char *)buf+written,
222                                              len-written, off+written);
223                 }
224                 if (written == -1) {
225                         /* Ensure ecode is set for log fn. */
226                         tdb->ecode = TDB_ERR_IO;
227                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %u "
228                                  "len=%u (%s)\n", off, len, strerror(errno)));
229                         return -1;
230                 } else if (written != (ssize_t)len) {
231                         tdb->ecode = TDB_ERR_IO;
232                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
233                                  "write %u bytes at %u in two attempts\n",
234                                  len, off));
235                         return -1;
236                 }
237 #endif
238         }
239         return 0;
240 }
241
/*
 * Endian conversion: we only ever deal with 4 byte quantities.
 *
 * Applies TDB_BYTEREV in place to each of the size / 4 leading 32-bit
 * words of buf (any trailing size % 4 bytes are left alone) and returns
 * buf.
 */
void *tdb_convert(void *buf, uint32_t size)
{
        uint32_t *word = (uint32_t *)buf;
        uint32_t remaining = size / 4;

        while (remaining > 0) {
                *word = TDB_BYTEREV(*word);
                word++;
                remaining--;
        }
        return buf;
}
250
251
252 /* read a lump of data at a specified offset, maybe convert */
253 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
254                     tdb_len_t len, int cv)
255 {
256         if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0) {
257                 return -1;
258         }
259
260         if (tdb->map_ptr) {
261                 memcpy(buf, off + (char *)tdb->map_ptr, len);
262         } else {
263 #ifdef HAVE_INCOHERENT_MMAP
264                 tdb->ecode = TDB_ERR_IO;
265                 return -1;
266 #else
267                 ssize_t ret;
268
269                 ret = tdb_pread(tdb, buf, len, off);
270                 if (ret != (ssize_t)len) {
271                         /* Ensure ecode is set for log fn. */
272                         tdb->ecode = TDB_ERR_IO;
273                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %u "
274                                  "len=%u ret=%zi (%s) map_size=%u\n",
275                                  off, len, ret, strerror(errno),
276                                  tdb->map_size));
277                         return -1;
278                 }
279 #endif
280         }
281         if (cv) {
282                 tdb_convert(buf, len);
283         }
284         return 0;
285 }
286
287
288
289 /*
290   do an unlocked scan of the hash table heads to find the next non-zero head. The value
291   will then be confirmed with the lock held
292 */
293 static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
294 {
295         uint32_t h = *chain;
296         if (tdb->map_ptr) {
297                 for (;h < tdb->hash_size;h++) {
298                         if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
299                                 break;
300                         }
301                 }
302         } else {
303                 uint32_t off=0;
304                 for (;h < tdb->hash_size;h++) {
305                         if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
306                                 break;
307                         }
308                 }
309         }
310         (*chain) = h;
311 }
312
313
314 int tdb_munmap(struct tdb_context *tdb)
315 {
316         if (tdb->flags & TDB_INTERNAL)
317                 return 0;
318
319 #ifdef HAVE_MMAP
320         if (tdb->map_ptr) {
321                 int ret;
322
323                 ret = munmap(tdb->map_ptr, tdb->map_size);
324                 if (ret != 0)
325                         return ret;
326         }
327 #endif
328         tdb->map_ptr = NULL;
329         return 0;
330 }
331
332 /* If mmap isn't coherent, *everyone* must always mmap. */
333 static bool should_mmap(const struct tdb_context *tdb)
334 {
335 #ifdef HAVE_INCOHERENT_MMAP
336         return true;
337 #else
338         return !(tdb->flags & TDB_NOMMAP);
339 #endif
340 }
341
342 int tdb_mmap(struct tdb_context *tdb)
343 {
344         if (tdb->flags & TDB_INTERNAL)
345                 return 0;
346
347 #ifdef HAVE_MMAP
348         if (should_mmap(tdb)) {
349                 tdb->map_ptr = mmap(NULL, tdb->map_size,
350                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
351                                     MAP_SHARED|MAP_FILE, tdb->fd,
352                                     tdb->hdr_ofs);
353
354                 /*
355                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
356                  */
357
358                 if (tdb->map_ptr == MAP_FAILED) {
359                         tdb->map_ptr = NULL;
360                         TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %u (%s)\n",
361                                  tdb->map_size, strerror(errno)));
362 #ifdef HAVE_INCOHERENT_MMAP
363                         tdb->ecode = TDB_ERR_IO;
364                         return -1;
365 #endif
366                 }
367         } else {
368                 tdb->map_ptr = NULL;
369         }
370 #else
371         tdb->map_ptr = NULL;
372 #endif
373         return 0;
374 }
375
376 /* expand a file.  we prefer to use ftruncate, as that is what posix
377   says to use for mmap expansion */
378 static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition)
379 {
380         char buf[8192];
381         tdb_off_t new_size;
382
383         if (tdb->read_only || tdb->traverse_read) {
384                 tdb->ecode = TDB_ERR_RDONLY;
385                 return -1;
386         }
387
388         if (!tdb_add_off_t(size, addition, &new_size)) {
389                 tdb->ecode = TDB_ERR_OOM;
390                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
391                         "overflow detected current size[%u] addition[%u]!\n",
392                         (unsigned)size, (unsigned)addition));
393                 errno = ENOSPC;
394                 return -1;
395         }
396
397         if (tdb_ftruncate(tdb, new_size) == -1) {
398                 char b = 0;
399                 ssize_t written = tdb_pwrite(tdb, &b, 1, new_size - 1);
400                 if (written == 0) {
401                         /* try once more, potentially revealing errno */
402                         written = tdb_pwrite(tdb, &b, 1, new_size - 1);
403                 }
404                 if (written == 0) {
405                         /* again - give up, guessing errno */
406                         errno = ENOSPC;
407                 }
408                 if (written != 1) {
409                         tdb->ecode = TDB_ERR_OOM;
410                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %u failed (%s)\n",
411                                  (unsigned)new_size, strerror(errno)));
412                         return -1;
413                 }
414         }
415
416         /* now fill the file with something. This ensures that the
417            file isn't sparse, which would be very bad if we ran out of
418            disk. This must be done with write, not via mmap */
419         memset(buf, TDB_PAD_BYTE, sizeof(buf));
420         while (addition) {
421                 size_t n = addition>sizeof(buf)?sizeof(buf):addition;
422                 ssize_t written = tdb_pwrite(tdb, buf, n, size);
423                 if (written == 0) {
424                         /* prevent infinite loops: try _once_ more */
425                         written = tdb_pwrite(tdb, buf, n, size);
426                 }
427                 if (written == 0) {
428                         /* give up, trying to provide a useful errno */
429                         tdb->ecode = TDB_ERR_OOM;
430                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
431                                 "returned 0 twice: giving up!\n"));
432                         errno = ENOSPC;
433                         return -1;
434                 }
435                 if (written == -1) {
436                         tdb->ecode = TDB_ERR_OOM;
437                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of "
438                                  "%u bytes failed (%s)\n", (int)n,
439                                  strerror(errno)));
440                         return -1;
441                 }
442                 if (written != n) {
443                         TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: wrote "
444                                  "only %zu of %zi bytes - retrying\n", written,
445                                  n));
446                 }
447                 addition -= written;
448                 size += written;
449         }
450         return 0;
451 }
452
453
454 /* You need 'size', this tells you how much you should expand by. */
455 tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size, int page_size)
456 {
457         tdb_off_t new_size, top_size, increment;
458         tdb_off_t max_size = UINT32_MAX - map_size;
459
460         if (size > max_size) {
461                 /*
462                  * We can't round up anymore, just give back
463                  * what we're asked for.
464                  *
465                  * The caller has to take care of the ENOSPC handling.
466                  */
467                 return size;
468         }
469
470         /* limit size in order to avoid using up huge amounts of memory for
471          * in memory tdbs if an oddball huge record creeps in */
472         if (size > 100 * 1024) {
473                 increment = size * 2;
474         } else {
475                 increment = size * 100;
476         }
477         if (increment < size) {
478                 goto overflow;
479         }
480
481         if (!tdb_add_off_t(map_size, increment, &top_size)) {
482                 goto overflow;
483         }
484
485         /* always make room for at least top_size more records, and at
486            least 25% more space. if the DB is smaller than 100MiB,
487            otherwise grow it by 10% only. */
488         if (map_size > 100 * 1024 * 1024) {
489                 new_size = map_size * 1.10;
490         } else {
491                 new_size = map_size * 1.25;
492         }
493         if (new_size < map_size) {
494                 goto overflow;
495         }
496
497         /* Round the database up to a multiple of the page size */
498         new_size = MAX(top_size, new_size);
499
500         if (new_size + page_size < new_size) {
501                 /* There's a "+" in TDB_ALIGN that might overflow... */
502                 goto overflow;
503         }
504
505         return TDB_ALIGN(new_size, page_size) - map_size;
506
507 overflow:
508         /*
509          * Somewhere in between we went over 4GB. Make one big jump to
510          * exactly 4GB database size.
511          */
512         return max_size;
513 }
514
515 /* expand the database at least size bytes by expanding the underlying
516    file and doing the mmap again if necessary */
517 int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
518 {
519         struct tdb_record rec;
520         tdb_off_t offset;
521         tdb_off_t new_size;
522
523         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
524                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
525                 return -1;
526         }
527
528         /* must know about any previous expansions by another process */
529         tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
530
531         /*
532          * Note: that we don't care about tdb->hdr_ofs != 0 here
533          *
534          * The 4GB limitation is just related to tdb->map_size
535          * and the offset calculation in the records.
536          *
537          * The file on disk can be up to 4GB + tdb->hdr_ofs
538          */
539         size = tdb_expand_adjust(tdb->map_size, size, tdb->page_size);
540
541         if (!tdb_add_off_t(tdb->map_size, size, &new_size)) {
542                 tdb->ecode = TDB_ERR_OOM;
543                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_expand "
544                         "overflow detected current map_size[%u] size[%u]!\n",
545                         (unsigned)tdb->map_size, (unsigned)size));
546                 goto fail;
547         }
548
549         /* form a new freelist record */
550         offset = tdb->map_size;
551         memset(&rec,'\0',sizeof(rec));
552         rec.rec_len = size - sizeof(rec);
553
554         if (tdb->flags & TDB_INTERNAL) {
555                 char *new_map_ptr;
556
557                 new_map_ptr = (char *)realloc(tdb->map_ptr, new_size);
558                 if (!new_map_ptr) {
559                         tdb->ecode = TDB_ERR_OOM;
560                         goto fail;
561                 }
562                 tdb->map_ptr = new_map_ptr;
563                 tdb->map_size = new_size;
564         } else {
565                 int ret;
566
567                 /*
568                  * expand the file itself
569                  */
570                 ret = tdb->methods->tdb_expand_file(tdb, tdb->map_size, size);
571                 if (ret != 0) {
572                         goto fail;
573                 }
574
575                 /* Explicitly remap: if we're in a transaction, this won't
576                  * happen automatically! */
577                 tdb_munmap(tdb);
578                 tdb->map_size = new_size;
579                 if (tdb_mmap(tdb) != 0) {
580                         goto fail;
581                 }
582         }
583
584         /* link it into the free list */
585         if (tdb_free(tdb, offset, &rec) == -1)
586                 goto fail;
587
588         tdb_unlock(tdb, -1, F_WRLCK);
589         return 0;
590  fail:
591         tdb_unlock(tdb, -1, F_WRLCK);
592         return -1;
593 }
594
595 /* read/write a tdb_off_t */
596 int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
597 {
598         return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
599 }
600
601 int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
602 {
603         tdb_off_t off = *d;
604         return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
605 }
606
607
608 /* read a lump of data, allocating the space for it */
609 unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
610 {
611         unsigned char *buf;
612
613         /* some systems don't like zero length malloc */
614
615         if (!(buf = (unsigned char *)malloc(len ? len : 1))) {
616                 /* Ensure ecode is set for log fn. */
617                 tdb->ecode = TDB_ERR_OOM;
618                 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%u (%s)\n",
619                            len, strerror(errno)));
620                 return NULL;
621         }
622         if (tdb->methods->tdb_read(tdb, offset, buf, len, 0) == -1) {
623                 SAFE_FREE(buf);
624                 return NULL;
625         }
626         return buf;
627 }
628
629 /* Give a piece of tdb data to a parser */
630
631 int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
632                    tdb_off_t offset, tdb_len_t len,
633                    int (*parser)(TDB_DATA key, TDB_DATA data,
634                                  void *private_data),
635                    void *private_data)
636 {
637         TDB_DATA data;
638         int result;
639
640         data.dsize = len;
641
642         if ((tdb->transaction == NULL) && (tdb->map_ptr != NULL)) {
643                 /*
644                  * Optimize by avoiding the malloc/memcpy/free, point the
645                  * parser directly at the mmap area.
646                  */
647                 if (tdb->methods->tdb_oob(tdb, offset, len, 0) != 0) {
648                         return -1;
649                 }
650                 data.dptr = offset + (unsigned char *)tdb->map_ptr;
651                 return parser(key, data, private_data);
652         }
653
654         if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
655                 return -1;
656         }
657
658         result = parser(key, data, private_data);
659         free(data.dptr);
660         return result;
661 }
662
663 /* read/write a record */
664 int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
665 {
666         if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
667                 return -1;
668         if (TDB_BAD_MAGIC(rec)) {
669                 /* Ensure ecode is set for log fn. */
670                 tdb->ecode = TDB_ERR_CORRUPT;
671                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%u\n", rec->magic, offset));
672                 return -1;
673         }
674         return tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 0);
675 }
676
677 int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
678 {
679         struct tdb_record r = *rec;
680         return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
681 }
682
683 static const struct tdb_methods io_methods = {
684         tdb_read,
685         tdb_write,
686         tdb_next_hash_chain,
687         tdb_oob,
688         tdb_expand_file,
689 };
690
691 /*
692   initialise the default methods table
693 */
694 void tdb_io_init(struct tdb_context *tdb)
695 {
696         tdb->methods = &io_methods;
697 }