e1518062b1b8d09cd51201b5525bec74725bcbc5
[kai/samba-autobuild/.git] / lib / ntdb / io.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the ntdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void ntdb_munmap(struct ntdb_file *file)
33 {
34         if (file->fd == -1)
35                 return;
36
37         if (file->map_ptr) {
38                 munmap(file->map_ptr, file->map_size);
39                 file->map_ptr = NULL;
40         }
41 }
42
43 enum NTDB_ERROR ntdb_mmap(struct ntdb_context *ntdb)
44 {
45         int mmap_flags;
46
47         if (ntdb->flags & NTDB_INTERNAL)
48                 return NTDB_SUCCESS;
49
50 #ifndef HAVE_INCOHERENT_MMAP
51         if (ntdb->flags & NTDB_NOMMAP)
52                 return NTDB_SUCCESS;
53 #endif
54
55         if ((ntdb->open_flags & O_ACCMODE) == O_RDONLY)
56                 mmap_flags = PROT_READ;
57         else
58                 mmap_flags = PROT_READ | PROT_WRITE;
59
60         /* size_t can be smaller than off_t. */
61         if ((size_t)ntdb->file->map_size == ntdb->file->map_size) {
62                 ntdb->file->map_ptr = mmap(NULL, ntdb->file->map_size,
63                                           mmap_flags,
64                                           MAP_SHARED, ntdb->file->fd, 0);
65         } else
66                 ntdb->file->map_ptr = MAP_FAILED;
67
68         /*
69          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
70          */
71         if (ntdb->file->map_ptr == MAP_FAILED) {
72                 ntdb->file->map_ptr = NULL;
73 #ifdef HAVE_INCOHERENT_MMAP
74                 /* Incoherent mmap means everyone must mmap! */
75                 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
76                                   "ntdb_mmap failed for size %lld (%s)",
77                                   (long long)ntdb->file->map_size,
78                                   strerror(errno));
79 #else
80                 ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
81                            "ntdb_mmap failed for size %lld (%s)",
82                            (long long)ntdb->file->map_size, strerror(errno));
83 #endif
84         }
85         return NTDB_SUCCESS;
86 }
87
88 /* check for an out of bounds access - if it is out of bounds then
89    see if the database has been expanded by someone else and expand
90    if necessary
91    note that "len" is the minimum length needed for the db.
92
93    If probe is true, len being too large isn't a failure.
94 */
95 static enum NTDB_ERROR ntdb_oob(struct ntdb_context *ntdb,
96                               ntdb_off_t off, ntdb_len_t len, bool probe)
97 {
98         struct stat st;
99         enum NTDB_ERROR ecode;
100
101         /* We can't hold pointers during this: we could unmap! */
102         assert(!ntdb->direct_access
103                || (ntdb->flags & NTDB_NOLOCK)
104                || ntdb_has_expansion_lock(ntdb));
105
106         if (len + off < len) {
107                 if (probe)
108                         return NTDB_SUCCESS;
109
110                 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
111                                   "ntdb_oob off %llu len %llu wrap\n",
112                                   (long long)off, (long long)len);
113         }
114
115         if (len + off <= ntdb->file->map_size)
116                 return NTDB_SUCCESS;
117         if (ntdb->flags & NTDB_INTERNAL) {
118                 if (probe)
119                         return NTDB_SUCCESS;
120
121                 ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
122                            "ntdb_oob len %lld beyond internal"
123                            " malloc size %lld",
124                            (long long)(off + len),
125                            (long long)ntdb->file->map_size);
126                 return NTDB_ERR_IO;
127         }
128
129         ecode = ntdb_lock_expand(ntdb, F_RDLCK);
130         if (ecode != NTDB_SUCCESS) {
131                 return ecode;
132         }
133
134         if (fstat(ntdb->file->fd, &st) != 0) {
135                 ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
136                            "Failed to fstat file: %s", strerror(errno));
137                 ntdb_unlock_expand(ntdb, F_RDLCK);
138                 return NTDB_ERR_IO;
139         }
140
141         ntdb_unlock_expand(ntdb, F_RDLCK);
142
143         if (st.st_size < off + len) {
144                 if (probe)
145                         return NTDB_SUCCESS;
146
147                 ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
148                            "ntdb_oob len %llu beyond eof at %llu",
149                            (long long)(off + len), (long long)st.st_size);
150                 return NTDB_ERR_IO;
151         }
152
153         /* Unmap, update size, remap */
154         ntdb_munmap(ntdb->file);
155
156         ntdb->file->map_size = st.st_size;
157         return ntdb_mmap(ntdb);
158 }
159
160 /* Endian conversion: we only ever deal with 8 byte quantities */
161 void *ntdb_convert(const struct ntdb_context *ntdb, void *buf, ntdb_len_t size)
162 {
163         assert(size % 8 == 0);
164         if (unlikely((ntdb->flags & NTDB_CONVERT)) && buf) {
165                 uint64_t i, *p = (uint64_t *)buf;
166                 for (i = 0; i < size / 8; i++)
167                         p[i] = bswap_64(p[i]);
168         }
169         return buf;
170 }
171
172 /* Return first non-zero offset in offset array, or end, or -ve error. */
173 /* FIXME: Return the off? */
174 uint64_t ntdb_find_nonzero_off(struct ntdb_context *ntdb,
175                               ntdb_off_t base, uint64_t start, uint64_t end)
176 {
177         uint64_t i;
178         const uint64_t *val;
179
180         /* Zero vs non-zero is the same unconverted: minor optimization. */
181         val = ntdb_access_read(ntdb, base + start * sizeof(ntdb_off_t),
182                               (end - start) * sizeof(ntdb_off_t), false);
183         if (NTDB_PTR_IS_ERR(val)) {
184                 return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val));
185         }
186
187         for (i = 0; i < (end - start); i++) {
188                 if (val[i])
189                         break;
190         }
191         ntdb_access_release(ntdb, val);
192         return start + i;
193 }
194
195 /* Return first zero offset in num offset array, or num, or -ve error. */
196 uint64_t ntdb_find_zero_off(struct ntdb_context *ntdb, ntdb_off_t off,
197                            uint64_t num)
198 {
199         uint64_t i;
200         const uint64_t *val;
201
202         /* Zero vs non-zero is the same unconverted: minor optimization. */
203         val = ntdb_access_read(ntdb, off, num * sizeof(ntdb_off_t), false);
204         if (NTDB_PTR_IS_ERR(val)) {
205                 return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val));
206         }
207
208         for (i = 0; i < num; i++) {
209                 if (!val[i])
210                         break;
211         }
212         ntdb_access_release(ntdb, val);
213         return i;
214 }
215
216 enum NTDB_ERROR zero_out(struct ntdb_context *ntdb, ntdb_off_t off, ntdb_len_t len)
217 {
218         char buf[8192] = { 0 };
219         void *p = ntdb->io->direct(ntdb, off, len, true);
220         enum NTDB_ERROR ecode = NTDB_SUCCESS;
221
222         assert(!(ntdb->flags & NTDB_RDONLY));
223         if (NTDB_PTR_IS_ERR(p)) {
224                 return NTDB_PTR_ERR(p);
225         }
226         if (p) {
227                 memset(p, 0, len);
228                 return ecode;
229         }
230         while (len) {
231                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
232                 ecode = ntdb->io->twrite(ntdb, off, buf, todo);
233                 if (ecode != NTDB_SUCCESS) {
234                         break;
235                 }
236                 len -= todo;
237                 off += todo;
238         }
239         return ecode;
240 }
241
242 ntdb_off_t ntdb_read_off(struct ntdb_context *ntdb, ntdb_off_t off)
243 {
244         ntdb_off_t ret;
245         enum NTDB_ERROR ecode;
246
247         if (likely(!(ntdb->flags & NTDB_CONVERT))) {
248                 ntdb_off_t *p = ntdb->io->direct(ntdb, off, sizeof(*p), false);
249                 if (NTDB_PTR_IS_ERR(p)) {
250                         return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(p));
251                 }
252                 if (p)
253                         return *p;
254         }
255
256         ecode = ntdb_read_convert(ntdb, off, &ret, sizeof(ret));
257         if (ecode != NTDB_SUCCESS) {
258                 return NTDB_ERR_TO_OFF(ecode);
259         }
260         return ret;
261 }
262
263 /* write a lump of data at a specified offset */
264 static enum NTDB_ERROR ntdb_write(struct ntdb_context *ntdb, ntdb_off_t off,
265                                 const void *buf, ntdb_len_t len)
266 {
267         enum NTDB_ERROR ecode;
268
269         if (ntdb->flags & NTDB_RDONLY) {
270                 return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
271                                   "Write to read-only database");
272         }
273
274         ecode = ntdb->io->oob(ntdb, off, len, false);
275         if (ecode != NTDB_SUCCESS) {
276                 return ecode;
277         }
278
279         if (ntdb->file->map_ptr) {
280                 memcpy(off + (char *)ntdb->file->map_ptr, buf, len);
281         } else {
282 #ifdef HAVE_INCOHERENT_MMAP
283                 return NTDB_ERR_IO;
284 #else
285                 ssize_t ret;
286                 ret = pwrite(ntdb->file->fd, buf, len, off);
287                 if (ret != len) {
288                         /* This shouldn't happen: we avoid sparse files. */
289                         if (ret >= 0)
290                                 errno = ENOSPC;
291
292                         return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
293                                           "ntdb_write: %zi at %zu len=%zu (%s)",
294                                           ret, (size_t)off, (size_t)len,
295                                           strerror(errno));
296                 }
297 #endif
298         }
299         return NTDB_SUCCESS;
300 }
301
302 /* read a lump of data at a specified offset */
303 static enum NTDB_ERROR ntdb_read(struct ntdb_context *ntdb, ntdb_off_t off,
304                                void *buf, ntdb_len_t len)
305 {
306         enum NTDB_ERROR ecode;
307
308         ecode = ntdb->io->oob(ntdb, off, len, false);
309         if (ecode != NTDB_SUCCESS) {
310                 return ecode;
311         }
312
313         if (ntdb->file->map_ptr) {
314                 memcpy(buf, off + (char *)ntdb->file->map_ptr, len);
315         } else {
316 #ifdef HAVE_INCOHERENT_MMAP
317                 return NTDB_ERR_IO;
318 #else
319                 ssize_t r = pread(ntdb->file->fd, buf, len, off);
320                 if (r != len) {
321                         return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
322                                           "ntdb_read failed with %zi at %zu "
323                                           "len=%zu (%s) map_size=%zu",
324                                           r, (size_t)off, (size_t)len,
325                                           strerror(errno),
326                                           (size_t)ntdb->file->map_size);
327                 }
328 #endif
329         }
330         return NTDB_SUCCESS;
331 }
332
333 enum NTDB_ERROR ntdb_write_convert(struct ntdb_context *ntdb, ntdb_off_t off,
334                                  const void *rec, size_t len)
335 {
336         enum NTDB_ERROR ecode;
337
338         if (unlikely((ntdb->flags & NTDB_CONVERT))) {
339                 void *conv = malloc(len);
340                 if (!conv) {
341                         return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
342                                           "ntdb_write: no memory converting"
343                                           " %zu bytes", len);
344                 }
345                 memcpy(conv, rec, len);
346                 ecode = ntdb->io->twrite(ntdb, off,
347                                         ntdb_convert(ntdb, conv, len), len);
348                 free(conv);
349         } else {
350                 ecode = ntdb->io->twrite(ntdb, off, rec, len);
351         }
352         return ecode;
353 }
354
355 enum NTDB_ERROR ntdb_read_convert(struct ntdb_context *ntdb, ntdb_off_t off,
356                                 void *rec, size_t len)
357 {
358         enum NTDB_ERROR ecode = ntdb->io->tread(ntdb, off, rec, len);
359         ntdb_convert(ntdb, rec, len);
360         return ecode;
361 }
362
363 enum NTDB_ERROR ntdb_write_off(struct ntdb_context *ntdb,
364                              ntdb_off_t off, ntdb_off_t val)
365 {
366         if (ntdb->flags & NTDB_RDONLY) {
367                 return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
368                                   "Write to read-only database");
369         }
370
371         if (likely(!(ntdb->flags & NTDB_CONVERT))) {
372                 ntdb_off_t *p = ntdb->io->direct(ntdb, off, sizeof(*p), true);
373                 if (NTDB_PTR_IS_ERR(p)) {
374                         return NTDB_PTR_ERR(p);
375                 }
376                 if (p) {
377                         *p = val;
378                         return NTDB_SUCCESS;
379                 }
380         }
381         return ntdb_write_convert(ntdb, off, &val, sizeof(val));
382 }
383
384 static void *_ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset,
385                              ntdb_len_t len, unsigned int prefix)
386 {
387         unsigned char *buf;
388         enum NTDB_ERROR ecode;
389
390         /* some systems don't like zero length malloc */
391         buf = malloc(prefix + len ? prefix + len : 1);
392         if (!buf) {
393                 ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_USE_ERROR,
394                            "ntdb_alloc_read malloc failed len=%zu",
395                            (size_t)(prefix + len));
396                 return NTDB_ERR_PTR(NTDB_ERR_OOM);
397         } else {
398                 ecode = ntdb->io->tread(ntdb, offset, buf+prefix, len);
399                 if (unlikely(ecode != NTDB_SUCCESS)) {
400                         free(buf);
401                         return NTDB_ERR_PTR(ecode);
402                 }
403         }
404         return buf;
405 }
406
407 /* read a lump of data, allocating the space for it */
408 void *ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset, ntdb_len_t len)
409 {
410         return _ntdb_alloc_read(ntdb, offset, len, 0);
411 }
412
413 static enum NTDB_ERROR fill(struct ntdb_context *ntdb,
414                            const void *buf, size_t size,
415                            ntdb_off_t off, ntdb_len_t len)
416 {
417         while (len) {
418                 size_t n = len > size ? size : len;
419                 ssize_t ret = pwrite(ntdb->file->fd, buf, n, off);
420                 if (ret != n) {
421                         if (ret >= 0)
422                                 errno = ENOSPC;
423
424                         return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
425                                           "fill failed:"
426                                           " %zi at %zu len=%zu (%s)",
427                                           ret, (size_t)off, (size_t)len,
428                                           strerror(errno));
429                 }
430                 len -= n;
431                 off += n;
432         }
433         return NTDB_SUCCESS;
434 }
435
436 /* expand a file.  we prefer to use ftruncate, as that is what posix
437   says to use for mmap expansion */
438 static enum NTDB_ERROR ntdb_expand_file(struct ntdb_context *ntdb,
439                                       ntdb_len_t addition)
440 {
441         char buf[8192];
442         enum NTDB_ERROR ecode;
443
444         assert((ntdb->file->map_size + addition) % NTDB_PGSIZE == 0);
445         if (ntdb->flags & NTDB_RDONLY) {
446                 return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
447                                   "Expand on read-only database");
448         }
449
450         if (ntdb->flags & NTDB_INTERNAL) {
451                 char *new = realloc(ntdb->file->map_ptr,
452                                     ntdb->file->map_size + addition);
453                 if (!new) {
454                         return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
455                                           "No memory to expand database");
456                 }
457                 ntdb->file->map_ptr = new;
458                 ntdb->file->map_size += addition;
459                 return NTDB_SUCCESS;
460         } else {
461                 /* Unmap before trying to write; old NTDB claimed OpenBSD had
462                  * problem with this otherwise. */
463                 ntdb_munmap(ntdb->file);
464
465                 /* If this fails, we try to fill anyway. */
466                 if (ftruncate(ntdb->file->fd, ntdb->file->map_size + addition))
467                         ;
468
469                 /* now fill the file with something. This ensures that the
470                    file isn't sparse, which would be very bad if we ran out of
471                    disk. This must be done with write, not via mmap */
472                 memset(buf, 0x43, sizeof(buf));
473                 ecode = fill(ntdb, buf, sizeof(buf), ntdb->file->map_size,
474                              addition);
475                 if (ecode != NTDB_SUCCESS)
476                         return ecode;
477                 ntdb->file->map_size += addition;
478                 return ntdb_mmap(ntdb);
479         }
480 }
481
482 const void *ntdb_access_read(struct ntdb_context *ntdb,
483                             ntdb_off_t off, ntdb_len_t len, bool convert)
484 {
485         void *ret = NULL;
486
487         if (likely(!(ntdb->flags & NTDB_CONVERT))) {
488                 ret = ntdb->io->direct(ntdb, off, len, false);
489
490                 if (NTDB_PTR_IS_ERR(ret)) {
491                         return ret;
492                 }
493         }
494         if (!ret) {
495                 struct ntdb_access_hdr *hdr;
496                 hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
497                 if (NTDB_PTR_IS_ERR(hdr)) {
498                         return hdr;
499                 }
500                 hdr->next = ntdb->access;
501                 ntdb->access = hdr;
502                 ret = hdr + 1;
503                 if (convert) {
504                         ntdb_convert(ntdb, (void *)ret, len);
505                 }
506         } else
507                 ntdb->direct_access++;
508
509         return ret;
510 }
511
512 void *ntdb_access_write(struct ntdb_context *ntdb,
513                        ntdb_off_t off, ntdb_len_t len, bool convert)
514 {
515         void *ret = NULL;
516
517         if (ntdb->flags & NTDB_RDONLY) {
518                 ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
519                            "Write to read-only database");
520                 return NTDB_ERR_PTR(NTDB_ERR_RDONLY);
521         }
522
523         if (likely(!(ntdb->flags & NTDB_CONVERT))) {
524                 ret = ntdb->io->direct(ntdb, off, len, true);
525
526                 if (NTDB_PTR_IS_ERR(ret)) {
527                         return ret;
528                 }
529         }
530
531         if (!ret) {
532                 struct ntdb_access_hdr *hdr;
533                 hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
534                 if (NTDB_PTR_IS_ERR(hdr)) {
535                         return hdr;
536                 }
537                 hdr->next = ntdb->access;
538                 ntdb->access = hdr;
539                 hdr->off = off;
540                 hdr->len = len;
541                 hdr->convert = convert;
542                 ret = hdr + 1;
543                 if (convert)
544                         ntdb_convert(ntdb, (void *)ret, len);
545         } else
546                 ntdb->direct_access++;
547
548         return ret;
549 }
550
551 static struct ntdb_access_hdr **find_hdr(struct ntdb_context *ntdb, const void *p)
552 {
553         struct ntdb_access_hdr **hp;
554
555         for (hp = &ntdb->access; *hp; hp = &(*hp)->next) {
556                 if (*hp + 1 == p)
557                         return hp;
558         }
559         return NULL;
560 }
561
562 void ntdb_access_release(struct ntdb_context *ntdb, const void *p)
563 {
564         struct ntdb_access_hdr *hdr, **hp = find_hdr(ntdb, p);
565
566         if (hp) {
567                 hdr = *hp;
568                 *hp = hdr->next;
569                 free(hdr);
570         } else
571                 ntdb->direct_access--;
572 }
573
574 enum NTDB_ERROR ntdb_access_commit(struct ntdb_context *ntdb, void *p)
575 {
576         struct ntdb_access_hdr *hdr, **hp = find_hdr(ntdb, p);
577         enum NTDB_ERROR ecode;
578
579         if (hp) {
580                 hdr = *hp;
581                 if (hdr->convert)
582                         ecode = ntdb_write_convert(ntdb, hdr->off, p, hdr->len);
583                 else
584                         ecode = ntdb_write(ntdb, hdr->off, p, hdr->len);
585                 *hp = hdr->next;
586                 free(hdr);
587         } else {
588                 ntdb->direct_access--;
589                 ecode = NTDB_SUCCESS;
590         }
591
592         return ecode;
593 }
594
595 static void *ntdb_direct(struct ntdb_context *ntdb, ntdb_off_t off, size_t len,
596                         bool write_mode)
597 {
598         enum NTDB_ERROR ecode;
599
600         if (unlikely(!ntdb->file->map_ptr))
601                 return NULL;
602
603         ecode = ntdb_oob(ntdb, off, len, false);
604         if (unlikely(ecode != NTDB_SUCCESS))
605                 return NTDB_ERR_PTR(ecode);
606         return (char *)ntdb->file->map_ptr + off;
607 }
608
609 void ntdb_inc_seqnum(struct ntdb_context *ntdb)
610 {
611         ntdb_off_t seq;
612
613         if (likely(!(ntdb->flags & NTDB_CONVERT))) {
614                 int64_t *direct;
615
616                 direct = ntdb->io->direct(ntdb,
617                                          offsetof(struct ntdb_header, seqnum),
618                                          sizeof(*direct), true);
619                 if (likely(direct)) {
620                         /* Don't let it go negative, even briefly */
621                         if (unlikely((*direct) + 1) < 0)
622                                 *direct = 0;
623                         (*direct)++;
624                         return;
625                 }
626         }
627
628         seq = ntdb_read_off(ntdb, offsetof(struct ntdb_header, seqnum));
629         if (!NTDB_OFF_IS_ERR(seq)) {
630                 seq++;
631                 if (unlikely((int64_t)seq < 0))
632                         seq = 0;
633                 ntdb_write_off(ntdb, offsetof(struct ntdb_header, seqnum), seq);
634         }
635 }
636
/* Default I/O method table, installed by ntdb_io_init().  The initializer
 * is positional, so the entries must stay in the member order of
 * struct ntdb_methods (declared outside this file). */
static const struct ntdb_methods io_methods = {
        ntdb_read,              /* read a lump of data at an offset */
        ntdb_write,             /* write a lump of data at an offset */
        ntdb_oob,               /* out of bounds check, remapping if needed */
        ntdb_expand_file,       /* grow the file or internal buffer */
        ntdb_direct,            /* pointer into the mapping, if mapped */
};
644
/*
  initialise the default methods table: point ntdb->io at this file's
  static io_methods table
*/
void ntdb_io_init(struct ntdb_context *ntdb)
{
        ntdb->io = &io_methods;
}