2 Trivial Database 2: opening and closing TDBs
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include <ccan/build_assert/build_assert.h>
22 /* all tdbs, to detect double-opens (fcntl file locks don't nest!) */
23 static struct ntdb_context *tdbs = NULL;
25 static struct ntdb_file *find_file(dev_t device, ino_t ino)
27 struct ntdb_context *i;
29 for (i = tdbs; i; i = i->next) {
30 if (i->file->device == device && i->file->inode == ino) {
38 static bool read_all(int fd, void *buf, size_t len)
42 ret = read(fd, buf, len);
50 buf = (char *)buf + ret;
56 static uint64_t random_number(struct ntdb_context *ntdb)
62 fd = open("/dev/urandom", O_RDONLY);
64 if (read_all(fd, &ret, sizeof(ret))) {
70 /* FIXME: Untested! Based on Wikipedia protocol description! */
71 fd = open("/dev/egd-pool", O_RDWR);
73 /* Command is 1, next byte is size we want to read. */
74 char cmd[2] = { 1, sizeof(uint64_t) };
75 if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
76 char reply[1 + sizeof(uint64_t)];
77 int r = read(fd, reply, sizeof(reply));
79 /* Copy at least some bytes. */
80 memcpy(&ret, reply+1, r - 1);
81 if (reply[0] == sizeof(uint64_t)
82 && r == sizeof(reply)) {
91 /* Fallback: pid and time. */
92 gettimeofday(&now, NULL);
93 ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
94 ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
95 "ntdb_open: random from getpid and time");
99 static void ntdb_context_init(struct ntdb_context *ntdb)
101 /* Initialize the NTDB fields here */
103 ntdb->direct_access = 0;
104 ntdb->transaction = NULL;
108 struct new_database {
109 struct ntdb_header hdr;
110 struct ntdb_freetable ftable;
113 /* initialise a new database */
114 static enum NTDB_ERROR ntdb_new_database(struct ntdb_context *ntdb,
115 struct ntdb_attribute_seed *seed,
116 struct ntdb_header *hdr)
118 /* We make it up in memory, then write it out if not internal */
119 struct new_database newdb;
120 unsigned int magic_len;
122 enum NTDB_ERROR ecode;
124 /* Fill in the header */
125 newdb.hdr.version = NTDB_VERSION;
127 newdb.hdr.hash_seed = seed->seed;
129 newdb.hdr.hash_seed = random_number(ntdb);
130 newdb.hdr.hash_test = NTDB_HASH_MAGIC;
131 newdb.hdr.hash_test = ntdb->hash_fn(&newdb.hdr.hash_test,
132 sizeof(newdb.hdr.hash_test),
135 newdb.hdr.recovery = 0;
136 newdb.hdr.features_used = newdb.hdr.features_offered = NTDB_FEATURE_MASK;
137 newdb.hdr.seqnum = 0;
138 newdb.hdr.capabilities = 0;
139 memset(newdb.hdr.reserved, 0, sizeof(newdb.hdr.reserved));
140 /* Initial hashes are empty. */
141 memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
144 newdb.hdr.free_table = offsetof(struct new_database, ftable);
145 memset(&newdb.ftable, 0, sizeof(newdb.ftable));
146 ecode = set_header(NULL, &newdb.ftable.hdr, NTDB_FTABLE_MAGIC, 0,
147 sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
148 sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
150 if (ecode != NTDB_SUCCESS) {
155 memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
156 strcpy(newdb.hdr.magic_food, NTDB_MAGIC_FOOD);
158 /* This creates an endian-converted database, as if read from disk */
159 magic_len = sizeof(newdb.hdr.magic_food);
161 (char *)&newdb.hdr + magic_len, sizeof(newdb) - magic_len);
165 if (ntdb->flags & NTDB_INTERNAL) {
166 ntdb->file->map_size = sizeof(newdb);
167 ntdb->file->map_ptr = malloc(ntdb->file->map_size);
168 if (!ntdb->file->map_ptr) {
169 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
171 " failed to allocate");
173 memcpy(ntdb->file->map_ptr, &newdb, ntdb->file->map_size);
176 if (lseek(ntdb->file->fd, 0, SEEK_SET) == -1) {
177 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
179 " failed to seek: %s", strerror(errno));
182 if (ftruncate(ntdb->file->fd, 0) == -1) {
183 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
185 " failed to truncate: %s", strerror(errno));
188 rlen = write(ntdb->file->fd, &newdb, sizeof(newdb));
189 if (rlen != sizeof(newdb)) {
192 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
193 "ntdb_new_database: %zi writing header: %s",
194 rlen, strerror(errno));
199 static enum NTDB_ERROR ntdb_new_file(struct ntdb_context *ntdb)
201 ntdb->file = malloc(sizeof(*ntdb->file));
203 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
204 "ntdb_open: cannot alloc ntdb_file structure");
205 ntdb->file->num_lockrecs = 0;
206 ntdb->file->lockrecs = NULL;
207 ntdb->file->allrecord_lock.count = 0;
208 ntdb->file->refcnt = 1;
209 ntdb->file->map_ptr = NULL;
213 _PUBLIC_ enum NTDB_ERROR ntdb_set_attribute(struct ntdb_context *ntdb,
214 const union ntdb_attribute *attr)
216 switch (attr->base.attr) {
217 case NTDB_ATTRIBUTE_LOG:
218 ntdb->log_fn = attr->log.fn;
219 ntdb->log_data = attr->log.data;
221 case NTDB_ATTRIBUTE_HASH:
222 case NTDB_ATTRIBUTE_SEED:
223 case NTDB_ATTRIBUTE_OPENHOOK:
224 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
226 "ntdb_set_attribute:"
227 " cannot set %s after opening",
228 attr->base.attr == NTDB_ATTRIBUTE_HASH
229 ? "NTDB_ATTRIBUTE_HASH"
230 : attr->base.attr == NTDB_ATTRIBUTE_SEED
231 ? "NTDB_ATTRIBUTE_SEED"
232 : "NTDB_ATTRIBUTE_OPENHOOK");
233 case NTDB_ATTRIBUTE_STATS:
234 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
236 "ntdb_set_attribute:"
237 " cannot set NTDB_ATTRIBUTE_STATS");
238 case NTDB_ATTRIBUTE_FLOCK:
239 ntdb->lock_fn = attr->flock.lock;
240 ntdb->unlock_fn = attr->flock.unlock;
241 ntdb->lock_data = attr->flock.data;
244 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
246 "ntdb_set_attribute:"
247 " unknown attribute type %u",
253 _PUBLIC_ enum NTDB_ERROR ntdb_get_attribute(struct ntdb_context *ntdb,
254 union ntdb_attribute *attr)
256 switch (attr->base.attr) {
257 case NTDB_ATTRIBUTE_LOG:
259 return NTDB_ERR_NOEXIST;
260 attr->log.fn = ntdb->log_fn;
261 attr->log.data = ntdb->log_data;
263 case NTDB_ATTRIBUTE_HASH:
264 attr->hash.fn = ntdb->hash_fn;
265 attr->hash.data = ntdb->hash_data;
267 case NTDB_ATTRIBUTE_SEED:
268 attr->seed.seed = ntdb->hash_seed;
270 case NTDB_ATTRIBUTE_OPENHOOK:
272 return NTDB_ERR_NOEXIST;
273 attr->openhook.fn = ntdb->openhook;
274 attr->openhook.data = ntdb->openhook_data;
276 case NTDB_ATTRIBUTE_STATS: {
277 size_t size = attr->stats.size;
278 if (size > ntdb->stats.size)
279 size = ntdb->stats.size;
280 memcpy(&attr->stats, &ntdb->stats, size);
283 case NTDB_ATTRIBUTE_FLOCK:
284 attr->flock.lock = ntdb->lock_fn;
285 attr->flock.unlock = ntdb->unlock_fn;
286 attr->flock.data = ntdb->lock_data;
289 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
291 "ntdb_get_attribute:"
292 " unknown attribute type %u",
295 attr->base.next = NULL;
299 _PUBLIC_ void ntdb_unset_attribute(struct ntdb_context *ntdb,
300 enum ntdb_attribute_type type)
303 case NTDB_ATTRIBUTE_LOG:
306 case NTDB_ATTRIBUTE_OPENHOOK:
307 ntdb->openhook = NULL;
309 case NTDB_ATTRIBUTE_HASH:
310 case NTDB_ATTRIBUTE_SEED:
311 ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
312 "ntdb_unset_attribute: cannot unset %s after opening",
313 type == NTDB_ATTRIBUTE_HASH
314 ? "NTDB_ATTRIBUTE_HASH"
315 : "NTDB_ATTRIBUTE_SEED");
317 case NTDB_ATTRIBUTE_STATS:
318 ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
320 "ntdb_unset_attribute:"
321 "cannot unset NTDB_ATTRIBUTE_STATS");
323 case NTDB_ATTRIBUTE_FLOCK:
324 ntdb->lock_fn = ntdb_fcntl_lock;
325 ntdb->unlock_fn = ntdb_fcntl_unlock;
328 ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
330 "ntdb_unset_attribute: unknown attribute type %u",
335 /* The top three bits of the capability tell us whether it matters. */
336 enum NTDB_ERROR unknown_capability(struct ntdb_context *ntdb, const char *caller,
339 if (type & NTDB_CAP_NOOPEN) {
340 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
341 "%s: file has unknown capability %llu",
342 caller, type & NTDB_CAP_NOOPEN);
345 if ((type & NTDB_CAP_NOWRITE) && !(ntdb->flags & NTDB_RDONLY)) {
346 return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_ERROR,
347 "%s: file has unknown capability %llu"
348 " (cannot write to it)",
349 caller, type & NTDB_CAP_NOOPEN);
352 if (type & NTDB_CAP_NOCHECK) {
353 ntdb->flags |= NTDB_CANT_CHECK;
358 static enum NTDB_ERROR capabilities_ok(struct ntdb_context *ntdb,
359 ntdb_off_t capabilities)
361 ntdb_off_t off, next;
362 enum NTDB_ERROR ecode = NTDB_SUCCESS;
363 const struct ntdb_capability *cap;
365 /* Check capability list. */
366 for (off = capabilities; off && ecode == NTDB_SUCCESS; off = next) {
367 cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
368 if (NTDB_PTR_IS_ERR(cap)) {
369 return NTDB_PTR_ERR(cap);
372 switch (cap->type & NTDB_CAP_TYPE_MASK) {
373 /* We don't understand any capabilities (yet). */
375 ecode = unknown_capability(ntdb, "ntdb_open", cap->type);
378 ntdb_access_release(ntdb, cap);
383 _PUBLIC_ struct ntdb_context *ntdb_open(const char *name, int ntdb_flags,
384 int open_flags, mode_t mode,
385 union ntdb_attribute *attr)
387 struct ntdb_context *ntdb;
393 struct ntdb_header hdr;
394 struct ntdb_attribute_seed *seed = NULL;
396 enum NTDB_ERROR ecode;
399 ntdb = malloc(sizeof(*ntdb) + (name ? strlen(name) + 1 : 0));
405 /* Set name immediately for logging functions. */
407 ntdb->name = strcpy((char *)(ntdb + 1), name);
411 ntdb->flags = ntdb_flags;
413 ntdb->open_flags = open_flags;
415 ntdb->openhook = NULL;
416 ntdb->lock_fn = ntdb_fcntl_lock;
417 ntdb->unlock_fn = ntdb_fcntl_unlock;
418 ntdb->hash_fn = ntdb_jenkins_hash;
419 memset(&ntdb->stats, 0, sizeof(ntdb->stats));
420 ntdb->stats.base.attr = NTDB_ATTRIBUTE_STATS;
421 ntdb->stats.size = sizeof(ntdb->stats);
424 switch (attr->base.attr) {
425 case NTDB_ATTRIBUTE_HASH:
426 ntdb->hash_fn = attr->hash.fn;
427 ntdb->hash_data = attr->hash.data;
429 case NTDB_ATTRIBUTE_SEED:
432 case NTDB_ATTRIBUTE_OPENHOOK:
433 ntdb->openhook = attr->openhook.fn;
434 ntdb->openhook_data = attr->openhook.data;
437 /* These are set as normal. */
438 ecode = ntdb_set_attribute(ntdb, attr);
439 if (ecode != NTDB_SUCCESS)
442 attr = attr->base.next;
445 if (ntdb_flags & ~(NTDB_INTERNAL | NTDB_NOLOCK | NTDB_NOMMAP | NTDB_CONVERT
446 | NTDB_NOSYNC | NTDB_SEQNUM | NTDB_ALLOW_NESTING
448 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
449 "ntdb_open: unknown flags %u", ntdb_flags);
454 if (!(ntdb_flags & NTDB_INTERNAL) && !(open_flags & O_CREAT)) {
455 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
458 " cannot set NTDB_ATTRIBUTE_SEED"
459 " without O_CREAT.");
464 if ((open_flags & O_ACCMODE) == O_WRONLY) {
465 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
466 "ntdb_open: can't open ntdb %s write-only",
471 if ((open_flags & O_ACCMODE) == O_RDONLY) {
473 ntdb->flags |= NTDB_RDONLY;
475 if (ntdb_flags & NTDB_RDONLY) {
476 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
478 "ntdb_open: can't use NTDB_RDONLY"
479 " without O_RDONLY");
485 /* internal databases don't need any of the rest. */
486 if (ntdb->flags & NTDB_INTERNAL) {
487 ntdb->flags |= (NTDB_NOLOCK | NTDB_NOMMAP);
488 ecode = ntdb_new_file(ntdb);
489 if (ecode != NTDB_SUCCESS) {
493 ecode = ntdb_new_database(ntdb, seed, &hdr);
494 if (ecode == NTDB_SUCCESS) {
495 ntdb_convert(ntdb, &hdr.hash_seed,
496 sizeof(hdr.hash_seed));
497 ntdb->hash_seed = hdr.hash_seed;
498 ntdb_context_init(ntdb);
499 ntdb_ftable_init(ntdb);
501 if (ecode != NTDB_SUCCESS) {
507 if (stat(name, &st) != -1)
508 ntdb->file = find_file(st.st_dev, st.st_ino);
513 if ((fd = open(name, open_flags, mode)) == -1) {
514 /* errno set by open(2) */
516 ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
517 "ntdb_open: could not open file %s: %s",
518 name, strerror(errno));
522 /* on exec, don't inherit the fd */
523 v = fcntl(fd, F_GETFD, 0);
524 fcntl(fd, F_SETFD, v | FD_CLOEXEC);
526 if (fstat(fd, &st) == -1) {
528 ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
529 "ntdb_open: could not stat open %s: %s",
530 name, strerror(errno));
535 ecode = ntdb_new_file(ntdb);
536 if (ecode != NTDB_SUCCESS) {
542 ntdb->file->device = st.st_dev;
543 ntdb->file->inode = st.st_ino;
544 ntdb->file->map_ptr = NULL;
545 ntdb->file->map_size = 0;
548 /* ensure there is only one process initialising at once */
549 ecode = ntdb_lock_open(ntdb, openlock, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
550 if (ecode != NTDB_SUCCESS) {
555 /* call their open hook if they gave us one. */
556 if (ntdb->openhook) {
557 ecode = ntdb->openhook(ntdb->file->fd, ntdb->openhook_data);
558 if (ecode != NTDB_SUCCESS) {
559 ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
560 "ntdb_open: open hook failed");
563 open_flags |= O_CREAT;
566 /* If they used O_TRUNC, read will return 0. */
567 rlen = pread(ntdb->file->fd, &hdr, sizeof(hdr), 0);
568 if (rlen == 0 && (open_flags & O_CREAT)) {
569 ecode = ntdb_new_database(ntdb, seed, &hdr);
570 if (ecode != NTDB_SUCCESS) {
573 } else if (rlen < 0) {
574 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
575 "ntdb_open: error %s reading %s",
576 strerror(errno), name);
578 } else if (rlen < sizeof(hdr)
579 || strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
580 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
581 "ntdb_open: %s is not a ntdb file", name);
585 if (hdr.version != NTDB_VERSION) {
586 if (hdr.version == bswap_64(NTDB_VERSION))
587 ntdb->flags |= NTDB_CONVERT;
590 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
592 " %s is unknown version 0x%llx",
593 name, (long long)hdr.version);
596 } else if (ntdb->flags & NTDB_CONVERT) {
597 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
599 " %s does not need NTDB_CONVERT",
604 ntdb_context_init(ntdb);
606 ntdb_convert(ntdb, &hdr, sizeof(hdr));
607 ntdb->hash_seed = hdr.hash_seed;
608 hash_test = NTDB_HASH_MAGIC;
609 hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
610 if (hdr.hash_test != hash_test) {
611 /* wrong hash variant */
612 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
614 " %s uses a different hash function",
619 ecode = capabilities_ok(ntdb, hdr.capabilities);
620 if (ecode != NTDB_SUCCESS) {
624 /* Clear any features we don't understand. */
625 if ((open_flags & O_ACCMODE) != O_RDONLY) {
626 hdr.features_used &= NTDB_FEATURE_MASK;
627 ecode = ntdb_write_convert(ntdb, offsetof(struct ntdb_header,
630 sizeof(hdr.features_used));
631 if (ecode != NTDB_SUCCESS)
635 ntdb_unlock_open(ntdb, openlock);
637 /* This makes sure we have current map_size and mmap. */
638 ecode = ntdb->io->oob(ntdb, ntdb->file->map_size, 1, true);
639 if (unlikely(ecode != NTDB_SUCCESS))
642 /* Now it's fully formed, recover if necessary. */
643 berr = ntdb_needs_recovery(ntdb);
644 if (unlikely(berr != false)) {
646 ecode = NTDB_OFF_TO_ERR(berr);
649 ecode = ntdb_lock_and_recover(ntdb);
650 if (ecode != NTDB_SUCCESS) {
655 ecode = ntdb_ftable_init(ntdb);
656 if (ecode != NTDB_SUCCESS) {
665 /* Map ecode to some logical errno. */
666 switch (NTDB_ERR_TO_OFF(ecode)) {
667 case NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT):
668 case NTDB_ERR_TO_OFF(NTDB_ERR_IO):
671 case NTDB_ERR_TO_OFF(NTDB_ERR_LOCK):
672 saved_errno = EWOULDBLOCK;
674 case NTDB_ERR_TO_OFF(NTDB_ERR_OOM):
675 saved_errno = ENOMEM;
677 case NTDB_ERR_TO_OFF(NTDB_ERR_EINVAL):
678 saved_errno = EINVAL;
681 saved_errno = EINVAL;
687 close(ntdb->tracefd);
690 ntdb_lock_cleanup(ntdb);
691 if (--ntdb->file->refcnt == 0) {
692 assert(ntdb->file->num_lockrecs == 0);
693 if (ntdb->file->map_ptr) {
694 if (ntdb->flags & NTDB_INTERNAL) {
695 free(ntdb->file->map_ptr);
697 ntdb_munmap(ntdb->file);
699 if (close(ntdb->file->fd) != 0)
700 ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
701 "ntdb_open: failed to close ntdb fd"
702 " on error: %s", strerror(errno));
703 free(ntdb->file->lockrecs);
713 _PUBLIC_ int ntdb_close(struct ntdb_context *ntdb)
716 struct ntdb_context **i;
718 ntdb_trace(ntdb, "ntdb_close");
720 if (ntdb->transaction) {
721 ntdb_transaction_cancel(ntdb);
724 if (ntdb->file->map_ptr) {
725 if (ntdb->flags & NTDB_INTERNAL)
726 free(ntdb->file->map_ptr);
728 ntdb_munmap(ntdb->file);
731 ntdb_lock_cleanup(ntdb);
732 if (--ntdb->file->refcnt == 0) {
733 ret = close(ntdb->file->fd);
734 free(ntdb->file->lockrecs);
739 /* Remove from tdbs list */
740 for (i = &tdbs; *i; i = &(*i)->next) {
748 close(ntdb->tracefd);
755 _PUBLIC_ void ntdb_foreach_(int (*fn)(struct ntdb_context *, void *), void *p)
757 struct ntdb_context *i;
759 for (i = tdbs; i; i = i->next) {