2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 2005
8 ** NOTE! The following LGPL license applies to the tdb
9 ** library. This does NOT imply that all of Samba is released
12 This library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Lesser General Public
14 License as published by the Free Software Foundation; either
15 version 2 of the License, or (at your option) any later version.
17 This library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Lesser General Public License for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License along with this library; if not, write to the Free Software
24 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 #include "tdb_private.h"
32 - only allow a single transaction at a time per database. This makes
33 using the transaction API simpler, as otherwise the caller would
34 have to cope with temporary failures in transactions that conflict
35 with other current transactions
37 - keep the transaction recovery information in the same file as the
38 database, using a special 'transaction recovery' record pointed at
39 by the header. This removes the need for extra journal files as
40 used by some other databases
42 - dynamically allocate the transaction recovery record, re-using it
43 for subsequent transactions. If a larger record is needed then
44 tdb_free() the old record to place it on the normal tdb freelist
45 before allocating the new record
47 - during transactions, keep a linked list of all writes that have
48 been performed by intercepting all tdb_write() calls. The hooked
49 transaction versions of tdb_read() and tdb_write() check this
50 linked list and try to use the elements of the list in preference
53 - don't allow any locks to be held when a transaction starts,
54 otherwise we can end up with deadlock (plus lack of lock nesting
55 in posix locks would mean the lock is lost)
57 - if the caller gains a lock during the transaction but doesn't
58 release it then fail the commit
60 - allow for nested calls to tdb_transaction_start(), re-using the
61 existing transaction record. If the inner transaction is cancelled
62 then a subsequent commit will fail
64 - keep a mirrored copy of the tdb hash chain heads to allow for the
65 fast hash heads scan on traverse, updating the mirrored copy in
66 the transaction version of tdb_write
68 - allow callers to mix transaction and non-transaction use of tdb,
69 although once a transaction is started then an exclusive lock is
70 gained until the transaction is committed or cancelled
72 - the commit strategy involves first saving away all modified data
73 into a linearised buffer in the transaction recovery area, then
74 marking the transaction recovery area with a magic value to
75 indicate a valid recovery record. In total 4 fsync/msync calls are
76 needed per commit to prevent race conditions. It might be possible
77 to reduce this to 3 or even 2 with some more work.
79 - check for a valid recovery record on open of the tdb, while the
80 global lock is held. Automatically recover from the transaction
81 recovery area if needed, then continue with the open as
82 usual. This allows for smooth crash recovery with no administrator
85 - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
86 still available, but no transaction recovery area is used and no
87 fsync/msync calls are made.
93 hold the context of any current transaction
95 struct tdb_transaction {
96 /* we keep a mirrored copy of the tdb hash heads here so
97 tdb_next_hash_chain() can operate efficiently */
100 /* the original io methods - used to do IOs to the real db */
101 const struct tdb_methods *io_methods;
103 /* the list of transaction elements. We use a doubly linked
104 list with a last pointer to allow us to keep the list
105 ordered, with first element at the front of the list. It
106 needs to be doubly linked as the read/write traversals need
107 to be backwards, while the commit needs to be forwards */
108 struct tdb_transaction_el {
109 struct tdb_transaction_el *next, *prev;
113 } *elements, *elements_last;
115 /* non-zero when an internal transaction error has
116 occurred. All write operations will then fail until the
117 transaction is ended */
118 int transaction_error;
120 /* when inside a transaction we need to keep track of any
121 nested tdb_transaction_start() calls, as these are allowed,
122 but don't create a new transaction */
125 /* old file size before transaction */
126 tdb_len_t old_map_size;
131 read while in a transaction. We need to check first if the data is in our list
132 of transaction elements, then if not do a real read
134 static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
135 tdb_len_t len, int cv)
137 struct tdb_transaction_el *el;
139 /* we need to walk the list backwards to get the most recent data */
140 for (el=tdb->transaction->elements_last;el;el=el->prev) {
143 if (off+len <= el->offset) {
146 if (off >= el->offset + el->length) {
150 /* an overlapping read - needs to be split into up to
151 2 reads and a memcpy */
152 if (off < el->offset) {
153 partial = el->offset - off;
154 if (transaction_read(tdb, off, buf, partial, cv) != 0) {
159 buf = (void *)(partial + (char *)buf);
161 if (off + len <= el->offset + el->length) {
164 partial = el->offset + el->length - off;
166 memcpy(buf, el->data + (off - el->offset), partial);
168 tdb_convert(buf, len);
172 buf = (void *)(partial + (char *)buf);
174 if (len != 0 && transaction_read(tdb, off, buf, len, cv) != 0) {
181 /* its not in the transaction elements - do a real read */
182 return tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv);
185 TDB_LOG((tdb, 0, "transaction_read: failed at off=%d len=%d\n", off, len));
186 tdb->ecode = TDB_ERR_IO;
187 tdb->transaction->transaction_error = 1;
193 write while in a transaction
195 static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
196 const void *buf, tdb_len_t len)
198 struct tdb_transaction_el *el;
200 /* if the write is to a hash head, then update the transaction
202 if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
203 off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
204 u32 chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
205 memcpy(&tdb->transaction->hash_heads[chain], buf, len);
208 /* first see if we can replace an existing entry */
209 for (el=tdb->transaction->elements_last;el;el=el->prev) {
212 if (off+len <= el->offset) {
215 if (off >= el->offset + el->length) {
219 /* an overlapping write - needs to be split into up to
220 2 writes and a memcpy */
221 if (off < el->offset) {
222 partial = el->offset - off;
223 if (transaction_write(tdb, off, buf, partial) != 0) {
228 buf = (const void *)(partial + (const char *)buf);
230 if (off + len <= el->offset + el->length) {
233 partial = el->offset + el->length - off;
235 memcpy(el->data + (off - el->offset), buf, partial);
238 buf = (const void *)(partial + (const char *)buf);
240 if (len != 0 && transaction_write(tdb, off, buf, len) != 0) {
247 /* add a new entry at the end of the list */
248 el = malloc(sizeof(*el));
250 tdb->ecode = TDB_ERR_OOM;
251 tdb->transaction->transaction_error = 1;
255 el->prev = tdb->transaction->elements_last;
258 el->data = malloc(len);
259 if (el->data == NULL) {
261 tdb->ecode = TDB_ERR_OOM;
262 tdb->transaction->transaction_error = 1;
266 memcpy(el->data, buf, len);
268 memset(el->data, TDB_PAD_BYTE, len);
273 tdb->transaction->elements = el;
275 tdb->transaction->elements_last = el;
279 TDB_LOG((tdb, 0, "transaction_write: failed at off=%d len=%d\n", off, len));
280 tdb->ecode = TDB_ERR_IO;
281 tdb->transaction->transaction_error = 1;
286 accelerated hash chain head search, using the cached hash heads
288 static void transaction_next_hash_chain(struct tdb_context *tdb, u32 *chain)
291 for (;h < tdb->header.hash_size;h++) {
292 /* the +1 takes account of the freelist */
293 if (0 != tdb->transaction->hash_heads[h+1]) {
301 out of bounds check during a transaction
303 static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
305 if (len <= tdb->map_size) {
308 return TDB_ERRCODE(TDB_ERR_IO, -1);
312 transaction version of tdb_expand().
314 static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
317 /* add a write to the transaction elements, so subsequent
318 reads see the zero data */
319 if (transaction_write(tdb, size, NULL, addition) != 0) {
327 brlock during a transaction - ignore them
329 int transaction_brlock(struct tdb_context *tdb, tdb_off_t offset,
330 int rw_type, int lck_type, int probe)
335 static const struct tdb_methods transaction_methods = {
336 .tdb_read = transaction_read,
337 .tdb_write = transaction_write,
338 .next_hash_chain = transaction_next_hash_chain,
339 .tdb_oob = transaction_oob,
340 .tdb_expand_file = transaction_expand_file,
341 .tdb_brlock = transaction_brlock
346 start a tdb transaction. No token is returned, as only a single
347 transaction is allowed to be pending per tdb_context
349 int tdb_transaction_start(struct tdb_context *tdb)
351 /* some sanity checks */
352 if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
353 TDB_LOG((tdb, 0, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
354 tdb->ecode = TDB_ERR_EINVAL;
358 /* cope with nested tdb_transaction_start() calls */
359 if (tdb->transaction != NULL) {
360 tdb->transaction->nesting++;
361 TDB_LOG((tdb, 0, "tdb_transaction_start: nesting %d\n",
362 tdb->transaction->nesting));
366 if (tdb->num_locks != 0) {
367 /* the caller must not have any locks when starting a
368 transaction as otherwise we'll be screwed by lack
369 of nested locks in posix */
370 TDB_LOG((tdb, 0, "tdb_transaction_start: cannot start a transaction with locks held\n"));
371 tdb->ecode = TDB_ERR_LOCK;
375 tdb->transaction = calloc(sizeof(struct tdb_transaction), 1);
376 if (tdb->transaction == NULL) {
377 tdb->ecode = TDB_ERR_OOM;
381 /* get the transaction write lock. This is a blocking lock. As
382 discussed with Volker, there are a number of ways we could
383 make this async, which we will probably do in the future */
384 if (tdb_brlock_len(tdb, TRANSACTION_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
385 TDB_LOG((tdb, 0, "tdb_transaction_start: failed to get transaction lock\n"));
386 tdb->ecode = TDB_ERR_LOCK;
387 SAFE_FREE(tdb->transaction);
391 /* get a write lock from the freelist to the end of file. It
392 would be much better to make this a read lock as it would
393 increase parallelism, but it could lead to deadlocks on
394 commit when a write lock needs to be taken.
396 TODO: look at alternative locking strategies to allow this
399 if (tdb_brlock_len(tdb, FREELIST_TOP, F_WRLCK, F_SETLKW, 0, 0) == -1) {
400 TDB_LOG((tdb, 0, "tdb_transaction_start: failed to get hash locks\n"));
401 tdb->ecode = TDB_ERR_LOCK;
405 /* setup a copy of the hash table heads so the hash scan in
406 traverse can be fast */
407 tdb->transaction->hash_heads = calloc(tdb->header.hash_size+1, sizeof(tdb_off_t));
408 if (tdb->transaction->hash_heads == NULL) {
409 tdb->ecode = TDB_ERR_OOM;
412 if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
413 TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
414 TDB_LOG((tdb, 0, "tdb_transaction_start: failed to read hash heads\n"));
415 tdb->ecode = TDB_ERR_IO;
419 /* make sure we know about any file expansions already done by
421 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
422 tdb->transaction->old_map_size = tdb->map_size;
424 /* finally hook the io methods, replacing them with
425 transaction specific methods */
426 tdb->transaction->io_methods = tdb->methods;
427 tdb->methods = &transaction_methods;
432 tdb_brlock_len(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
433 tdb_brlock_len(tdb, TRANSACTION_LOCK, F_UNLCK, F_SETLKW, 0, 1);
434 SAFE_FREE(tdb->transaction->hash_heads);
435 SAFE_FREE(tdb->transaction);
441 cancel the current transaction
443 int tdb_transaction_cancel(struct tdb_context *tdb)
445 if (tdb->transaction == NULL) {
446 TDB_LOG((tdb, 0, "tdb_transaction_cancel: no transaction\n"));
450 if (tdb->transaction->nesting != 0) {
451 tdb->transaction->transaction_error = 1;
452 tdb->transaction->nesting--;
456 tdb->map_size = tdb->transaction->old_map_size;
458 /* free all the transaction elements */
459 while (tdb->transaction->elements) {
460 struct tdb_transaction_el *el = tdb->transaction->elements;
461 tdb->transaction->elements = el->next;
466 /* remove any locks created during the transaction */
467 if (tdb->num_locks != 0) {
469 for (h=0;h<tdb->header.hash_size+1;h++) {
470 if (tdb->locked[h].count != 0) {
471 tdb_brlock_len(tdb,FREELIST_TOP+4*h,F_UNLCK,F_SETLKW, 0, 1);
472 tdb->locked[h].count = 0;
478 /* restore the normal io methods */
479 tdb->methods = tdb->transaction->io_methods;
481 tdb_brlock_len(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
482 tdb_brlock_len(tdb, TRANSACTION_LOCK, F_UNLCK, F_SETLKW, 0, 1);
483 SAFE_FREE(tdb->transaction->hash_heads);
484 SAFE_FREE(tdb->transaction);
492 static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
494 if (fsync(tdb->fd) != 0) {
495 tdb->ecode = TDB_ERR_IO;
496 TDB_LOG((tdb, 0, "tdb_transaction: fsync failed\n"));
501 tdb_off_t moffset = offset & ~(tdb->page_size-1);
502 if (msync(moffset + (char *)tdb->map_ptr,
503 length + (offset - moffset), MS_SYNC) != 0) {
504 tdb->ecode = TDB_ERR_IO;
505 TDB_LOG((tdb, 0, "tdb_transaction: msync failed\n"));
515 work out how much space the linearised recovery data will consume
517 static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
519 struct tdb_transaction_el *el;
520 tdb_len_t recovery_size = 0;
522 recovery_size = sizeof(u32);
523 for (el=tdb->transaction->elements;el;el=el->next) {
524 if (el->offset >= tdb->transaction->old_map_size) {
527 recovery_size += 2*sizeof(tdb_off_t) + el->length;
530 return recovery_size;
534 allocate the recovery area, or use an existing recovery area if it is
537 static int tdb_recovery_allocate(struct tdb_context *tdb,
538 tdb_len_t *recovery_size,
539 tdb_off_t *recovery_offset,
540 tdb_len_t *recovery_max_size)
542 struct list_struct rec;
543 const struct tdb_methods *methods = tdb->transaction->io_methods;
544 tdb_off_t recovery_head;
546 if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
547 TDB_LOG((tdb, 0, "tdb_recovery_allocate: failed to read recovery head\n"));
553 if (recovery_head != 0 &&
554 methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
555 TDB_LOG((tdb, 0, "tdb_recovery_allocate: failed to read recovery record\n"));
559 *recovery_size = tdb_recovery_size(tdb);
561 if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
562 /* it fits in the existing area */
563 *recovery_max_size = rec.rec_len;
564 *recovery_offset = recovery_head;
568 /* we need to free up the old recovery area, then allocate a
569 new one at the end of the file. Note that we cannot use
570 tdb_allocate() to allocate the new one as that might return
571 us an area that is being currently used (as of the start of
573 if (recovery_head != 0) {
574 if (tdb_free(tdb, recovery_head, &rec) == -1) {
575 TDB_LOG((tdb, 0, "tdb_recovery_allocate: failed to free previous recovery area\n"));
580 /* the tdb_free() call might have increased the recovery size */
581 *recovery_size = tdb_recovery_size(tdb);
583 /* round up to a multiple of page size */
584 *recovery_max_size = TDB_ALIGN(sizeof(rec) + *recovery_size, tdb->page_size) - sizeof(rec);
585 *recovery_offset = tdb->map_size;
586 recovery_head = *recovery_offset;
588 if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
589 (tdb->map_size - tdb->transaction->old_map_size) +
590 sizeof(rec) + *recovery_max_size) == -1) {
591 TDB_LOG((tdb, 0, "tdb_recovery_allocate: failed to create recovery area\n"));
595 /* remap the file (if using mmap) */
596 methods->tdb_oob(tdb, tdb->map_size + 1, 1);
598 /* we have to reset the old map size so that we don't try to expand the file
599 again in the transaction commit, which would destroy the recovery area */
600 tdb->transaction->old_map_size = tdb->map_size;
602 /* write the recovery header offset and sync - we can sync without a race here
603 as the magic ptr in the recovery record has not been set */
604 CONVERT(recovery_head);
605 if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
606 &recovery_head, sizeof(tdb_off_t)) == -1) {
607 TDB_LOG((tdb, 0, "tdb_recovery_allocate: failed to write recovery head\n"));
616 setup the recovery data that will be used on a crash during commit
618 static int transaction_setup_recovery(struct tdb_context *tdb,
619 tdb_off_t *magic_offset)
621 struct tdb_transaction_el *el;
622 tdb_len_t recovery_size;
623 unsigned char *data, *p;
624 const struct tdb_methods *methods = tdb->transaction->io_methods;
625 struct list_struct *rec;
626 tdb_off_t recovery_offset, recovery_max_size;
627 tdb_off_t old_map_size = tdb->transaction->old_map_size;
631 check that the recovery area has enough space
633 if (tdb_recovery_allocate(tdb, &recovery_size,
634 &recovery_offset, &recovery_max_size) == -1) {
638 data = malloc(recovery_size + sizeof(*rec));
640 tdb->ecode = TDB_ERR_OOM;
644 rec = (struct list_struct *)data;
645 memset(rec, 0, sizeof(*rec));
648 rec->data_len = recovery_size;
649 rec->rec_len = recovery_max_size;
650 rec->key_len = old_map_size;
653 /* build the recovery data into a single blob to allow us to do a single
654 large write, which should be more efficient */
655 p = data + sizeof(*rec);
656 for (el=tdb->transaction->elements;el;el=el->next) {
657 if (el->offset >= old_map_size) {
660 if (el->offset + el->length > tdb->transaction->old_map_size) {
661 TDB_LOG((tdb, 0, "tdb_transaction_commit: transaction data over new region boundary\n"));
663 tdb->ecode = TDB_ERR_CORRUPT;
666 ((u32 *)p)[0] = el->offset;
667 ((u32 *)p)[1] = el->length;
671 /* the recovery area contains the old data, not the
672 new data, so we have to call the original tdb_read
674 if (methods->tdb_read(tdb, el->offset, p + 8, el->length, 0) != 0) {
676 tdb->ecode = TDB_ERR_IO;
683 *(u32 *)p = sizeof(*rec) + recovery_max_size;
686 /* write the recovery data to the recovery area */
687 if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
688 TDB_LOG((tdb, 0, "tdb_transaction_commit: failed to write recovery data\n"));
690 tdb->ecode = TDB_ERR_IO;
694 /* as we don't have ordered writes, we have to sync the recovery
695 data before we update the magic to indicate that the recovery
697 if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
704 magic = TDB_RECOVERY_MAGIC;
707 *magic_offset = recovery_offset + offsetof(struct list_struct, magic);
709 if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
710 TDB_LOG((tdb, 0, "tdb_transaction_commit: failed to write recovery magic\n"));
711 tdb->ecode = TDB_ERR_IO;
715 /* ensure the recovery magic marker is on disk */
716 if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
724 commit the current transaction
726 int tdb_transaction_commit(struct tdb_context *tdb)
728 const struct tdb_methods *methods;
729 tdb_off_t magic_offset;
732 if (tdb->transaction == NULL) {
733 TDB_LOG((tdb, 0, "tdb_transaction_commit: no transaction\n"));
737 if (tdb->transaction->transaction_error) {
738 tdb->ecode = TDB_ERR_IO;
739 tdb_transaction_cancel(tdb);
740 TDB_LOG((tdb, 0, "tdb_transaction_commit: transaction error pending\n"));
744 if (tdb->transaction->nesting != 0) {
745 tdb->transaction->nesting--;
749 /* check for a null transaction */
750 if (tdb->transaction->elements == NULL) {
751 tdb_transaction_cancel(tdb);
755 methods = tdb->transaction->io_methods;
757 /* if there are any locks pending then the caller has not
758 nested their locks properly, so fail the transaction */
759 if (tdb->num_locks) {
760 tdb->ecode = TDB_ERR_LOCK;
761 TDB_LOG((tdb, 0, "tdb_transaction_commit: locks pending on commit\n"));
762 tdb_transaction_cancel(tdb);
766 /* get the global lock - this prevents new users attaching to the database
768 if (tdb_brlock_len(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
769 TDB_LOG((tdb, 0, "tdb_transaction_commit: failed to get global lock\n"));
770 tdb->ecode = TDB_ERR_LOCK;
771 tdb_transaction_cancel(tdb);
775 if (!(tdb->flags & TDB_NOSYNC)) {
776 /* write the recovery data to the end of the file */
777 if (transaction_setup_recovery(tdb, &magic_offset) == -1) {
778 TDB_LOG((tdb, 0, "tdb_transaction_commit: failed to setup recovery data\n"));
779 tdb_brlock_len(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
780 tdb_transaction_cancel(tdb);
785 /* expand the file to the new size if needed */
786 if (tdb->map_size != tdb->transaction->old_map_size) {
787 if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
789 tdb->transaction->old_map_size) == -1) {
790 tdb->ecode = TDB_ERR_IO;
791 TDB_LOG((tdb, 0, "tdb_transaction_commit: expansion failed\n"));
792 tdb_brlock_len(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
793 tdb_transaction_cancel(tdb);
796 tdb->map_size = tdb->transaction->old_map_size;
797 methods->tdb_oob(tdb, tdb->map_size + 1, 1);
800 /* perform all the writes */
801 while (tdb->transaction->elements) {
802 struct tdb_transaction_el *el = tdb->transaction->elements;
804 if (methods->tdb_write(tdb, el->offset, el->data, el->length) == -1) {
805 TDB_LOG((tdb, 0, "tdb_transaction_commit: write failed during commit\n"));
807 /* we've overwritten part of the data and
808 possibly expanded the file, so we need to
809 run the crash recovery code */
810 tdb->methods = methods;
811 tdb_transaction_recover(tdb);
813 tdb_transaction_cancel(tdb);
814 tdb_brlock_len(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
816 TDB_LOG((tdb, 0, "tdb_transaction_commit: write failed\n"));
819 tdb->transaction->elements = el->next;
824 if (!(tdb->flags & TDB_NOSYNC)) {
825 /* ensure the new data is on disk */
826 if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
830 /* remove the recovery marker */
831 if (methods->tdb_write(tdb, magic_offset, &zero, 4) == -1) {
832 TDB_LOG((tdb, 0, "tdb_transaction_commit: failed to remove recovery magic\n"));
836 /* ensure the recovery marker has been removed on disk */
837 if (transaction_sync(tdb, magic_offset, 4) == -1) {
842 tdb_brlock_len(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
844 /* use a transaction cancel to free memory and remove the
846 tdb_transaction_cancel(tdb);
852 recover from an aborted transaction. Must be called with exclusive
853 database write access already established (including the global
854 lock to prevent new processes attaching)
856 int tdb_transaction_recover(struct tdb_context *tdb)
858 tdb_off_t recovery_head, recovery_eof;
859 unsigned char *data, *p;
861 struct list_struct rec;
863 /* find the recovery area */
864 if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
865 TDB_LOG((tdb, 0, "tdb_transaction_recover: failed to read recovery head\n"));
866 tdb->ecode = TDB_ERR_IO;
870 if (recovery_head == 0) {
871 /* we have never allocated a recovery record */
875 /* read the recovery record */
876 if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
877 sizeof(rec), DOCONV()) == -1) {
878 TDB_LOG((tdb, 0, "tdb_transaction_recover: failed to read recovery record\n"));
879 tdb->ecode = TDB_ERR_IO;
883 if (rec.magic != TDB_RECOVERY_MAGIC) {
884 /* there is no valid recovery data */
888 if (tdb->read_only) {
889 TDB_LOG((tdb, 0, "tdb_transaction_recover: attempt to recover read only database\n"));
890 tdb->ecode = TDB_ERR_CORRUPT;
894 recovery_eof = rec.key_len;
896 data = malloc(rec.data_len);
898 TDB_LOG((tdb, 0, "tdb_transaction_recover: failed to allocate recovery data\n"));
899 tdb->ecode = TDB_ERR_OOM;
903 /* read the full recovery data */
904 if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
905 rec.data_len, 0) == -1) {
906 TDB_LOG((tdb, 0, "tdb_transaction_recover: failed to read recovery data\n"));
907 tdb->ecode = TDB_ERR_IO;
911 /* recover the file data */
913 while (p+8 < data + rec.data_len) {
921 if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
923 TDB_LOG((tdb, 0, "tdb_transaction_recover: failed to recover %d bytes at offset %d\n", len, ofs));
924 tdb->ecode = TDB_ERR_IO;
932 if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
933 TDB_LOG((tdb, 0, "tdb_transaction_recover: failed to sync recovery\n"));
934 tdb->ecode = TDB_ERR_IO;
938 /* if the recovery area is after the recovered eof then remove it */
939 if (recovery_eof <= recovery_head) {
940 if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
941 TDB_LOG((tdb, 0, "tdb_transaction_recover: failed to remove recovery head\n"));
942 tdb->ecode = TDB_ERR_IO;
947 /* remove the recovery magic */
948 if (tdb_ofs_write(tdb, recovery_head + offsetof(struct list_struct, magic),
950 TDB_LOG((tdb, 0, "tdb_transaction_recover: failed to remove recovery magic\n"));
951 tdb->ecode = TDB_ERR_IO;
955 /* reduce the file size to the old size */
957 if (ftruncate(tdb->fd, recovery_eof) != 0) {
958 TDB_LOG((tdb, 0, "tdb_transaction_recover: failed to reduce to recovery size\n"));
959 tdb->ecode = TDB_ERR_IO;
962 tdb->map_size = recovery_eof;
965 if (transaction_sync(tdb, 0, recovery_eof) == -1) {
966 TDB_LOG((tdb, 0, "tdb_transaction_recover: failed to sync2 recovery\n"));
967 tdb->ecode = TDB_ERR_IO;
971 TDB_LOG((tdb, 0, "tdb_transaction_recover: recovered %d byte database\n",