Btrfs: Fix some data=ordered related data corruptions
authorChris Mason <chris.mason@oracle.com>
Tue, 22 Jul 2008 15:18:09 +0000 (11:18 -0400)
committerChris Mason <chris.mason@oracle.com>
Thu, 25 Sep 2008 15:04:05 +0000 (11:04 -0400)
Stress testing was showing data checksum errors, most of which were caused
by a lookup bug in the extent_map tree.  The tree was caching the last
pointer returned, and searches would check the last pointer first.

But, search callers also expect the search to return the very first
matching extent in the range, which wasn't always true with the last
pointer usage.

For now, the code to cache the last return value is just removed.  It is
easy to fix, but I think lookups are rare enough that it isn't required anymore.

This commit also replaces do_sync_mapping_range with a local copy of the
related functions.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/ctree.h
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/transaction.c

index 96ab2797c09a44a741a72b34ec05a1343d449a18..f8fccdac305546e3e27a3ca3387f01486b3fbe28 100644 (file)
@@ -1590,6 +1590,8 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct btrfs_path *path,
                        u64 isize);
 /* inode.c */
+int btrfs_writepages(struct address_space *mapping,
+                    struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_root *new_root,
                struct btrfs_trans_handle *trans, u64 new_dirid,
                struct btrfs_block_group_cache *block_group);
index 7380449cb5b3c535ddb81dee1e44c0ddb4124ebd..9965993748d02cc01c43f98db355b9bdb680d022 100644 (file)
@@ -97,7 +97,6 @@ void extent_io_tree_init(struct extent_io_tree *tree,
        spin_lock_init(&tree->lock);
        spin_lock_init(&tree->buffer_lock);
        tree->mapping = mapping;
-       tree->last = NULL;
 }
 EXPORT_SYMBOL(extent_io_tree_init);
 
@@ -173,12 +172,6 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
        struct tree_entry *entry;
        struct tree_entry *prev_entry = NULL;
 
-       if (tree->last) {
-               struct extent_state *state;
-               state = tree->last;
-               if (state->start <= offset && offset <= state->end)
-                       return &tree->last->rb_node;
-       }
        while(n) {
                entry = rb_entry(n, struct tree_entry, rb_node);
                prev = n;
@@ -189,7 +182,6 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
                else if (offset > entry->end)
                        n = n->rb_right;
                else {
-                       tree->last = rb_entry(n, struct extent_state, rb_node);
                        return n;
                }
        }
@@ -223,10 +215,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 
        ret = __etree_search(tree, offset, &prev, NULL);
        if (!ret) {
-               if (prev) {
-                       tree->last = rb_entry(prev, struct extent_state,
-                                             rb_node);
-               }
                return prev;
        }
        return ret;
@@ -301,8 +289,6 @@ static int merge_state(struct extent_io_tree *tree,
                    other->state == state->state) {
                        state->start = other->start;
                        other->tree = NULL;
-                       if (tree->last == other)
-                               tree->last = state;
                        rb_erase(&other->rb_node, &tree->state);
                        free_extent_state(other);
                }
@@ -314,8 +300,6 @@ static int merge_state(struct extent_io_tree *tree,
                    other->state == state->state) {
                        other->start = state->start;
                        state->tree = NULL;
-                       if (tree->last == state)
-                               tree->last = other;
                        rb_erase(&state->rb_node, &tree->state);
                        free_extent_state(state);
                }
@@ -378,7 +362,6 @@ static int insert_state(struct extent_io_tree *tree,
                return -EEXIST;
        }
        state->tree = tree;
-       tree->last = state;
        merge_state(tree, state);
        return 0;
 }
@@ -444,9 +427,6 @@ static int clear_state_bit(struct extent_io_tree *tree,
        if (delete || state->state == 0) {
                if (state->tree) {
                        clear_state_cb(tree, state, state->state);
-                       if (tree->last == state) {
-                               tree->last = extent_state_next(state);
-                       }
                        rb_erase(&state->rb_node, &tree->state);
                        state->tree = NULL;
                        free_extent_state(state);
index 6c03e6a1993836e94d001fdaacefe07ec639645d..315cfceae3128f72c59e4562b955763baa6dffde 100644 (file)
@@ -60,7 +60,6 @@ struct extent_io_tree {
        spinlock_t lock;
        spinlock_t buffer_lock;
        struct extent_io_ops *ops;
-       struct extent_state *last;
 };
 
 struct extent_state {
index 71b1ac155355685a0b7837829f4a0b88b01f7822..8a502ee2f2316e74d2c39c8633c75e62a2ccd17e 100644 (file)
@@ -42,7 +42,6 @@ void extent_map_exit(void)
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
        tree->map.rb_node = NULL;
-       tree->last = NULL;
        spin_lock_init(&tree->lock);
 }
 EXPORT_SYMBOL(extent_map_tree_init);
@@ -239,7 +238,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
                merge->in_tree = 0;
                free_extent_map(merge);
        }
-       tree->last = em;
 out:
        return ret;
 }
@@ -273,10 +271,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
        u64 end = range_end(start, len);
 
        BUG_ON(spin_trylock(&tree->lock));
-       em = tree->last;
-       if (em && end > em->start && start < extent_map_end(em))
-               goto found;
-
        rb_node = __tree_search(&tree->map, start, &prev, &next);
        if (!rb_node && prev) {
                em = rb_entry(prev, struct extent_map, rb_node);
@@ -305,7 +299,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 
 found:
        atomic_inc(&em->refs);
-       tree->last = em;
 out:
        return em;
 }
@@ -327,8 +320,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
        BUG_ON(spin_trylock(&tree->lock));
        rb_erase(&em->rb_node, &tree->map);
        em->in_tree = 0;
-       if (tree->last == em)
-               tree->last = NULL;
        return ret;
 }
 EXPORT_SYMBOL(remove_extent_mapping);
index a3978ec2784661fa72c1384a88fd9ff9166b56cf..26ac6fe0b2682ed15df0467931d0913c9af6fa4f 100644 (file)
@@ -26,7 +26,6 @@ struct extent_map {
 
 struct extent_map_tree {
        struct rb_root map;
-       struct extent_map *last;
        spinlock_t lock;
 };
 
index 591a30208acd445a4eb58a461f9287ce01855a4d..e5ffb66ad320877aba8722ab681728717035ae21 100644 (file)
@@ -381,14 +381,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
                        break;
                }
                if (test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
-                       start = em->start + em->len;
-                       free_extent_map(em);
-                       spin_unlock(&em_tree->lock);
-                       if (start < end) {
-                               len = end - start + 1;
-                               continue;
-                       }
-                       break;
+                       printk(KERN_CRIT "inode %lu trying to drop pinned "
+                              "extent start %llu end %llu, em [%llu %llu]\n",
+                              inode->i_ino,
+                              (unsigned long long)start,
+                              (unsigned long long)end,
+                              (unsigned long long)em->start,
+                              (unsigned long long)em->len);
                }
                remove_extent_mapping(em_tree, em);
 
index 60852ada658ef860f2df29911dd7804912a6ca36..3da12a4d913dad6db60508126c9053fe0350f733 100644 (file)
@@ -485,7 +485,7 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
        fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
        if (!fixup)
                return -EAGAIN;
-printk("queueing worker to fixup page %lu %Lu\n", inode->i_ino, page_offset(page));
+
        SetPageChecked(page);
        page_cache_get(page);
        fixup->work.func = btrfs_writepage_fixup_worker;
@@ -502,11 +502,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
+       struct extent_map *em_orig;
        u64 alloc_hint = 0;
        u64 clear_start;
        u64 clear_end;
        struct list_head list;
        struct btrfs_key ins;
+       struct rb_node *rb;
        int ret;
 
        ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
@@ -535,6 +537,22 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 
        mutex_lock(&BTRFS_I(inode)->extent_mutex);
 
+       spin_lock(&em_tree->lock);
+       clear_start = ordered_extent->file_offset;
+       clear_end = ordered_extent->file_offset + ordered_extent->len;
+       em = lookup_extent_mapping(em_tree, clear_start,
+                                  ordered_extent->len);
+       em_orig = em;
+       while(em && clear_start < extent_map_end(em) && clear_end > em->start) {
+               clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+               rb = rb_next(&em->rb_node);
+               if (!rb)
+                       break;
+               em = rb_entry(rb, struct extent_map, rb_node);
+       }
+       free_extent_map(em_orig);
+       spin_unlock(&em_tree->lock);
+
        ret = btrfs_drop_extents(trans, root, inode,
                                 ordered_extent->file_offset,
                                 ordered_extent->file_offset +
@@ -548,22 +566,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                       ordered_extent->len, 0);
        BUG_ON(ret);
 
-       spin_lock(&em_tree->lock);
-       clear_start = ordered_extent->file_offset;
-       clear_end = ordered_extent->file_offset + ordered_extent->len;
-       while(clear_start < clear_end) {
-               em = lookup_extent_mapping(em_tree, clear_start,
-                                          clear_end - clear_start);
-               if (em) {
-                       clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-                       clear_start = em->start + em->len;
-                       free_extent_map(em);
-               } else {
-                       break;
-               }
-       }
-       spin_unlock(&em_tree->lock);
-
        btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
                                ordered_extent->file_offset +
                                ordered_extent->len - 1);
@@ -2318,7 +2320,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
        u64 extent_end = 0;
        u64 objectid = inode->i_ino;
        u32 found_type;
-       struct btrfs_path *path;
+       struct btrfs_path *path = NULL;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_file_extent_item *item;
        struct extent_buffer *leaf;
@@ -2328,9 +2330,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_trans_handle *trans = NULL;
 
-       path = btrfs_alloc_path();
-       BUG_ON(!path);
-
 again:
        spin_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
@@ -2354,6 +2353,12 @@ again:
        em->bdev = root->fs_info->fs_devices->latest_bdev;
        em->start = EXTENT_MAP_HOLE;
        em->len = (u64)-1;
+
+       if (!path) {
+               path = btrfs_alloc_path();
+               BUG_ON(!path);
+       }
+
        ret = btrfs_lookup_file_extent(trans, root, path,
                                       objectid, start, trans != NULL);
        if (ret < 0) {
@@ -2530,7 +2535,8 @@ insert:
        }
        spin_unlock(&em_tree->lock);
 out:
-       btrfs_free_path(path);
+       if (path)
+               btrfs_free_path(path);
        if (trans) {
                ret = btrfs_end_transaction(trans, root);
                if (!err) {
@@ -2643,8 +2649,8 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
        return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
-static int btrfs_writepages(struct address_space *mapping,
-                           struct writeback_control *wbc)
+int btrfs_writepages(struct address_space *mapping,
+                    struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
        tree = &BTRFS_I(mapping->host)->io_tree;
index 0d87795fdd8f95bce9c5840b9ffaf9e36933a4e3..830dbaea685350e9cd9f1ab1d6dc8b6a6c4f667d 100644 (file)
@@ -19,6 +19,8 @@
 #include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
@@ -307,12 +309,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
         * start IO on any dirty ones so the wait doesn't stall waiting
         * for pdflush to find them
         */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-       do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
-#else
-       do_sync_mapping_range(inode->i_mapping, start, end,
-                             SYNC_FILE_RANGE_WRITE);
-#endif
+       btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
        if (wait)
                wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
                                                 &entry->flags));
@@ -327,28 +324,26 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
        u64 orig_end;
        u64 wait_end;
        struct btrfs_ordered_extent *ordered;
-       u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 
        if (start + len < start) {
-               wait_end = (inode->i_size + mask) & ~mask;
-               orig_end = (u64)-1;
+               orig_end = INT_LIMIT(loff_t);
        } else {
                orig_end = start + len - 1;
-               wait_end = orig_end;
+               if (orig_end > INT_LIMIT(loff_t))
+                       orig_end = INT_LIMIT(loff_t);
        }
+       wait_end = orig_end;
 again:
        /* start IO across the range first to instantiate any delalloc
         * extents
         */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-       do_sync_file_range(file, start, wait_end, SYNC_FILE_RANGE_WRITE);
-#else
-       do_sync_mapping_range(inode->i_mapping, start, wait_end,
-                             SYNC_FILE_RANGE_WRITE);
-#endif
-       end = orig_end;
-       wait_on_extent_writeback(&BTRFS_I(inode)->io_tree, start, orig_end);
+       btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+
+       btrfs_wait_on_page_writeback_range(inode->i_mapping,
+                                          start >> PAGE_CACHE_SHIFT,
+                                          orig_end >> PAGE_CACHE_SHIFT);
 
+       end = orig_end;
        while(1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, end);
                if (!ordered) {
@@ -565,3 +560,87 @@ out:
        return ret;
 }
 
+
+/**
+ * taken from mm/filemap.c because it isn't exported
+ *
+ * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * @mapping:   address space structure to write
+ * @start:     offset in bytes where the range starts
+ * @end:       offset in bytes where the range ends (inclusive)
+ * @sync_mode: enable synchronous operation
+ *
+ * Start writeback against all of a mapping's dirty pages that lie
+ * within the byte offsets <start, end> inclusive.
+ *
+ * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
+ * opposed to a regular memory cleansing writeback.  The difference between
+ * these two operations is that if a dirty page/buffer is encountered, it must
+ * be waited upon, and not just skipped over.
+ */
+int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
+                          loff_t end, int sync_mode)
+{
+       struct writeback_control wbc = {
+               .sync_mode = sync_mode,
+               .nr_to_write = mapping->nrpages * 2,
+               .range_start = start,
+               .range_end = end,
+               .for_writepages = 1,
+       };
+       return btrfs_writepages(mapping, &wbc);
+}
+
+/**
+ * taken from mm/filemap.c because it isn't exported
+ *
+ * wait_on_page_writeback_range - wait for writeback to complete
+ * @mapping:   target address_space
+ * @start:     beginning page index
+ * @end:       ending page index
+ *
+ * Wait for writeback to complete against pages indexed by start->end
+ * inclusive
+ */
+int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
+                                      pgoff_t start, pgoff_t end)
+{
+       struct pagevec pvec;
+       int nr_pages;
+       int ret = 0;
+       pgoff_t index;
+
+       if (end < start)
+               return 0;
+
+       pagevec_init(&pvec, 0);
+       index = start;
+       while ((index <= end) &&
+                       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                       PAGECACHE_TAG_WRITEBACK,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+               unsigned i;
+
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+
+                       /* until radix tree lookup accepts end_index */
+                       if (page->index > end)
+                               continue;
+
+                       wait_on_page_writeback(page);
+                       if (PageError(page))
+                               ret = -EIO;
+               }
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+
+       /* Check for outstanding write errors */
+       if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+               ret = -ENOSPC;
+       if (test_and_clear_bit(AS_EIO, &mapping->flags))
+               ret = -EIO;
+
+       return ret;
+}
index 1794efd13ca3fab8d0884801b248dcf0730d17c2..8e8e3c0404f3090863699bc349afbf2fafe65586 100644 (file)
@@ -132,4 +132,8 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
                                struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
+int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
+                                      pgoff_t start, pgoff_t end);
+int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
+                          loff_t end, int sync_mode);
 #endif
index 05823904ecba686df1f3cdfc7361342d070508b8..38c75a0256cbbb17305fcacbfe4b9a10327b45bc 100644 (file)
@@ -649,7 +649,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        extent_io_tree_init(pinned_copy,
                             root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 
-printk("commit trans %Lu\n", trans->transid);
        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
        cur_trans = trans->transaction;
@@ -745,7 +744,6 @@ printk("commit trans %Lu\n", trans->transid);
                list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 
        mutex_unlock(&root->fs_info->trans_mutex);
-printk("done commit trans %Lu\n", trans->transid);
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
        if (root->fs_info->closing) {