Btrfs: Use PagePrivate2 to track pages in the data=ordered code.
author Chris Mason <chris.mason@oracle.com>
Wed, 2 Sep 2009 20:53:46 +0000 (16:53 -0400)
committer Chris Mason <chris.mason@oracle.com>
Fri, 11 Sep 2009 17:31:07 +0000 (13:31 -0400)
Btrfs writes go through delalloc to the data=ordered code.  This
makes sure that all of the data is on disk before the metadata
that references it.  The tracking means that we have to make sure
each page in an extent is fully written before we add that extent into
the on-disk btree.

This was done in the past by setting the EXTENT_ORDERED bit for the
range of an extent when it was added to the data=ordered code, and then
clearing the EXTENT_ORDERED bit in the extent state tree as each page
finished IO.

One of the reasons we had to do this was because sometimes pages are
magically dirtied without page_mkwrite being called.  The EXTENT_ORDERED
bit is checked at writepage time, and if it isn't there, our page became
dirty without going through the proper path.

These bit operations make for a number of rbtree searches for each page,
and can cause considerable lock contention.

This commit switches from the EXTENT_ORDERED bit to use PagePrivate2.
As pages go into the ordered code, PagePrivate2 is set on each one.
This is a cheap operation because we already have all the pages locked
and ready to go.

As IO finishes, the PagePrivate2 bit is cleared and the ordered
accounting is updated for each page.

At writepage time, if the PagePrivate2 bit is missing, we go into the
writepage fixup code to handle improperly dirtied pages.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/inode.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h

index c9a438d374b6868c66c126fd5f84b28ecf685463..a102422cd92eb10e626214b1fa4873627f852f64 100644 (file)
@@ -885,13 +885,6 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                              NULL, mask);
 }
 
-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
-                      gfp_t mask)
-{
-       return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, NULL,
-                             mask);
-}
-
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    int bits, gfp_t mask)
 {
@@ -921,13 +914,6 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                                NULL, mask);
 }
 
-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
-                        gfp_t mask)
-{
-       return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0,
-                               NULL, mask);
-}
-
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
 {
@@ -1373,7 +1359,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
                                int clear_unlock,
                                int clear_delalloc, int clear_dirty,
                                int set_writeback,
-                               int end_writeback)
+                               int end_writeback,
+                               int set_private2)
 {
        int ret;
        struct page *pages[16];
@@ -1392,7 +1379,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
                clear_bits |= EXTENT_DELALLOC;
 
        clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
-       if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+       if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
+             set_private2))
                return 0;
 
        while (nr_pages > 0) {
@@ -1400,6 +1388,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
                                     min_t(unsigned long,
                                     nr_pages, ARRAY_SIZE(pages)), pages);
                for (i = 0; i < ret; i++) {
+
+                       if (set_private2)
+                               SetPagePrivate2(pages[i]);
+
                        if (pages[i] == locked_page) {
                                page_cache_release(pages[i]);
                                continue;
@@ -2792,7 +2784,7 @@ int try_release_extent_state(struct extent_map_tree *map,
        int ret = 1;
 
        if (test_range_bit(tree, start, end,
-                          EXTENT_IOBITS | EXTENT_ORDERED, 0, NULL))
+                          EXTENT_IOBITS, 0, NULL))
                ret = 0;
        else {
                if ((mask & GFP_NOFS) == GFP_NOFS)
@@ -2835,8 +2827,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
                        }
                        if (!test_range_bit(tree, em->start,
                                            extent_map_end(em) - 1,
-                                           EXTENT_LOCKED | EXTENT_WRITEBACK |
-                                           EXTENT_ORDERED,
+                                           EXTENT_LOCKED | EXTENT_WRITEBACK,
                                            0, NULL)) {
                                remove_extent_mapping(map, em);
                                /* once for the rb tree */
index 09cd6fa3cc86ecf2d6bdbb53e04bbe26af65b6ed..14ed16fd862df22a93b7286c4c4811a8c12fa6ba 100644 (file)
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_ORDERED (1 << 9)
-#define EXTENT_ORDERED_METADATA (1 << 10)
-#define EXTENT_BOUNDARY (1 << 11)
-#define EXTENT_NODATASUM (1 << 12)
+#define EXTENT_BOUNDARY (1 << 9)
+#define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /* flags for bio submission */
@@ -285,5 +283,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
                                int clear_unlock,
                                int clear_delalloc, int clear_dirty,
                                int set_writeback,
-                               int end_writeback);
+                               int end_writeback,
+                               int set_private2);
 #endif
index 3f8e93de298948183d5f036d3dfe24c1955aaa0f..739a245e25d601f02e895a720f7c59ad46e60ce9 100644 (file)
@@ -426,7 +426,7 @@ again:
                        extent_clear_unlock_delalloc(inode,
                                                     &BTRFS_I(inode)->io_tree,
                                                     start, end, NULL, 1, 0,
-                                                    0, 1, 1, 1);
+                                                    0, 1, 1, 1, 0);
                        ret = 0;
                        goto free_pages_out;
                }
@@ -641,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
-                                            NULL, 1, 1, 0, 1, 1, 0);
+                                            NULL, 1, 1, 0, 1, 1, 0, 0);
 
                ret = btrfs_submit_compressed_write(inode,
                                    async_extent->start,
@@ -714,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,
                        extent_clear_unlock_delalloc(inode,
                                                     &BTRFS_I(inode)->io_tree,
                                                     start, end, NULL, 1, 1,
-                                                    1, 1, 1, 1);
+                                                    1, 1, 1, 1, 0);
                        *nr_written = *nr_written +
                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
                        *page_started = 1;
@@ -777,11 +777,14 @@ static noinline int cow_file_range(struct inode *inode,
                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
+                *
+                * Do set the Private2 bit so we know this page was properly
+                * setup for writepage
                 */
                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                             start, start + ram_size - 1,
                                             locked_page, unlock, 1,
-                                            1, 0, 0, 0);
+                                            1, 0, 0, 0, 1);
                disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
@@ -1102,7 +1105,7 @@ out_check:
 
                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                        cur_offset, cur_offset + num_bytes - 1,
-                                       locked_page, 1, 1, 1, 0, 0, 0);
+                                       locked_page, 1, 1, 1, 0, 0, 0, 1);
                cur_offset = extent_end;
                if (cur_offset > end)
                        break;
@@ -1375,10 +1378,8 @@ again:
        lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
 
        /* already ordered? We're done */
-       if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-                            EXTENT_ORDERED, 0, NULL)) {
+       if (PagePrivate2(page))
                goto out;
-       }
 
        ordered = btrfs_lookup_ordered_extent(inode, page_start);
        if (ordered) {
@@ -1414,11 +1415,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
        struct inode *inode = page->mapping->host;
        struct btrfs_writepage_fixup *fixup;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       int ret;
 
-       ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
-                            EXTENT_ORDERED, 0, NULL);
-       if (ret)
+       /* this page is properly in the ordered list */
+       if (TestClearPagePrivate2(page))
                return 0;
 
        if (PageChecked(page))
@@ -1624,6 +1623,7 @@ nocow:
 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                                struct extent_state *state, int uptodate)
 {
+       ClearPagePrivate2(page);
        return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
@@ -4403,13 +4403,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
        u64 page_start = page_offset(page);
        u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
+
+       /*
+        * we have the page locked, so new writeback can't start,
+        * and the dirty bit won't be cleared while we are here.
+        *
+        * Wait for IO on this page so that we can safely clear
+        * the PagePrivate2 bit and do ordered accounting
+        */
        wait_on_page_writeback(page);
+
        tree = &BTRFS_I(page->mapping->host)->io_tree;
        if (offset) {
                btrfs_releasepage(page, GFP_NOFS);
                return;
        }
-
        lock_extent(tree, page_start, page_end, GFP_NOFS);
        ordered = btrfs_lookup_ordered_extent(page->mapping->host,
                                           page_offset(page));
@@ -4421,14 +4429,19 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_DIRTY | EXTENT_DELALLOC |
                                 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
-               btrfs_finish_ordered_io(page->mapping->host,
-                                       page_start, page_end);
+               /*
+                * whoever cleared the private bit is responsible
+                * for the finish_ordered_io
+                */
+               if (TestClearPagePrivate2(page)) {
+                       btrfs_finish_ordered_io(page->mapping->host,
+                                               page_start, page_end);
+               }
                btrfs_put_ordered_extent(ordered);
                lock_extent(tree, page_start, page_end, GFP_NOFS);
        }
        clear_extent_bit(tree, page_start, page_end,
-                EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-                EXTENT_ORDERED,
+                EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
                 1, 1, NULL, GFP_NOFS);
        __btrfs_releasepage(page, GFP_NOFS);
 
index 7f751e462f0b88a1af8badea9478b88c510a7d29..4a9c8c4cec2552de9c10a25708d320e9c6b930d8 100644 (file)
@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  *
  * len is the length of the extent
  *
- * This also sets the EXTENT_ORDERED bit on the range in the inode.
- *
  * The tree is given a single reference on the ordered extent that was
  * inserted.
  */
@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        entry->start = start;
        entry->len = len;
        entry->disk_len = disk_len;
+       entry->bytes_left = len;
        entry->inode = inode;
        if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
                set_bit(type, &entry->flags);
@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                           &entry->rb_node);
        BUG_ON(node);
 
-       set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
-                          entry_end(entry) - 1, GFP_NOFS);
-
        spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
        list_add_tail(&entry->root_extent_list,
                      &BTRFS_I(inode)->root->fs_info->ordered_extents);
@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        int ret;
 
        tree = &BTRFS_I(inode)->ordered_tree;
        mutex_lock(&tree->mutex);
-       clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
-                            GFP_NOFS);
        node = tree_search(tree, file_offset);
        if (!node) {
                ret = 1;
@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
                goto out;
        }
 
-       ret = test_range_bit(io_tree, entry->file_offset,
-                            entry->file_offset + entry->len - 1,
-                            EXTENT_ORDERED, 0, NULL);
-       if (ret == 0)
+       if (io_size > entry->bytes_left) {
+               printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+                      (unsigned long long)entry->bytes_left,
+                      (unsigned long long)io_size);
+       }
+       entry->bytes_left -= io_size;
+       if (entry->bytes_left == 0)
                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+       else
+               ret = 1;
 out:
        mutex_unlock(&tree->mutex);
        return ret == 0;
@@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
        u64 orig_end;
        u64 wait_end;
        struct btrfs_ordered_extent *ordered;
+       int found;
 
        if (start + len < start) {
                orig_end = INT_LIMIT(loff_t);
@@ -502,6 +501,7 @@ again:
                                           orig_end >> PAGE_CACHE_SHIFT);
 
        end = orig_end;
+       found = 0;
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, end);
                if (!ordered)
@@ -514,6 +514,7 @@ again:
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
+               found++;
                btrfs_start_ordered_extent(inode, ordered, 1);
                end = ordered->file_offset;
                btrfs_put_ordered_extent(ordered);
@@ -521,8 +522,8 @@ again:
                        break;
                end--;
        }
-       if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
-                          EXTENT_ORDERED | EXTENT_DELALLOC, 0, NULL)) {
+       if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+                          EXTENT_DELALLOC, 0, NULL)) {
                schedule_timeout(1);
                goto again;
        }
index 3d31c8827b013407d6f4b14796896a6aac8ba53b..993a7ea45c702a580c784908584408684913e233 100644 (file)
@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
        /* extent length on disk */
        u64 disk_len;
 
+       /* number of bytes that still need writing */
+       u64 bytes_left;
+
        /* flags (described above) */
        unsigned long flags;