Btrfs: fix the reserved space leak caused by the race between nonlock dio and buffered io
[sfrench/cifs-2.6.git] / fs / btrfs / file.c
index 82d0342763c54d652982d2818ab3b9b7f8ffc1ec..6cd003c3f05ec318092b766fbc0e6322175911e2 100644 (file)
@@ -1235,29 +1235,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
 }
 
 /*
- * this gets pages into the page cache and locks them down, it also properly
- * waits for data=ordered extents to finish before allowing the pages to be
- * modified.
+ * this just gets pages into the page cache and locks them down.
  */
-static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
-                        struct page **pages, size_t num_pages,
-                        loff_t pos, unsigned long first_index,
-                        size_t write_bytes, bool force_uptodate)
+static noinline int prepare_pages(struct inode *inode, struct page **pages,
+                                 size_t num_pages, loff_t pos,
+                                 size_t write_bytes, bool force_uptodate)
 {
-       struct extent_state *cached_state = NULL;
        int i;
        unsigned long index = pos >> PAGE_CACHE_SHIFT;
-       struct inode *inode = file_inode(file);
        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
-       int err = 0;
-       int faili = 0;
-       u64 start_pos;
-       u64 last_pos;
-
-       start_pos = pos & ~((u64)root->sectorsize - 1);
-       last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
+       int err;
+       int faili;
 
-again:
        for (i = 0; i < num_pages; i++) {
                pages[i] = find_or_create_page(inode->i_mapping, index + i,
                                               mask | __GFP_WRITE);
@@ -1280,57 +1269,85 @@ again:
                }
                wait_on_page_writeback(pages[i]);
        }
-       faili = num_pages - 1;
-       err = 0;
+
+       return 0;
+fail:
+       while (faili >= 0) {
+               unlock_page(pages[faili]);
+               page_cache_release(pages[faili]);
+               faili--;
+       }
+       return err;
+
+}
+
+/*
+ * This function locks the extent and properly waits for data=ordered extents
+ * to finish before allowing the pages to be modified if needed.
+ *
+ * The return value:
+ * 1 - the extent is locked
+ * 0 - the extent is not locked, and everything is OK
+ * -EAGAIN - need to re-prepare the pages
+ * any other value < 0 - something went wrong
+ */
+static noinline int
+lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
+                               size_t num_pages, loff_t pos,
+                               u64 *lockstart, u64 *lockend,
+                               struct extent_state **cached_state)
+{
+       u64 start_pos;
+       u64 last_pos;
+       int i;
+       int ret = 0;
+
+       start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+       last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
+
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
                lock_extent_bits(&BTRFS_I(inode)->io_tree,
-                                start_pos, last_pos - 1, 0, &cached_state);
-               ordered = btrfs_lookup_first_ordered_extent(inode,
-                                                           last_pos - 1);
+                                start_pos, last_pos, 0, cached_state);
+               ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
                if (ordered &&
                    ordered->file_offset + ordered->len > start_pos &&
-                   ordered->file_offset < last_pos) {
+                   ordered->file_offset <= last_pos) {
                        btrfs_put_ordered_extent(ordered);
                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                                            start_pos, last_pos - 1,
-                                            &cached_state, GFP_NOFS);
+                                            start_pos, last_pos,
+                                            cached_state, GFP_NOFS);
                        for (i = 0; i < num_pages; i++) {
                                unlock_page(pages[i]);
                                page_cache_release(pages[i]);
                        }
-                       err = btrfs_wait_ordered_range(inode, start_pos,
-                                                      last_pos - start_pos);
-                       if (err)
-                               goto fail;
-                       goto again;
+                       ret = btrfs_wait_ordered_range(inode, start_pos,
+                                               last_pos - start_pos + 1);
+                       if (ret)
+                               return ret;
+                       else
+                               return -EAGAIN;
                }
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
 
                clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
-                                 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+                                 last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
                                  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-                                 0, 0, &cached_state, GFP_NOFS);
-               unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                                    start_pos, last_pos - 1, &cached_state,
-                                    GFP_NOFS);
+                                 0, 0, cached_state, GFP_NOFS);
+               *lockstart = start_pos;
+               *lockend = last_pos;
+               ret = 1;
        }
+
        for (i = 0; i < num_pages; i++) {
                if (clear_page_dirty_for_io(pages[i]))
                        account_page_redirty(pages[i]);
                set_page_extent_mapped(pages[i]);
                WARN_ON(!PageLocked(pages[i]));
        }
-       return 0;
-fail:
-       while (faili >= 0) {
-               unlock_page(pages[faili]);
-               page_cache_release(pages[faili]);
-               faili--;
-       }
-       return err;
 
+       return ret;
 }
 
 static noinline int check_can_nocow(struct inode *inode, loff_t pos,
@@ -1381,13 +1398,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct page **pages = NULL;
+       struct extent_state *cached_state = NULL;
        u64 release_bytes = 0;
+       u64 lockstart;
+       u64 lockend;
        unsigned long first_index;
        size_t num_written = 0;
        int nrptrs;
        int ret = 0;
        bool only_release_metadata = false;
        bool force_page_uptodate = false;
+       bool need_unlock;
 
        nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
                     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1456,18 +1477,31 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                }
 
                release_bytes = reserve_bytes;
-
+               need_unlock = false;
+again:
                /*
                 * This is going to setup the pages array with the number of
                 * pages we want, so we don't really need to worry about the
                 * contents of pages from loop to loop
                 */
-               ret = prepare_pages(root, file, pages, num_pages,
-                                   pos, first_index, write_bytes,
+               ret = prepare_pages(inode, pages, num_pages,
+                                   pos, write_bytes,
                                    force_page_uptodate);
                if (ret)
                        break;
 
+               ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
+                                                     pos, &lockstart, &lockend,
+                                                     &cached_state);
+               if (ret < 0) {
+                       if (ret == -EAGAIN)
+                               goto again;
+                       break;
+               } else if (ret > 0) {
+                       need_unlock = true;
+                       ret = 0;
+               }
+
                copied = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, i);
 
@@ -1512,19 +1546,20 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                }
 
                release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
-               if (copied > 0) {
+
+               if (copied > 0)
                        ret = btrfs_dirty_pages(root, inode, pages,
                                                dirty_pages, pos, copied,
                                                NULL);
-                       if (ret) {
-                               btrfs_drop_pages(pages, num_pages);
-                               break;
-                       }
-               }
-
-               release_bytes = 0;
+               if (need_unlock)
+                       unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                                            lockstart, lockend, &cached_state,
+                                            GFP_NOFS);
                btrfs_drop_pages(pages, num_pages);
+               if (ret)
+                       break;
 
+               release_bytes = 0;
                if (only_release_metadata && copied > 0) {
                        u64 lockstart = round_down(pos, root->sectorsize);
                        u64 lockend = lockstart +
@@ -1963,11 +1998,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
        struct btrfs_key key;
        int ret;
 
+       if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+               goto out;
+
        key.objectid = btrfs_ino(inode);
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = offset;
 
-
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        if (ret < 0)
                return ret;
@@ -2064,8 +2101,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        u64 drop_end;
        int ret = 0;
        int err = 0;
+       int rsv_count;
        bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
                          ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+       bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 
        ret = btrfs_wait_ordered_range(inode, offset, len);
        if (ret)
@@ -2163,9 +2202,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        /*
         * 1 - update the inode
         * 1 - removing the extents in the range
-        * 1 - adding the hole extent
+        * 1 - adding the hole extent if no_holes isn't set
         */
-       trans = btrfs_start_transaction(root, 3);
+       rsv_count = no_holes ? 2 : 3;
+       trans = btrfs_start_transaction(root, rsv_count);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out_free;
@@ -2202,7 +2242,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(root);
 
-               trans = btrfs_start_transaction(root, 3);
+               trans = btrfs_start_transaction(root, rsv_count);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        trans = NULL;