btrfs: disable strict file flushes for renames and truncates
authorChris Mason <clm@fb.com>
Tue, 12 Aug 2014 17:47:42 +0000 (10:47 -0700)
committerChris Mason <clm@fb.com>
Fri, 15 Aug 2014 14:43:42 +0000 (07:43 -0700)
Truncates and renames are often used to replace old versions of a file
with new versions.  Applications often expect this to be an atomic
replacement, even if they haven't done anything to make sure the new
version is fully on disk.

Btrfs has strict flushing in place to make sure that renaming over an
old file with a new file will fully flush out the new file before
allowing the transaction commit with the rename to complete.

This ordering means the commit code needs to be able to lock file pages,
and there are a few paths in the filesystem where we will try to end a
transaction with the page lock held.  It's rare, but these things can
deadlock.

This patch removes the ordered flushes and switches to a best effort
filemap_flush like ext4 uses. It's not perfect, but it should fix the
deadlocks.

Signed-off-by: Chris Mason <clm@fb.com>
fs/btrfs/btrfs_inode.h
fs/btrfs/disk-io.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/transaction.c
fs/btrfs/transaction.h

index 4794923c410cec258ae30aa929f3fc0bde5ba069..43527fd78825b690503347426118e09f76017c95 100644 (file)
@@ -84,12 +84,6 @@ struct btrfs_inode {
         */
        struct list_head delalloc_inodes;
 
-       /*
-        * list for tracking inodes that must be sent to disk before a
-        * rename or truncate commit
-        */
-       struct list_head ordered_operations;
-
        /* node for the red-black tree that links inodes in subvolume root */
        struct rb_node rb_node;
 
index 08e65e9cf2aa97cb009249f5bf231d07e2c1f890..d0ed9e664f7d4d26c69a12498f80b3ad59809a43 100644 (file)
@@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                                    int read_only);
-static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
-                                            struct btrfs_root *root);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                      struct btrfs_root *root);
@@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
        btrfs_cleanup_transaction(root);
 }
 
-static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
-                                            struct btrfs_root *root)
-{
-       struct btrfs_inode *btrfs_inode;
-       struct list_head splice;
-
-       INIT_LIST_HEAD(&splice);
-
-       mutex_lock(&root->fs_info->ordered_operations_mutex);
-       spin_lock(&root->fs_info->ordered_root_lock);
-
-       list_splice_init(&t->ordered_operations, &splice);
-       while (!list_empty(&splice)) {
-               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-                                        ordered_operations);
-
-               list_del_init(&btrfs_inode->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_root_lock);
-
-               btrfs_invalidate_inodes(btrfs_inode->root);
-
-               spin_lock(&root->fs_info->ordered_root_lock);
-       }
-
-       spin_unlock(&root->fs_info->ordered_root_lock);
-       mutex_unlock(&root->fs_info->ordered_operations_mutex);
-}
-
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
 {
        struct btrfs_ordered_extent *ordered;
@@ -4093,8 +4063,6 @@ again:
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                   struct btrfs_root *root)
 {
-       btrfs_destroy_ordered_operations(cur_trans, root);
-
        btrfs_destroy_delayed_refs(cur_trans, root);
 
        cur_trans->state = TRANS_STATE_COMMIT_START;
index 1f2b99cb55eaef682c51ae3c8b227e88aafbf7e7..d3afac292d677e0ed492255ca7cb3890cde717b1 100644 (file)
@@ -1838,33 +1838,9 @@ out:
 
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
-       /*
-        * ordered_data_close is set by settattr when we are about to truncate
-        * a file from a non-zero size to a zero size.  This tries to
-        * flush down new bytes that may have been written if the
-        * application were using truncate to replace a file in place.
-        */
-       if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
-                              &BTRFS_I(inode)->runtime_flags)) {
-               struct btrfs_trans_handle *trans;
-               struct btrfs_root *root = BTRFS_I(inode)->root;
-
-               /*
-                * We need to block on a committing transaction to keep us from
-                * throwing a ordered operation on to the list and causing
-                * something like sync to deadlock trying to flush out this
-                * inode.
-                */
-               trans = btrfs_start_transaction(root, 0);
-               if (IS_ERR(trans))
-                       return PTR_ERR(trans);
-               btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
-               btrfs_end_transaction(trans, root);
-               if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
-                       filemap_flush(inode->i_mapping);
-       }
        if (filp->private_data)
                btrfs_ioctl_trans_end(filp);
+       filemap_flush(inode->i_mapping);
        return 0;
 }
 
index 8ea7610fbaf3eb182321951f96f3fc1c2b498a68..73098328d040983d0c0296425b7a1e9e7961ce90 100644 (file)
@@ -7950,27 +7950,6 @@ static int btrfs_truncate(struct inode *inode)
                                      min_size);
        BUG_ON(ret);
 
-       /*
-        * setattr is responsible for setting the ordered_data_close flag,
-        * but that is only tested during the last file release.  That
-        * could happen well after the next commit, leaving a great big
-        * window where new writes may get lost if someone chooses to write
-        * to this file after truncating to zero
-        *
-        * The inode doesn't have any dirty data here, and so if we commit
-        * this is a noop.  If someone immediately starts writing to the inode
-        * it is very likely we'll catch some of their writes in this
-        * transaction, and the commit will find this file on the ordered
-        * data list with good things to send down.
-        *
-        * This is a best effort solution, there is still a window where
-        * using truncate to replace the contents of the file will
-        * end up with a zero length file after a crash.
-        */
-       if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
-                                          &BTRFS_I(inode)->runtime_flags))
-               btrfs_add_ordered_operation(trans, root, inode);
-
        /*
         * So if we truncate and then write and fsync we normally would just
         * write the extents that changed, which is a problem if we need to
@@ -8118,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
-       INIT_LIST_HEAD(&ei->ordered_operations);
        RB_CLEAR_NODE(&ei->rb_node);
 
        return inode;
@@ -8158,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode)
        if (!root)
                goto free;
 
-       /*
-        * Make sure we're properly removed from the ordered operation
-        * lists.
-        */
-       smp_mb();
-       if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-               spin_lock(&root->fs_info->ordered_root_lock);
-               list_del_init(&BTRFS_I(inode)->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_root_lock);
-       }
-
        if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
                     &BTRFS_I(inode)->runtime_flags)) {
                btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8350,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        ret = 0;
 
        /*
-        * we're using rename to replace one file with another.
-        * and the replacement file is large.  Start IO on it now so
-        * we don't add too much work to the end of the transaction
+        * we're using rename to replace one file with another.  Start IO on it
+        * now so  we don't add too much work to the end of the transaction
         */
-       if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
-           old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+       if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
                filemap_flush(old_inode->i_mapping);
 
        /* close the racy window with snapshot create/destroy ioctl */
@@ -8403,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                 */
                btrfs_pin_log_trans(root);
        }
-       /*
-        * make sure the inode gets flushed if it is replacing
-        * something.
-        */
-       if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
-               btrfs_add_ordered_operation(trans, root, old_inode);
 
        inode_inc_iversion(old_dir);
        inode_inc_iversion(new_dir);
index 7187b14faa6cd0c1c846fcfb155f431102ce8bad..963895c1f801dcbbde3ce9c5e621c91308793f7a 100644 (file)
@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
 
        trace_btrfs_ordered_extent_remove(inode, entry);
 
-       /*
-        * we have no more ordered extents for this inode and
-        * no dirty pages.  We can safely remove it from the
-        * list of ordered extents
-        */
-       if (RB_EMPTY_ROOT(&tree->tree) &&
-           !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
-               spin_lock(&root->fs_info->ordered_root_lock);
-               list_del_init(&BTRFS_I(inode)->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_root_lock);
-       }
-
        if (!root->nr_ordered_extents) {
                spin_lock(&root->fs_info->ordered_root_lock);
                BUG_ON(list_empty(&root->ordered_root));
@@ -686,81 +674,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
        mutex_unlock(&fs_info->ordered_operations_mutex);
 }
 
-/*
- * this is used during transaction commit to write all the inodes
- * added to the ordered operation list.  These files must be fully on
- * disk before the transaction commits.
- *
- * we have two modes here, one is to just start the IO via filemap_flush
- * and the other is to wait for all the io.  When we wait, we have an
- * extra check to make sure the ordered operation list really is empty
- * before we return
- */
-int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root, int wait)
-{
-       struct btrfs_inode *btrfs_inode;
-       struct inode *inode;
-       struct btrfs_transaction *cur_trans = trans->transaction;
-       struct list_head splice;
-       struct list_head works;
-       struct btrfs_delalloc_work *work, *next;
-       int ret = 0;
-
-       INIT_LIST_HEAD(&splice);
-       INIT_LIST_HEAD(&works);
-
-       mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
-       spin_lock(&root->fs_info->ordered_root_lock);
-       list_splice_init(&cur_trans->ordered_operations, &splice);
-       while (!list_empty(&splice)) {
-               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-                                  ordered_operations);
-               inode = &btrfs_inode->vfs_inode;
-
-               list_del_init(&btrfs_inode->ordered_operations);
-
-               /*
-                * the inode may be getting freed (in sys_unlink path).
-                */
-               inode = igrab(inode);
-               if (!inode)
-                       continue;
-
-               if (!wait)
-                       list_add_tail(&BTRFS_I(inode)->ordered_operations,
-                                     &cur_trans->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_root_lock);
-
-               work = btrfs_alloc_delalloc_work(inode, wait, 1);
-               if (!work) {
-                       spin_lock(&root->fs_info->ordered_root_lock);
-                       if (list_empty(&BTRFS_I(inode)->ordered_operations))
-                               list_add_tail(&btrfs_inode->ordered_operations,
-                                             &splice);
-                       list_splice_tail(&splice,
-                                        &cur_trans->ordered_operations);
-                       spin_unlock(&root->fs_info->ordered_root_lock);
-                       ret = -ENOMEM;
-                       goto out;
-               }
-               list_add_tail(&work->list, &works);
-               btrfs_queue_work(root->fs_info->flush_workers,
-                                &work->work);
-
-               cond_resched();
-               spin_lock(&root->fs_info->ordered_root_lock);
-       }
-       spin_unlock(&root->fs_info->ordered_root_lock);
-out:
-       list_for_each_entry_safe(work, next, &works, list) {
-               list_del_init(&work->list);
-               btrfs_wait_and_free_delalloc_work(work);
-       }
-       mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
-       return ret;
-}
-
 /*
  * Used to start IO or wait for a given ordered extent to finish.
  *
@@ -1120,42 +1033,6 @@ out:
        return index;
 }
 
-
-/*
- * add a given inode to the list of inodes that must be fully on
- * disk before a transaction commit finishes.
- *
- * This basically gives us the ext3 style data=ordered mode, and it is mostly
- * used to make sure renamed files are fully on disk.
- *
- * It is a noop if the inode is already fully on disk.
- *
- * If trans is not null, we'll do a friendly check for a transaction that
- * is already flushing things and force the IO down ourselves.
- */
-void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root, struct inode *inode)
-{
-       struct btrfs_transaction *cur_trans = trans->transaction;
-       u64 last_mod;
-
-       last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
-
-       /*
-        * if this file hasn't been changed since the last transaction
-        * commit, we can safely return without doing anything
-        */
-       if (last_mod <= root->fs_info->last_trans_committed)
-               return;
-
-       spin_lock(&root->fs_info->ordered_root_lock);
-       if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
-               list_add_tail(&BTRFS_I(inode)->ordered_operations,
-                             &cur_trans->ordered_operations);
-       }
-       spin_unlock(&root->fs_info->ordered_root_lock);
-}
-
 int __init ordered_data_init(void)
 {
        btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
index 246897058efb009ffd521cee2d61cb60fc0bde0f..d81a274d621ee47cf2510b2323997d67567a3a88 100644 (file)
@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
                           u32 *sum, int len);
-int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root, int wait);
-void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root,
-                                struct inode *inode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
 void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
 void btrfs_get_logged_extents(struct inode *inode,
index 5f379affdf236119f4ab031ad6843a822a0ec24c..d89c6d3542cab4c55372954ae97e9e32ec1ba443 100644 (file)
@@ -218,7 +218,6 @@ loop:
        spin_lock_init(&cur_trans->delayed_refs.lock);
 
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-       INIT_LIST_HEAD(&cur_trans->ordered_operations);
        INIT_LIST_HEAD(&cur_trans->pending_chunks);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -1612,27 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
 }
 
-static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
-                                         struct btrfs_root *root)
-{
-       int ret;
-
-       ret = btrfs_run_delayed_items(trans, root);
-       if (ret)
-               return ret;
-
-       /*
-        * rename don't use btrfs_join_transaction, so, once we
-        * set the transaction to blocked above, we aren't going
-        * to get any new ordered operations.  We can safely run
-        * it here and no for sure that nothing new will be added
-        * to the list
-        */
-       ret = btrfs_run_ordered_operations(trans, root, 1);
-
-       return ret;
-}
-
 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
        if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
@@ -1653,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        struct btrfs_transaction *prev_trans = NULL;
        int ret;
 
-       ret = btrfs_run_ordered_operations(trans, root, 0);
-       if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
-               btrfs_end_transaction(trans, root);
-               return ret;
-       }
-
        /* Stop the commit early if ->aborted is set */
        if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
                ret = cur_trans->aborted;
@@ -1740,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        if (ret)
                goto cleanup_transaction;
 
-       ret = btrfs_flush_all_pending_stuffs(trans, root);
+       ret = btrfs_run_delayed_items(trans, root);
        if (ret)
                goto cleanup_transaction;
 
@@ -1748,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                   extwriter_counter_read(cur_trans) == 0);
 
        /* some pending stuffs might be added after the previous flush. */
-       ret = btrfs_flush_all_pending_stuffs(trans, root);
+       ret = btrfs_run_delayed_items(trans, root);
        if (ret)
                goto cleanup_transaction;
 
index 7dd558ed0716526fa5700c179d2f0606b8a53219..579be51b27e5ee0970feb4fb21ddf54a1d621a12 100644 (file)
@@ -55,7 +55,6 @@ struct btrfs_transaction {
        wait_queue_head_t writer_wait;
        wait_queue_head_t commit_wait;
        struct list_head pending_snapshots;
-       struct list_head ordered_operations;
        struct list_head pending_chunks;
        struct list_head switch_commits;
        struct btrfs_delayed_ref_root delayed_refs;