Merge tag 'for-5.10-rc5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave...
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 27 Nov 2020 20:42:13 +0000 (12:42 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 27 Nov 2020 20:42:13 +0000 (12:42 -0800)
Pull btrfs fixes from David Sterba:
 "A few fixes for various warnings that accumulated over past two weeks:

   - tree-checker: add missing return values for some errors

   - lockdep fixes
      - when reading qgroup config and starting quota rescan
      - reverse order of quota ioctl lock and VFS freeze lock

   - avoid accessing potentially stale fs info during device scan,
     reported by syzbot

   - add scope NOFS protection around qgroup relation changes

   - check for running transaction before flushing qgroups

   - fix tracking of new delalloc ranges for some cases"

* tag 'for-5.10-rc5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix lockdep splat when enabling and disabling qgroups
  btrfs: do nofs allocations when adding and removing qgroup relations
  btrfs: fix lockdep splat when reading qgroup config on mount
  btrfs: tree-checker: add missing returns after data_ref alignment checks
  btrfs: don't access possibly stale fs_info data for printing duplicate device
  btrfs: tree-checker: add missing return after error in root_item
  btrfs: qgroup: don't commit transaction when we already hold the handle
  btrfs: fix missing delalloc new bit for new delalloc ranges

fs/btrfs/ctree.h
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/qgroup.c
fs/btrfs/tests/inode-tests.c
fs/btrfs/tree-checker.c
fs/btrfs/volumes.c

index 0378933d163c6d27dca020d960dc0a782fd25fba..0b29bdb251050c60c555c4e1de6c22fd6061838e 100644 (file)
@@ -878,7 +878,10 @@ struct btrfs_fs_info {
         */
        struct ulist *qgroup_ulist;
 
-       /* protect user change for quota operations */
+       /*
+        * Protect user change for quota operations. If a transaction is needed,
+        * it must be started before locking this lock.
+        */
        struct mutex qgroup_ioctl_lock;
 
        /* list of dirty qgroups to be written at next commit */
index 87355a38a654702e154e3c9490d0c3da590743d6..4373da7bcc0d58a3d63fe33afdbd6547a17bf7a1 100644 (file)
@@ -452,46 +452,6 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
        }
 }
 
-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
-                                        const u64 start,
-                                        const u64 len,
-                                        struct extent_state **cached_state)
-{
-       u64 search_start = start;
-       const u64 end = start + len - 1;
-
-       while (search_start < end) {
-               const u64 search_len = end - search_start + 1;
-               struct extent_map *em;
-               u64 em_len;
-               int ret = 0;
-
-               em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
-               if (IS_ERR(em))
-                       return PTR_ERR(em);
-
-               if (em->block_start != EXTENT_MAP_HOLE)
-                       goto next;
-
-               em_len = em->len;
-               if (em->start < search_start)
-                       em_len -= search_start - em->start;
-               if (em_len > search_len)
-                       em_len = search_len;
-
-               ret = set_extent_bit(&inode->io_tree, search_start,
-                                    search_start + em_len - 1,
-                                    EXTENT_DELALLOC_NEW,
-                                    NULL, cached_state, GFP_NOFS);
-next:
-               search_start = extent_map_end(em);
-               free_extent_map(em);
-               if (ret)
-                       return ret;
-       }
-       return 0;
-}
-
 /*
  * after copy_from_user, pages need to be dirtied and we need to make
  * sure holes are created between the current EOF and the start of
@@ -528,23 +488,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
                         0, 0, cached);
 
-       if (!btrfs_is_free_space_inode(inode)) {
-               if (start_pos >= isize &&
-                   !(inode->flags & BTRFS_INODE_PREALLOC)) {
-                       /*
-                        * There can't be any extents following eof in this case
-                        * so just set the delalloc new bit for the range
-                        * directly.
-                        */
-                       extra_bits |= EXTENT_DELALLOC_NEW;
-               } else {
-                       err = btrfs_find_new_delalloc_bytes(inode, start_pos,
-                                                           num_bytes, cached);
-                       if (err)
-                               return err;
-               }
-       }
-
        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
                                        extra_bits, cached);
        if (err)
index da58c58ef9aa1cbb598306ca81af3583979b5f1b..7e8d8169779d248def4688b8f80ad8c10be4e83b 100644 (file)
@@ -2253,11 +2253,69 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+                                        const u64 start,
+                                        const u64 len,
+                                        struct extent_state **cached_state)
+{
+       u64 search_start = start;
+       const u64 end = start + len - 1;
+
+       while (search_start < end) {
+               const u64 search_len = end - search_start + 1;
+               struct extent_map *em;
+               u64 em_len;
+               int ret = 0;
+
+               em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+               if (IS_ERR(em))
+                       return PTR_ERR(em);
+
+               if (em->block_start != EXTENT_MAP_HOLE)
+                       goto next;
+
+               em_len = em->len;
+               if (em->start < search_start)
+                       em_len -= search_start - em->start;
+               if (em_len > search_len)
+                       em_len = search_len;
+
+               ret = set_extent_bit(&inode->io_tree, search_start,
+                                    search_start + em_len - 1,
+                                    EXTENT_DELALLOC_NEW,
+                                    NULL, cached_state, GFP_NOFS);
+next:
+               search_start = extent_map_end(em);
+               free_extent_map(em);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                              unsigned int extra_bits,
                              struct extent_state **cached_state)
 {
        WARN_ON(PAGE_ALIGNED(end));
+
+       if (start >= i_size_read(&inode->vfs_inode) &&
+           !(inode->flags & BTRFS_INODE_PREALLOC)) {
+               /*
+                * There can't be any extents following eof in this case so just
+                * set the delalloc new bit for the range directly.
+                */
+               extra_bits |= EXTENT_DELALLOC_NEW;
+       } else {
+               int ret;
+
+               ret = btrfs_find_new_delalloc_bytes(inode, start,
+                                                   end + 1 - start,
+                                                   cached_state);
+               if (ret)
+                       return ret;
+       }
+
        return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
                                   cached_state);
 }
index 77c54749f432921b8812d8c6ed475055e9780f49..87bd37b70738ec58676f0d338a13906f1f9b6db6 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/btrfs.h>
+#include <linux/sched/mm.h>
 
 #include "ctree.h"
 #include "transaction.h"
@@ -497,13 +498,13 @@ next2:
                        break;
        }
 out:
+       btrfs_free_path(path);
        fs_info->qgroup_flags |= flags;
        if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
                clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
                 ret >= 0)
                ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
-       btrfs_free_path(path);
 
        if (ret < 0) {
                ulist_free(fs_info->qgroup_ulist);
@@ -936,6 +937,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
        struct btrfs_key found_key;
        struct btrfs_qgroup *qgroup = NULL;
        struct btrfs_trans_handle *trans = NULL;
+       struct ulist *ulist = NULL;
        int ret = 0;
        int slot;
 
@@ -943,8 +945,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
        if (fs_info->quota_root)
                goto out;
 
-       fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
-       if (!fs_info->qgroup_ulist) {
+       ulist = ulist_alloc(GFP_KERNEL);
+       if (!ulist) {
                ret = -ENOMEM;
                goto out;
        }
@@ -952,6 +954,22 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
        ret = btrfs_sysfs_add_qgroups(fs_info);
        if (ret < 0)
                goto out;
+
+       /*
+        * Unlock qgroup_ioctl_lock before starting the transaction. This is to
+        * avoid lock acquisition inversion problems (reported by lockdep) between
+        * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
+        * start a transaction.
+        * After we started the transaction lock qgroup_ioctl_lock again and
+        * check if someone else created the quota root in the meanwhile. If so,
+        * just return success and release the transaction handle.
+        *
+        * Also we don't need to worry about someone else calling
+        * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
+        * that function returns 0 (success) when the sysfs entries already exist.
+        */
+       mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
        /*
         * 1 for quota root item
         * 1 for BTRFS_QGROUP_STATUS item
@@ -961,12 +979,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
         * would be a lot of overkill.
         */
        trans = btrfs_start_transaction(tree_root, 2);
+
+       mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }
 
+       if (fs_info->quota_root)
+               goto out;
+
+       fs_info->qgroup_ulist = ulist;
+       ulist = NULL;
+
        /*
         * initially create the quota tree
         */
@@ -1124,11 +1150,14 @@ out:
        if (ret) {
                ulist_free(fs_info->qgroup_ulist);
                fs_info->qgroup_ulist = NULL;
-               if (trans)
-                       btrfs_end_transaction(trans);
                btrfs_sysfs_del_qgroups(fs_info);
        }
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
+       if (ret && trans)
+               btrfs_end_transaction(trans);
+       else if (trans)
+               ret = btrfs_end_transaction(trans);
+       ulist_free(ulist);
        return ret;
 }
 
@@ -1141,19 +1170,29 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root)
                goto out;
+       mutex_unlock(&fs_info->qgroup_ioctl_lock);
 
        /*
         * 1 For the root item
         *
         * We should also reserve enough items for the quota tree deletion in
         * btrfs_clean_quota_tree but this is not done.
+        *
+        * Also, we must always start a transaction without holding the mutex
+        * qgroup_ioctl_lock, see btrfs_quota_enable().
         */
        trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+       mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
+               trans = NULL;
                goto out;
        }
 
+       if (!fs_info->quota_root)
+               goto out;
+
        clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        btrfs_qgroup_wait_for_completion(fs_info, false);
        spin_lock(&fs_info->qgroup_lock);
@@ -1167,13 +1206,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
        ret = btrfs_clean_quota_tree(trans, quota_root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
-               goto end_trans;
+               goto out;
        }
 
        ret = btrfs_del_root(trans, &quota_root->root_key);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
-               goto end_trans;
+               goto out;
        }
 
        list_del(&quota_root->dirty_list);
@@ -1185,10 +1224,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 
        btrfs_put_root(quota_root);
 
-end_trans:
-       ret = btrfs_end_transaction(trans);
 out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
+       if (ret && trans)
+               btrfs_end_transaction(trans);
+       else if (trans)
+               ret = btrfs_end_transaction(trans);
+
        return ret;
 }
 
@@ -1324,13 +1366,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
        struct btrfs_qgroup *member;
        struct btrfs_qgroup_list *list;
        struct ulist *tmp;
+       unsigned int nofs_flag;
        int ret = 0;
 
        /* Check the level of src and dst first */
        if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
                return -EINVAL;
 
+       /* We hold a transaction handle open, must do a NOFS allocation. */
+       nofs_flag = memalloc_nofs_save();
        tmp = ulist_alloc(GFP_KERNEL);
+       memalloc_nofs_restore(nofs_flag);
        if (!tmp)
                return -ENOMEM;
 
@@ -1387,10 +1433,14 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
        struct btrfs_qgroup_list *list;
        struct ulist *tmp;
        bool found = false;
+       unsigned int nofs_flag;
        int ret = 0;
        int ret2;
 
+       /* We hold a transaction handle open, must do a NOFS allocation. */
+       nofs_flag = memalloc_nofs_save();
        tmp = ulist_alloc(GFP_KERNEL);
+       memalloc_nofs_restore(nofs_flag);
        if (!tmp)
                return -ENOMEM;
 
@@ -3512,6 +3562,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
 {
        struct btrfs_trans_handle *trans;
        int ret;
+       bool can_commit = true;
 
        /*
         * We don't want to run flush again and again, so if there is a running
@@ -3523,6 +3574,20 @@ static int try_flush_qgroup(struct btrfs_root *root)
                return 0;
        }
 
+       /*
+        * If current process holds a transaction, we shouldn't flush, as we
+        * assume all space reservation happens before a transaction handle is
+        * held.
+        *
+        * But there are cases like btrfs_delayed_item_reserve_metadata() where
+        * we try to reserve space with one transction handle already held.
+        * In that case we can't commit transaction, but at least try to end it
+        * and hope the started data writes can free some space.
+        */
+       if (current->journal_info &&
+           current->journal_info != BTRFS_SEND_TRANS_STUB)
+               can_commit = false;
+
        ret = btrfs_start_delalloc_snapshot(root);
        if (ret < 0)
                goto out;
@@ -3534,7 +3599,10 @@ static int try_flush_qgroup(struct btrfs_root *root)
                goto out;
        }
 
-       ret = btrfs_commit_transaction(trans);
+       if (can_commit)
+               ret = btrfs_commit_transaction(trans);
+       else
+               ret = btrfs_end_transaction(trans);
 out:
        clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
        wake_up(&root->qgroup_flush_wait);
index e6719f7db386a781565158782969c56865135809..04022069761deb90a2651d9eed9a94469268eb92 100644 (file)
@@ -983,7 +983,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
        ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
                               BTRFS_MAX_EXTENT_SIZE >> 1,
                               (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
-                              EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+                              EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+                              EXTENT_UPTODATE, 0, 0, NULL);
        if (ret) {
                test_err("clear_extent_bit returned %d", ret);
                goto out;
@@ -1050,7 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
        ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
                               BTRFS_MAX_EXTENT_SIZE + sectorsize,
                               BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
-                              EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+                              EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+                              EXTENT_UPTODATE, 0, 0, NULL);
        if (ret) {
                test_err("clear_extent_bit returned %d", ret);
                goto out;
@@ -1082,7 +1084,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 
        /* Empty */
        ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
-                              EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+                              EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+                              EXTENT_UPTODATE, 0, 0, NULL);
        if (ret) {
                test_err("clear_extent_bit returned %d", ret);
                goto out;
@@ -1097,7 +1100,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 out:
        if (ret)
                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
-                                EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+                                EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+                                EXTENT_UPTODATE, 0, 0, NULL);
        iput(inode);
        btrfs_free_dummy_root(root);
        btrfs_free_dummy_fs_info(fs_info);
index 8784b74f5232e797ceacede9c31bf19e64c9247d..ea2bb4cb58909be561f614ee01bba24994e5ff21 100644 (file)
@@ -1068,6 +1068,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
                            "invalid root item size, have %u expect %zu or %u",
                            btrfs_item_size_nr(leaf, slot), sizeof(ri),
                            btrfs_legacy_root_item_size());
+               return -EUCLEAN;
        }
 
        /*
@@ -1423,6 +1424,7 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
        "invalid item size, have %u expect aligned to %zu for key type %u",
                            btrfs_item_size_nr(leaf, slot),
                            sizeof(*dref), key->type);
+               return -EUCLEAN;
        }
        if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
                generic_err(leaf, slot,
@@ -1451,6 +1453,7 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
                        extent_err(leaf, slot,
        "invalid extent data backref offset, have %llu expect aligned to %u",
                                   offset, leaf->fs_info->sectorsize);
+                       return -EUCLEAN;
                }
        }
        return 0;
index a6406b3b8c2b4f2e27b7245e39f306a0d5a909ac..78637665166e05cdbbd4cb8e110bccc1f6fefdf4 100644 (file)
@@ -940,7 +940,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
                        if (device->bdev != path_bdev) {
                                bdput(path_bdev);
                                mutex_unlock(&fs_devices->device_list_mutex);
-                               btrfs_warn_in_rcu(device->fs_info,
+                               /*
+                                * device->fs_info may not be reliable here, so
+                                * pass in a NULL instead. This avoids a
+                                * possible use-after-free when the fs_info and
+                                * fs_info->sb are already torn down.
+                                */
+                               btrfs_warn_in_rcu(NULL,
        "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
                                                  path, devid, found_transid,
                                                  current->comm,