btrfs: Fix delalloc inodes invalidation during transaction abort
[sfrench/cifs-2.6.git] / fs / btrfs / disk-io.c
index 21f34ad0d41129d1829105c08ccbc6c5c74ddac8..c3504b4d281b5cd76bb0861b781e228bcec4b3bb 100644 (file)
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
 #include <linux/uuid.h>
 #include <linux/semaphore.h>
 #include <linux/error-injection.h>
+#include <linux/crc32c.h>
 #include <asm/unaligned.h>
 #include "ctree.h"
 #include "disk-io.h"
-#include "hash.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
@@ -110,7 +97,7 @@ int __init btrfs_end_io_wq_init(void)
        return 0;
 }
 
-void btrfs_end_io_wq_exit(void)
+void __cold btrfs_end_io_wq_exit(void)
 {
        kmem_cache_destroy(btrfs_end_io_wq_cache);
 }
@@ -124,8 +111,8 @@ struct async_submit_bio {
        void *private_data;
        struct btrfs_fs_info *fs_info;
        struct bio *bio;
-       extent_submit_bio_hook_t *submit_bio_start;
-       extent_submit_bio_hook_t *submit_bio_done;
+       extent_submit_bio_start_t *submit_bio_start;
+       extent_submit_bio_done_t *submit_bio_done;
        int mirror_num;
        unsigned long bio_flags;
        /*
@@ -270,7 +257,7 @@ out:
 
 u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
 {
-       return btrfs_crc32c(seed, data, len);
+       return crc32c(seed, data, len);
 }
 
 void btrfs_csum_final(u32 crc, u8 *result)
@@ -403,8 +390,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
 
        if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
                u32 crc = ~(u32)0;
-               const int csum_size = sizeof(crc);
-               char result[csum_size];
+               char result[sizeof(crc)];
 
                /*
                 * The super_block structure does not span the whole
@@ -415,7 +401,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
                                crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
                btrfs_csum_final(crc, result);
 
-               if (memcmp(raw_disk_sb, result, csum_size))
+               if (memcmp(raw_disk_sb, result, sizeof(result)))
                        ret = 1;
        }
 
@@ -428,13 +414,67 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
        return ret;
 }
 
+static int verify_level_key(struct btrfs_fs_info *fs_info,
+                           struct extent_buffer *eb, int level,
+                           struct btrfs_key *first_key)
+{
+       int found_level;
+       struct btrfs_key found_key;
+       int ret;
+
+       found_level = btrfs_header_level(eb);
+       if (found_level != level) {
+#ifdef CONFIG_BTRFS_DEBUG
+               WARN_ON(1);
+               btrfs_err(fs_info,
+"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
+                         eb->start, level, found_level);
+#endif
+               return -EIO;
+       }
+
+       if (!first_key)
+               return 0;
+
+       /*
+        * For live tree blocks (new tree blocks in the current transaction),
+        * we need proper lock context to avoid races, which is impossible here.
+        * So we only check tree blocks which are read from disk, i.e. those
+        * whose generation <= fs_info->last_trans_committed.
+        */
+       if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
+               return 0;
+       if (found_level)
+               btrfs_node_key_to_cpu(eb, &found_key, 0);
+       else
+               btrfs_item_key_to_cpu(eb, &found_key, 0);
+       ret = btrfs_comp_cpu_keys(first_key, &found_key);
+
+#ifdef CONFIG_BTRFS_DEBUG
+       if (ret) {
+               WARN_ON(1);
+               btrfs_err(fs_info,
+"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)",
+                         eb->start, first_key->objectid, first_key->type,
+                         first_key->offset, found_key.objectid,
+                         found_key.type, found_key.offset);
+       }
+#endif
+       return ret;
+}
+
 /*
  * helper to read a given tree block, doing retries as required when
  * the checksums don't match and we have alternate mirrors to try.
+ *
+ * @parent_transid:    expected transid, skip check if 0
+ * @level:             expected level, mandatory check
+ * @first_key:         expected key of first slot, skip check if NULL
  */
 static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
                                          struct extent_buffer *eb,
-                                         u64 parent_transid)
+                                         u64 parent_transid, int level,
+                                         struct btrfs_key *first_key)
 {
        struct extent_io_tree *io_tree;
        int failed = 0;
@@ -449,11 +489,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
                ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
                                               mirror_num);
                if (!ret) {
-                       if (!verify_parent_transid(io_tree, eb,
+                       if (verify_parent_transid(io_tree, eb,
                                                   parent_transid, 0))
-                               break;
-                       else
                                ret = -EIO;
+                       else if (verify_level_key(fs_info, eb, level,
+                                                 first_key))
+                               ret = -EUCLEAN;
+                       else
+                               break;
                }
 
                /*
@@ -461,7 +504,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
                 * there is no reason to read the other copies, they won't be
                 * any less wrong.
                 */
-               if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+               if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) ||
+                   ret == -EUCLEAN)
                        break;
 
                num_copies = btrfs_num_copies(fs_info,
@@ -602,12 +646,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
         * that we don't try and read the other copies of this block, just
         * return -EIO.
         */
-       if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
+       if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
                set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
                ret = -EIO;
        }
 
-       if (found_level > 0 && btrfs_check_node(root, eb))
+       if (found_level > 0 && btrfs_check_node(fs_info, eb))
                ret = -EIO;
 
        if (!ret)
@@ -710,14 +754,6 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
        return 0;
 }
 
-unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
-{
-       unsigned long limit = min_t(unsigned long,
-                                   info->thread_pool_size,
-                                   info->fs_devices->open_devices);
-       return 256 * limit;
-}
-
 static void run_one_async_start(struct btrfs_work *work)
 {
        struct async_submit_bio *async;
@@ -725,7 +761,6 @@ static void run_one_async_start(struct btrfs_work *work)
 
        async = container_of(work, struct  async_submit_bio, work);
        ret = async->submit_bio_start(async->private_data, async->bio,
-                                     async->mirror_num, async->bio_flags,
                                      async->bio_offset);
        if (ret)
                async->status = ret;
@@ -744,8 +779,7 @@ static void run_one_async_done(struct btrfs_work *work)
                return;
        }
 
-       async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
-                              async->bio_flags, async->bio_offset);
+       async->submit_bio_done(async->private_data, async->bio, async->mirror_num);
 }
 
 static void run_one_async_free(struct btrfs_work *work)
@@ -759,8 +793,8 @@ static void run_one_async_free(struct btrfs_work *work)
 blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags,
                                 u64 bio_offset, void *private_data,
-                                extent_submit_bio_hook_t *submit_bio_start,
-                                extent_submit_bio_hook_t *submit_bio_done)
+                                extent_submit_bio_start_t *submit_bio_start,
+                                extent_submit_bio_done_t *submit_bio_done)
 {
        struct async_submit_bio *async;
 
@@ -807,8 +841,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
        return errno_to_blk_status(ret);
 }
 
-static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio,
-                                            int mirror_num, unsigned long bio_flags,
+static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
                                             u64 bio_offset)
 {
        /*
@@ -818,9 +851,8 @@ static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio
        return btree_csum_one_bio(bio);
 }
 
-static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
-                                           int mirror_num, unsigned long bio_flags,
-                                           u64 bio_offset)
+static blk_status_t btree_submit_bio_done(void *private_data, struct bio *bio,
+                                           int mirror_num)
 {
        struct inode *inode = private_data;
        blk_status_t ret;
@@ -879,8 +911,8 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
                 */
                ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
                                          bio_offset, private_data,
-                                         __btree_submit_bio_start,
-                                         __btree_submit_bio_done);
+                                         btree_submit_bio_start,
+                                         btree_submit_bio_done);
        }
 
        if (ret)
@@ -1062,8 +1094,17 @@ void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
                                buf->start, buf->start + buf->len - 1);
 }
 
+/*
+ * Read the tree block at logical address @bytenr and perform various basic but
+ * critical verifications.
+ *
+ * @parent_transid:    expected transid of this tree block, skip check if 0
+ * @level:             expected level, mandatory check
+ * @first_key:         expected key in slot 0, skip check if NULL
+ */
 struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
-                                     u64 parent_transid)
+                                     u64 parent_transid, int level,
+                                     struct btrfs_key *first_key)
 {
        struct extent_buffer *buf = NULL;
        int ret;
@@ -1072,7 +1113,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
        if (IS_ERR(buf))
                return buf;
 
-       ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
+       ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+                                            level, first_key);
        if (ret) {
                free_extent_buffer(buf);
                return ERR_PTR(ret);
@@ -1108,7 +1150,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
        if (!writers)
                return ERR_PTR(-ENOMEM);
 
-       ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
+       ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
        if (ret < 0) {
                kfree(writers);
                return ERR_PTR(ret);
@@ -1160,6 +1202,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        spin_lock_init(&root->accounting_lock);
        spin_lock_init(&root->log_extents_lock[0]);
        spin_lock_init(&root->log_extents_lock[1]);
+       spin_lock_init(&root->qgroup_meta_rsv_lock);
        mutex_init(&root->objectid_mutex);
        mutex_init(&root->log_mutex);
        mutex_init(&root->ordered_extent_mutex);
@@ -1176,7 +1219,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        atomic_set(&root->orphan_inodes, 0);
        refcount_set(&root->refs, 1);
        atomic_set(&root->will_be_snapshotted, 0);
-       atomic64_set(&root->qgroup_meta_rsv, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
@@ -1401,6 +1443,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
        struct btrfs_path *path;
        u64 generation;
        int ret;
+       int level;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -1423,9 +1466,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
        }
 
        generation = btrfs_root_generation(&root->root_item);
+       level = btrfs_root_level(&root->root_item);
        root->node = read_tree_block(fs_info,
                                     btrfs_root_bytenr(&root->root_item),
-                                    generation);
+                                    generation, level, NULL);
        if (IS_ERR(root->node)) {
                ret = PTR_ERR(root->node);
                goto find_fail;
@@ -1780,6 +1824,7 @@ static int transaction_kthread(void *arg)
 
                now = get_seconds();
                if (cur->state < TRANS_STATE_BLOCKED &&
+                   !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
                    (now < cur->start_time ||
                     now - cur->start_time < fs_info->commit_interval)) {
                        spin_unlock(&fs_info->trans_lock);
@@ -1808,12 +1853,10 @@ sleep:
                if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
                                      &fs_info->fs_state)))
                        btrfs_cleanup_transaction(fs_info);
-               set_current_state(TASK_INTERRUPTIBLE);
                if (!kthread_should_stop() &&
                                (!btrfs_transaction_blocked(fs_info) ||
                                 cannot_commit))
-                       schedule_timeout(delay);
-               __set_current_state(TASK_RUNNING);
+                       schedule_timeout_interruptible(delay);
        } while (!kthread_should_stop());
        return 0;
 }
@@ -2183,7 +2226,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
 static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
                struct btrfs_fs_devices *fs_devices)
 {
-       int max_active = fs_info->thread_pool_size;
+       u32 max_active = fs_info->thread_pool_size;
        unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
 
        fs_info->workers =
@@ -2276,6 +2319,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
        struct btrfs_root *log_tree_root;
        struct btrfs_super_block *disk_super = fs_info->super_copy;
        u64 bytenr = btrfs_super_log_root(disk_super);
+       int level = btrfs_super_log_root_level(disk_super);
 
        if (fs_devices->rw_devices == 0) {
                btrfs_warn(fs_info, "log replay required on RO media");
@@ -2289,7 +2333,8 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
        __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
 
        log_tree_root->node = read_tree_block(fs_info, bytenr,
-                                             fs_info->generation + 1);
+                                             fs_info->generation + 1,
+                                             level, NULL);
        if (IS_ERR(log_tree_root->node)) {
                btrfs_warn(fs_info, "failed to read log tree");
                ret = PTR_ERR(log_tree_root->node);
@@ -2334,23 +2379,29 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
        location.offset = 0;
 
        root = btrfs_read_tree_root(tree_root, &location);
-       if (IS_ERR(root))
-               return PTR_ERR(root);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
+               goto out;
+       }
        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
        fs_info->extent_root = root;
 
        location.objectid = BTRFS_DEV_TREE_OBJECTID;
        root = btrfs_read_tree_root(tree_root, &location);
-       if (IS_ERR(root))
-               return PTR_ERR(root);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
+               goto out;
+       }
        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
        fs_info->dev_root = root;
        btrfs_init_devices_late(fs_info);
 
        location.objectid = BTRFS_CSUM_TREE_OBJECTID;
        root = btrfs_read_tree_root(tree_root, &location);
-       if (IS_ERR(root))
-               return PTR_ERR(root);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
+               goto out;
+       }
        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
        fs_info->csum_root = root;
 
@@ -2367,7 +2418,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
        if (IS_ERR(root)) {
                ret = PTR_ERR(root);
                if (ret != -ENOENT)
-                       return ret;
+                       goto out;
        } else {
                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                fs_info->uuid_root = root;
@@ -2376,13 +2427,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
                location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
                root = btrfs_read_tree_root(tree_root, &location);
-               if (IS_ERR(root))
-                       return PTR_ERR(root);
+               if (IS_ERR(root)) {
+                       ret = PTR_ERR(root);
+                       goto out;
+               }
                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                fs_info->free_space_root = root;
        }
 
        return 0;
+out:
+       btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
+                  location.objectid, ret);
+       return ret;
 }
 
 int open_ctree(struct super_block *sb,
@@ -2404,8 +2461,8 @@ int open_ctree(struct super_block *sb,
        int err = -EINVAL;
        int num_backups_tried = 0;
        int backup_index = 0;
-       int max_active;
        int clear_free_space_tree = 0;
+       int level;
 
        tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
        chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
@@ -2447,6 +2504,8 @@ int open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
        INIT_LIST_HEAD(&fs_info->delalloc_roots);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
+       INIT_LIST_HEAD(&fs_info->pending_raid_kobjs);
+       spin_lock_init(&fs_info->pending_raid_kobjs_lock);
        spin_lock_init(&fs_info->delalloc_root_lock);
        spin_lock_init(&fs_info->trans_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
@@ -2713,8 +2772,6 @@ int open_ctree(struct super_block *sb,
                goto fail_alloc;
        }
 
-       max_active = fs_info->thread_pool_size;
-
        ret = btrfs_init_workqueues(fs_info, fs_devices);
        if (ret) {
                err = ret;
@@ -2741,12 +2798,13 @@ int open_ctree(struct super_block *sb,
        }
 
        generation = btrfs_super_chunk_root_generation(disk_super);
+       level = btrfs_super_chunk_root_level(disk_super);
 
        __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
 
        chunk_root->node = read_tree_block(fs_info,
                                           btrfs_super_chunk_root(disk_super),
-                                          generation);
+                                          generation, level, NULL);
        if (IS_ERR(chunk_root->node) ||
            !extent_buffer_uptodate(chunk_root->node)) {
                btrfs_err(fs_info, "failed to read chunk root");
@@ -2768,10 +2826,10 @@ int open_ctree(struct super_block *sb,
        }
 
        /*
-        * keep the device that is marked to be the target device for the
-        * dev_replace procedure
+        * Keep the devid that is marked to be the target device for the
+        * device replace procedure
         */
-       btrfs_close_extra_devices(fs_devices, 0);
+       btrfs_free_extra_devids(fs_devices, 0);
 
        if (!fs_devices->latest_bdev) {
                btrfs_err(fs_info, "failed to read devices");
@@ -2780,10 +2838,11 @@ int open_ctree(struct super_block *sb,
 
 retry_root_backup:
        generation = btrfs_super_generation(disk_super);
+       level = btrfs_super_root_level(disk_super);
 
        tree_root->node = read_tree_block(fs_info,
                                          btrfs_super_root(disk_super),
-                                         generation);
+                                         generation, level, NULL);
        if (IS_ERR(tree_root->node) ||
            !extent_buffer_uptodate(tree_root->node)) {
                btrfs_warn(fs_info, "failed to read tree root");
@@ -2834,7 +2893,7 @@ retry_root_backup:
                goto fail_block_groups;
        }
 
-       btrfs_close_extra_devices(fs_devices, 1);
+       btrfs_free_extra_devids(fs_devices, 1);
 
        ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
        if (ret) {
@@ -2953,6 +3012,7 @@ retry_root_backup:
        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (IS_ERR(fs_info->fs_root)) {
                err = PTR_ERR(fs_info->fs_root);
+               btrfs_warn(fs_info, "failed to read fs tree: %d", err);
                goto fail_qgroup;
        }
 
@@ -3290,6 +3350,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
        struct buffer_head *bh;
        int i;
        int errors = 0;
+       bool primary_failed = false;
        u64 bytenr;
 
        if (max_mirrors == 0)
@@ -3306,11 +3367,16 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
                                      BTRFS_SUPER_INFO_SIZE);
                if (!bh) {
                        errors++;
+                       if (i == 0)
+                               primary_failed = true;
                        continue;
                }
                wait_on_buffer(bh);
-               if (!buffer_uptodate(bh))
+               if (!buffer_uptodate(bh)) {
                        errors++;
+                       if (i == 0)
+                               primary_failed = true;
+               }
 
                /* drop our reference */
                brelse(bh);
@@ -3319,6 +3385,13 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
                brelse(bh);
        }
 
+       /* log error, force error return */
+       if (primary_failed) {
+               btrfs_err(device->fs_info, "error writing primary super block to device %llu",
+                         device->devid);
+               return -1;
+       }
+
        return errors < i ? 0 : -1;
 }
 
@@ -3735,7 +3808,8 @@ void close_ctree(struct btrfs_fs_info *fs_info)
                        btrfs_err(fs_info, "commit super ret %d", ret);
        }
 
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
+           test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
                btrfs_error_commit_super(fs_info);
 
        kthread_stop(fs_info->transaction_kthread);
@@ -3744,6 +3818,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
        set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
 
        btrfs_free_qgroup_config(fs_info);
+       ASSERT(list_empty(&fs_info->delalloc_roots));
 
        if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
                btrfs_info(fs_info, "at unmount delalloc count %lld",
@@ -3851,7 +3926,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
         * So here we should only check item pointers, not item data.
         */
        if (btrfs_header_level(buf) == 0 &&
-           btrfs_check_leaf_relaxed(root, buf)) {
+           btrfs_check_leaf_relaxed(fs_info, buf)) {
                btrfs_print_leaf(buf);
                ASSERT(0);
        }
@@ -3890,12 +3965,14 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
        __btrfs_btree_balance_dirty(fs_info, 0);
 }
 
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
+                     struct btrfs_key *first_key)
 {
        struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
 
-       return btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
+       return btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+                                             level, first_key);
 }
 
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
@@ -4049,15 +4126,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
 
 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
 {
+       /* cleanup FS via transaction */
+       btrfs_cleanup_transaction(fs_info);
+
        mutex_lock(&fs_info->cleaner_mutex);
        btrfs_run_delayed_iputs(fs_info);
        mutex_unlock(&fs_info->cleaner_mutex);
 
        down_write(&fs_info->cleanup_work_sem);
        up_write(&fs_info->cleanup_work_sem);
-
-       /* cleanup FS via transaction */
-       btrfs_cleanup_transaction(fs_info);
 }
 
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
@@ -4182,19 +4259,23 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
        list_splice_init(&root->delalloc_inodes, &splice);
 
        while (!list_empty(&splice)) {
+               struct inode *inode = NULL;
                btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
                                               delalloc_inodes);
-
-               list_del_init(&btrfs_inode->delalloc_inodes);
-               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                         &btrfs_inode->runtime_flags);
+               __btrfs_del_delalloc_inode(root, btrfs_inode);
                spin_unlock(&root->delalloc_lock);
 
-               btrfs_invalidate_inodes(btrfs_inode->root);
-
+               /*
+                * Make sure we get a live inode and that it'll not disappear
+                * meanwhile.
+                */
+               inode = igrab(&btrfs_inode->vfs_inode);
+               if (inode) {
+                       invalidate_inode_pages2(inode->i_mapping);
+                       iput(inode);
+               }
                spin_lock(&root->delalloc_lock);
        }
-
        spin_unlock(&root->delalloc_lock);
 }
 
@@ -4210,7 +4291,6 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
        while (!list_empty(&splice)) {
                root = list_first_entry(&splice, struct btrfs_root,
                                         delalloc_root);
-               list_del_init(&root->delalloc_root);
                root = btrfs_grab_fs_root(root);
                BUG_ON(!root);
                spin_unlock(&fs_info->delalloc_root_lock);
@@ -4314,11 +4394,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group_cache,
                                         dirty_list);
-               if (!cache) {
-                       btrfs_err(fs_info, "orphan block group dirty_bgs list");
-                       spin_unlock(&cur_trans->dirty_bgs_lock);
-                       return;
-               }
 
                if (!list_empty(&cache->io_list)) {
                        spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4338,14 +4413,14 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);
 
+       /*
+        * Refer to the definition of the io_bgs member for details on why it's
+        * safe to use it without any locking.
+        */
        while (!list_empty(&cur_trans->io_bgs)) {
                cache = list_first_entry(&cur_trans->io_bgs,
                                         struct btrfs_block_group_cache,
                                         io_list);
-               if (!cache) {
-                       btrfs_err(fs_info, "orphan block group on io_bgs list");
-                       return;
-               }
 
                list_del_init(&cache->io_list);
                spin_lock(&cache->lock);