btrfs: fix insert_reserved error handling
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2d9074295d7f0bb327010feeccffb1630c455477..a1febf155747e05c21212b65bf2d3ff758f37868 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2366,6 +2366,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                                           insert_reserved);
        else
                BUG();
+       if (ret && insert_reserved)
+               btrfs_pin_extent(trans->fs_info, node->bytenr,
+                                node->num_bytes, 1);
        return ret;
 }
 
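This hunk is the fix named in the subject: if running a delayed ref fails after its extent was already reserved, the reserved space would otherwise stay unusable for the remainder of the mount. Pinning the range hands it to the pinned list so it is reclaimed when the transaction commits. The caller side in btrfs_run_delayed_refs_for_head (in an elided region further down) is assumed to look roughly like this; the exact debug message text is an assumption:

        ret = run_one_delayed_ref(trans, ref, extent_op,
                                  must_insert_reserved);
        /* run_one_delayed_ref() now pins the extent on failure, so the
         * caller only unwinds its own state and bails out */
        if (ret) {
                unselect_delayed_ref_head(delayed_refs, locked_ref);
                btrfs_put_delayed_ref(ref);
                btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
                return ret;
        }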
@@ -2374,7 +2377,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
        struct btrfs_delayed_ref_node *ref;
 
-       if (RB_EMPTY_ROOT(&head->ref_tree))
+       if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
                return NULL;
 
        /*
@@ -2387,7 +2390,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
                return list_first_entry(&head->ref_add_list,
                                struct btrfs_delayed_ref_node, add_list);
 
-       ref = rb_entry(rb_first(&head->ref_tree),
+       ref = rb_entry(rb_first_cached(&head->ref_tree),
                       struct btrfs_delayed_ref_node, ref_node);
        ASSERT(list_empty(&ref->add_list));
        return ref;
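The ref_tree and href_root conversions in this and the following hunks switch from a plain rb_root to rb_root_cached, which keeps a pointer to the leftmost node so rb_first_cached() is O(1) instead of O(log n). A minimal sketch of the cached-rbtree pattern (generic illustration, not code from this file):

        #include <linux/rbtree.h>
        #include <linux/types.h>

        struct item {
                struct rb_node node;
                u64 key;
        };

        static void item_insert(struct rb_root_cached *root, struct item *new)
        {
                struct rb_node **p = &root->rb_root.rb_node;
                struct rb_node *parent = NULL;
                bool leftmost = true;

                while (*p) {
                        struct item *cur = rb_entry(*p, struct item, node);

                        parent = *p;
                        if (new->key < cur->key) {
                                p = &parent->rb_left;
                        } else {
                                p = &parent->rb_right;
                                leftmost = false;   /* not the new minimum */
                        }
                }
                rb_link_node(&new->node, parent, p);
                /* pass leftmost so the cached pointer stays correct */
                rb_insert_color_cached(&new->node, root, leftmost);
        }

        /* usage: */
        struct rb_root_cached tree = RB_ROOT_CACHED;
        struct rb_node *first = rb_first_cached(&tree);  /* O(1) */

Note there is no cached variant of RB_EMPTY_ROOT(), which is why the hunks above and below test &head->ref_tree.rb_root directly.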
@@ -2448,13 +2451,13 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
        spin_unlock(&head->lock);
        spin_lock(&delayed_refs->lock);
        spin_lock(&head->lock);
-       if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
+       if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
                spin_unlock(&head->lock);
                spin_unlock(&delayed_refs->lock);
                return 1;
        }
        delayed_refs->num_heads--;
-       rb_erase(&head->href_node, &delayed_refs->href_root);
+       rb_erase_cached(&head->href_node, &delayed_refs->href_root);
        RB_CLEAR_NODE(&head->href_node);
        spin_unlock(&head->lock);
        spin_unlock(&delayed_refs->lock);
@@ -2502,102 +2505,66 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
        return 0;
 }
 
-/*
- * Returns 0 on success or if called with an already aborted transaction.
- * Returns -ENOMEM or -EIO on failure and will abort the transaction.
- */
-static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                                            unsigned long nr)
+static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
+                                       struct btrfs_trans_handle *trans)
+{
+       struct btrfs_delayed_ref_root *delayed_refs =
+               &trans->transaction->delayed_refs;
+       struct btrfs_delayed_ref_head *head = NULL;
+       int ret;
+
+       spin_lock(&delayed_refs->lock);
+       head = btrfs_select_ref_head(delayed_refs);
+       if (!head) {
+               spin_unlock(&delayed_refs->lock);
+               return head;
+       }
+
+       /*
+        * Grab the lock that says we are going to process all the refs for
+        * this head
+        */
+       ret = btrfs_delayed_ref_lock(delayed_refs, head);
+       spin_unlock(&delayed_refs->lock);
+
+       /*
+        * We may have dropped the spin lock to get the head mutex lock, and
+        * that might have given someone else time to free the head.  If that's
+        * true, it has been removed from our list and we can move on.
+        */
+       if (ret == -EAGAIN)
+               head = ERR_PTR(-EAGAIN);
+
+       return head;
+}
+
+static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
+                                   struct btrfs_delayed_ref_head *locked_ref,
+                                   unsigned long *run_refs)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
-       struct btrfs_delayed_ref_node *ref;
-       struct btrfs_delayed_ref_head *locked_ref = NULL;
        struct btrfs_delayed_extent_op *extent_op;
-       ktime_t start = ktime_get();
-       int ret;
-       unsigned long count = 0;
-       unsigned long actual_count = 0;
+       struct btrfs_delayed_ref_node *ref;
        int must_insert_reserved = 0;
+       int ret;
 
        delayed_refs = &trans->transaction->delayed_refs;
-       while (1) {
-               if (!locked_ref) {
-                       if (count >= nr)
-                               break;
 
-                       spin_lock(&delayed_refs->lock);
-                       locked_ref = btrfs_select_ref_head(trans);
-                       if (!locked_ref) {
-                               spin_unlock(&delayed_refs->lock);
-                               break;
-                       }
-
-                       /* grab the lock that says we are going to process
-                        * all the refs for this head */
-                       ret = btrfs_delayed_ref_lock(trans, locked_ref);
-                       spin_unlock(&delayed_refs->lock);
-                       /*
-                        * we may have dropped the spin lock to get the head
-                        * mutex lock, and that might have given someone else
-                        * time to free the head.  If that's true, it has been
-                        * removed from our list and we can move on.
-                        */
-                       if (ret == -EAGAIN) {
-                               locked_ref = NULL;
-                               count++;
-                               continue;
-                       }
-               }
+       lockdep_assert_held(&locked_ref->mutex);
+       lockdep_assert_held(&locked_ref->lock);
 
-               /*
-                * We need to try and merge add/drops of the same ref since we
-                * can run into issues with relocate dropping the implicit ref
-                * and then it being added back again before the drop can
-                * finish.  If we merged anything we need to re-loop so we can
-                * get a good ref.
-                * Or we can get node references of the same type that weren't
-                * merged when created due to bumps in the tree mod seq, and
-                * we need to merge them to prevent adding an inline extent
-                * backref before dropping it (triggering a BUG_ON at
-                * insert_inline_extent_backref()).
-                */
-               spin_lock(&locked_ref->lock);
-               btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
-
-               ref = select_delayed_ref(locked_ref);
-
-               if (ref && ref->seq &&
+       while ((ref = select_delayed_ref(locked_ref))) {
+               if (ref->seq &&
                    btrfs_check_delayed_seq(fs_info, ref->seq)) {
                        spin_unlock(&locked_ref->lock);
                        unselect_delayed_ref_head(delayed_refs, locked_ref);
-                       locked_ref = NULL;
-                       cond_resched();
-                       count++;
-                       continue;
-               }
-
-               /*
-                * We're done processing refs in this ref_head, clean everything
-                * up and move on to the next ref_head.
-                */
-               if (!ref) {
-                       ret = cleanup_ref_head(trans, locked_ref);
-                       if (ret > 0 ) {
-                               /* We dropped our lock, we need to loop. */
-                               ret = 0;
-                               continue;
-                       } else if (ret) {
-                               return ret;
-                       }
-                       locked_ref = NULL;
-                       count++;
-                       continue;
+                       return -EAGAIN;
                }
 
-               actual_count++;
+               (*run_refs)++;
                ref->in_tree = 0;
-               rb_erase(&ref->ref_node, &locked_ref->ref_tree);
+               rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
                RB_CLEAR_NODE(&ref->ref_node);
                if (!list_empty(&ref->add_list))
                        list_del(&ref->add_list);
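btrfs_obtain_ref_head() above has a three-way return: a locked head on success, NULL when no more heads are ready, or ERR_PTR(-EAGAIN) when the head was freed while we waited on its mutex. Since PTR_ERR(NULL) is 0, a single IS_ERR_OR_NULL() test followed by a PTR_ERR() comparison distinguishes all three. A hedged sketch of consuming such a return:

        struct btrfs_delayed_ref_head *head;

        head = btrfs_obtain_ref_head(trans);
        if (IS_ERR_OR_NULL(head)) {
                if (PTR_ERR(head) == -EAGAIN) {
                        /* head vanished under us, pick another one */
                } else {
                        /* head == NULL: nothing left to process */
                }
        }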
@@ -2619,8 +2586,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                atomic_dec(&delayed_refs->num_entries);
 
                /*
-                * Record the must-insert_reserved flag before we drop the spin
-                * lock.
+                * Record the must_insert_reserved flag before we drop the
+                * spin lock.
                 */
                must_insert_reserved = locked_ref->must_insert_reserved;
                locked_ref->must_insert_reserved = 0;
@@ -2642,10 +2609,90 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                }
 
                btrfs_put_delayed_ref(ref);
-               count++;
                cond_resched();
+
+               spin_lock(&locked_ref->lock);
+               btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
        }
 
+       return 0;
+}
+
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                                            unsigned long nr)
+{
+       struct btrfs_fs_info *fs_info = trans->fs_info;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_head *locked_ref = NULL;
+       ktime_t start = ktime_get();
+       int ret;
+       unsigned long count = 0;
+       unsigned long actual_count = 0;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       do {
+               if (!locked_ref) {
+                       locked_ref = btrfs_obtain_ref_head(trans);
+                       if (IS_ERR_OR_NULL(locked_ref)) {
+                               if (PTR_ERR(locked_ref) == -EAGAIN) {
+                                       /*
+                                        * The head was freed while we waited
+                                        * on its mutex; reset locked_ref so
+                                        * the next iteration selects a new
+                                        * head instead of dereferencing the
+                                        * ERR_PTR.
+                                        */
+                                       locked_ref = NULL;
+                                       continue;
+                               } else {
+                                       break;
+                               }
+                       }
+                       count++;
+               }
+               /*
+                * We need to try and merge add/drops of the same ref since we
+                * can run into issues with relocate dropping the implicit ref
+                * and then it being added back again before the drop can
+                * finish.  If we merged anything we need to re-loop so we can
+                * get a good ref.
+                * Or we can get node references of the same type that weren't
+                * merged when created due to bumps in the tree mod seq, and
+                * we need to merge them to prevent adding an inline extent
+                * backref before dropping it (triggering a BUG_ON at
+                * insert_inline_extent_backref()).
+                */
+               spin_lock(&locked_ref->lock);
+               btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
+
+               ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
+                                                     &actual_count);
+               if (ret < 0 && ret != -EAGAIN) {
+                       /*
+                        * Error: btrfs_run_delayed_refs_for_head has already
+                        * unlocked everything, so just bail out.
+                        */
+                       return ret;
+               } else if (!ret) {
+                       /*
+                        * Success, perform the usual cleanup of a processed
+                        * head
+                        */
+                       ret = cleanup_ref_head(trans, locked_ref);
+                       if (ret > 0) {
+                               /* We dropped our lock, we need to loop. */
+                               ret = 0;
+                               continue;
+                       } else if (ret) {
+                               return ret;
+                       }
+               }
+
+               /*
+                * Either we processed the head successfully or
+                * btrfs_run_delayed_refs_for_head returned -EAGAIN; in both
+                * cases we need to select another head.
+                */
+
+               locked_ref = NULL;
+               cond_resched();
+       } while ((nr != -1 && count < nr) || locked_ref);
+
        /*
         * We don't want to include ref heads since we can have empty ref heads
         * and those will drastically skew our runtime down since we just do
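The rewritten driver loop's exit condition deserves a close read: `} while ((nr != -1 && count < nr) || locked_ref);`. In bounded mode it loops until count reaches nr, plus however long it takes to finish a head that is still locked (cleanup_ref_head() returning 1 means we must go around again). When nr is (unsigned long)-1 the first clause is always false, and repetition is driven instead by the caller, btrfs_run_delayed_refs() further down, through its `again:` loop with run_all set. A hypothetical bounded invocation, for illustration only:

        /* process up to 32 ref heads, finishing any head in progress */
        ret = __btrfs_run_delayed_refs(trans, 32);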
@@ -2745,9 +2792,9 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
        return num_csums;
 }
 
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
-                                      struct btrfs_fs_info *fs_info)
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_rsv *global_rsv;
        u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
        u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
@@ -2782,8 +2829,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
-                                      struct btrfs_fs_info *fs_info)
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
 {
        u64 num_entries =
                atomic_read(&trans->transaction->delayed_refs.num_entries);
@@ -2791,14 +2837,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
        u64 val;
 
        smp_mb();
-       avg_runtime = fs_info->avg_delayed_ref_runtime;
+       avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
        val = num_entries * avg_runtime;
        if (val >= NSEC_PER_SEC)
                return 1;
        if (val >= NSEC_PER_SEC / 2)
                return 2;
 
-       return btrfs_check_space_for_delayed_refs(trans, fs_info);
+       return btrfs_check_space_for_delayed_refs(trans);
 }
 
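The throttling heuristic multiplies the number of pending entries by the running average of per-ref runtime. With illustrative numbers (not kernel defaults): an average of 1 ms per ref and 1,000 queued entries gives val = 10^9 ns, exactly NSEC_PER_SEC, so the function returns 1; 500 entries gives NSEC_PER_SEC / 2 and returns 2; anything less falls through to the block-reserve space check:

        /* illustrative arithmetic only */
        u64 avg_runtime = 1000 * 1000;  /* 1 ms per delayed ref */
        u64 val = 1000 * avg_runtime;   /* 1000 entries -> NSEC_PER_SEC */
        /* val >= NSEC_PER_SEC     -> return 1 (throttle)              */
        /* val >= NSEC_PER_SEC / 2 -> return 2 (borderline)            */
        /* otherwise -> btrfs_check_space_for_delayed_refs(trans)      */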
 struct async_delayed_refs {
@@ -2911,7 +2957,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_head *head;
        int ret;
        int run_all = count == (unsigned long)-1;
-       bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 
        /* We'll clean this up in btrfs_cleanup_transaction */
        if (trans->aborted)
@@ -2928,7 +2973,6 @@ again:
 #ifdef SCRAMBLE_DELAYED_REFS
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
-       trans->can_flush_pending_bgs = false;
        ret = __btrfs_run_delayed_refs(trans, count);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
@@ -2940,7 +2984,7 @@ again:
                        btrfs_create_pending_block_groups(trans);
 
                spin_lock(&delayed_refs->lock);
-               node = rb_first(&delayed_refs->href_root);
+               node = rb_first_cached(&delayed_refs->href_root);
                if (!node) {
                        spin_unlock(&delayed_refs->lock);
                        goto out;
@@ -2959,7 +3003,6 @@ again:
                goto again;
        }
 out:
-       trans->can_flush_pending_bgs = can_flush_pending_bgs;
        return 0;
 }
 
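Both removals in btrfs_run_delayed_refs() pair with the btrfs_create_pending_block_groups() rework further down: instead of every caller saving and restoring trans->can_flush_pending_bgs around the call, the helper now checks the flag itself on entry (and also takes over releasing the chunk metadata):

        /* new guard at the top of btrfs_create_pending_block_groups() */
        if (!trans->can_flush_pending_bgs)
                return;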
@@ -3040,7 +3083,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
         * XXX: We should replace this with a proper search function in the
         * future.
         */
-       for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+       for (node = rb_first_cached(&head->ref_tree); node;
+            node = rb_next(node)) {
                ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
                /* If it's a shared ref we know a cross reference exists */
                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -3139,7 +3183,6 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
 {
        struct btrfs_path *path;
        int ret;
-       int ret2;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -3151,17 +3194,9 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
                if (ret && ret != -ENOENT)
                        goto out;
 
-               ret2 = check_delayed_ref(root, path, objectid,
-                                        offset, bytenr);
-       } while (ret2 == -EAGAIN);
-
-       if (ret2 && ret2 != -ENOENT) {
-               ret = ret2;
-               goto out;
-       }
+               ret = check_delayed_ref(root, path, objectid, offset, bytenr);
+       } while (ret == -EAGAIN);
 
-       if (ret != -ENOENT || ret2 != -ENOENT)
-               ret = 0;
 out:
        btrfs_free_path(path);
        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
@@ -4533,6 +4568,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
                        goto out;
        } else {
                ret = 1;
+               space_info->max_extent_size = 0;
        }
 
        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
@@ -4554,11 +4590,9 @@ out:
         * the block groups that were made dirty during the lifetime of the
         * transaction.
         */
-       if (trans->can_flush_pending_bgs &&
-           trans->chunk_bytes_reserved >= (u64)SZ_2M) {
+       if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
                btrfs_create_pending_block_groups(trans);
-               btrfs_trans_release_chunk_metadata(trans);
-       }
+
        return ret;
 }
 
@@ -5284,7 +5318,7 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
 }
 
 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
-                               u64 num_bytes, int update_size)
+                               u64 num_bytes, bool update_size)
 {
        spin_lock(&block_rsv->lock);
        block_rsv->reserved += num_bytes;
@@ -5316,7 +5350,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
                global_rsv->full = 0;
        spin_unlock(&global_rsv->lock);
 
-       block_rsv_add_bytes(dest, num_bytes, 1);
+       block_rsv_add_bytes(dest, num_bytes, true);
        return 0;
 }
 
@@ -5479,7 +5513,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
                            struct btrfs_block_rsv *dst, u64 num_bytes,
-                           int update_size)
+                           bool update_size)
 {
        int ret;
 
@@ -5539,10 +5573,8 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
                return 0;
 
        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
-       if (!ret) {
-               block_rsv_add_bytes(block_rsv, num_bytes, 1);
-               return 0;
-       }
+       if (!ret)
+               block_rsv_add_bytes(block_rsv, num_bytes, true);
 
        return ret;
 }
@@ -5587,7 +5619,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 
        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
        if (!ret) {
-               block_rsv_add_bytes(block_rsv, num_bytes, 0);
+               block_rsv_add_bytes(block_rsv, num_bytes, false);
                return 0;
        }
 
@@ -5629,7 +5661,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
                return ret;
        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
        if (!ret) {
-               block_rsv_add_bytes(block_rsv, num_bytes, 0);
+               block_rsv_add_bytes(block_rsv, num_bytes, false);
                trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                              btrfs_ino(inode), num_bytes, 1);
 
@@ -5835,7 +5867,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                  BTRFS_RESERVE_FLUSH_ALL);
 
        if (ret == -ENOSPC && use_global_rsv)
-               ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
+               ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
 
        if (ret && qgroup_num_bytes)
                btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
@@ -6399,10 +6431,6 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
        } else {
                cache->reserved += num_bytes;
                space_info->bytes_reserved += num_bytes;
-
-               trace_btrfs_space_reservation(cache->fs_info,
-                               "space_info", space_info->flags,
-                               ram_bytes, 0);
                space_info->bytes_may_use -= ram_bytes;
                if (delalloc)
                        cache->delalloc_bytes += num_bytes;
@@ -6424,11 +6452,10 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
  * reserve set to 0 in order to clear the reservation.
  */
 
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
-                                    u64 num_bytes, int delalloc)
+static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                     u64 num_bytes, int delalloc)
 {
        struct btrfs_space_info *space_info = cache->space_info;
-       int ret = 0;
 
        spin_lock(&space_info->lock);
        spin_lock(&cache->lock);
@@ -6436,12 +6463,12 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
                space_info->bytes_readonly += num_bytes;
        cache->reserved -= num_bytes;
        space_info->bytes_reserved -= num_bytes;
+       space_info->max_extent_size = 0;
 
        if (delalloc)
                cache->delalloc_bytes -= num_bytes;
        spin_unlock(&cache->lock);
        spin_unlock(&space_info->lock);
-       return ret;
 }
 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
 {
@@ -6925,7 +6952,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
                goto out_delayed_unlock;
 
        spin_lock(&head->lock);
-       if (!RB_EMPTY_ROOT(&head->ref_tree))
+       if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
                goto out;
 
        if (head->extent_op) {
@@ -6946,7 +6973,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
         * at this point we have a head with no other entries.  Go
         * ahead and process it.
         */
-       rb_erase(&head->href_node, &delayed_refs->href_root);
+       rb_erase_cached(&head->href_node, &delayed_refs->href_root);
        RB_CLEAR_NODE(&head->href_node);
        atomic_dec(&delayed_refs->num_entries);
 
@@ -7233,6 +7260,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
        struct btrfs_block_group_cache *block_group = NULL;
        u64 search_start = 0;
        u64 max_extent_size = 0;
+       u64 max_free_space = 0;
        u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
@@ -7528,8 +7556,8 @@ unclustered_alloc:
                        spin_lock(&ctl->tree_lock);
                        if (ctl->free_space <
                            num_bytes + empty_cluster + empty_size) {
-                               if (ctl->free_space > max_extent_size)
-                                       max_extent_size = ctl->free_space;
+                               max_free_space = max(max_free_space,
+                                                    ctl->free_space);
                                spin_unlock(&ctl->tree_lock);
                                goto loop;
                        }
@@ -7696,6 +7724,8 @@ loop:
        }
 out:
        if (ret == -ENOSPC) {
+               if (!max_extent_size)
+                       max_extent_size = max_free_space;
                spin_lock(&space_info->lock);
                space_info->max_extent_size = max_extent_size;
                spin_unlock(&space_info->lock);
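ctl->free_space is the block group's total free space, not its largest contiguous extent, so feeding it into max_extent_size (as the removed lines did) could overstate what an allocation can actually get. The rework accumulates that aggregate in a separate max_free_space and only falls back to it on ENOSPC when no genuine extent size was recorded, keeping space_info->max_extent_size an honest bound for callers that retry with smaller sizes. Condensed from the two hunks above:

        if (ctl->free_space < num_bytes + empty_cluster + empty_size) {
                /* remember the best we saw, even though we can't use it */
                max_free_space = max(max_free_space, ctl->free_space);
                goto loop;
        }
        /* ... */
        if (ret == -ENOSPC) {
                if (!max_extent_size)
                        max_extent_size = max_free_space;
                space_info->max_extent_size = max_extent_size;
        }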
@@ -7977,21 +8007,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        }
 
        path = btrfs_alloc_path();
-       if (!path) {
-               btrfs_free_and_pin_reserved_extent(fs_info,
-                                                  extent_key.objectid,
-                                                  fs_info->nodesize);
+       if (!path)
                return -ENOMEM;
-       }
 
        path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
                                      &extent_key, size);
        if (ret) {
                btrfs_free_path(path);
-               btrfs_free_and_pin_reserved_extent(fs_info,
-                                                  extent_key.objectid,
-                                                  fs_info->nodesize);
                return ret;
        }
 
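These error paths can drop their cleanup because of the run_one_delayed_ref() hunk at the top of this patch: when alloc_reserved_tree_block() fails, the caller now pins the reserved extent itself, so freeing-and-pinning here as well would handle the reservation twice. The resulting division of labor, condensed:

        /* in run_one_delayed_ref(): */
        if (ret && insert_reserved)
                btrfs_pin_extent(trans->fs_info, node->bytenr,
                                 node->num_bytes, 1);
        /* so alloc_reserved_tree_block() just returns -ENOMEM or ret
         * and never touches the reservation on failure */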
@@ -8119,6 +8142,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        if (IS_ERR(buf))
                return buf;
 
+       /*
+        * Extra safety check in case the extent tree is corrupted and extent
+        * allocator chooses to use a tree block which is already used and
+        * locked.
+        */
+       if (buf->lock_owner == current->pid) {
+               btrfs_err_rl(fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+                       buf->start, btrfs_header_owner(buf), current->pid);
+               free_extent_buffer(buf);
+               return ERR_PTR(-EUCLEAN);
+       }
+
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
        btrfs_tree_lock(buf);
        clean_tree_block(fs_info, buf);
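A corrupted extent tree can hand the allocator a block that this very task already holds locked; without the new check, the btrfs_tree_lock(buf) a few lines below would deadlock against ourselves, so failing fast with -EUCLEAN and a rate-limited error is the safer response. Condensed from the hunk above (the find-create call sits in the elided part of the function):

        buf = btrfs_find_create_tree_block(fs_info, bytenr);
        if (buf->lock_owner == current->pid) {
                free_extent_buffer(buf);
                return ERR_PTR(-EUCLEAN);   /* would self-deadlock below */
        }
        btrfs_tree_lock(buf);               /* safe: the holder isn't us */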
@@ -8215,7 +8251,7 @@ try_reserve:
 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
                            struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
-       block_rsv_add_bytes(block_rsv, blocksize, 0);
+       block_rsv_add_bytes(block_rsv, blocksize, false);
        block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
 }
 
@@ -8642,7 +8678,13 @@ skip:
                        parent = 0;
                }
 
-               if (need_account) {
+               /*
+                * The reloc tree doesn't contribute to qgroup numbers, and we
+                * have already accounted them at merge time (replace_path), so
+                * we can skip the expensive subtree tracing here.
+                */
+               if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+                   need_account) {
                        ret = btrfs_qgroup_trace_subtree(trans, next,
                                                         generation, level - 1);
                        if (ret) {
@@ -8763,15 +8805,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
        if (eb == root->node) {
                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
                        parent = eb->start;
-               else
-                       BUG_ON(root->root_key.objectid !=
-                              btrfs_header_owner(eb));
+               else if (root->root_key.objectid != btrfs_header_owner(eb))
+                       goto owner_mismatch;
        } else {
                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
                        parent = path->nodes[level + 1]->start;
-               else
-                       BUG_ON(root->root_key.objectid !=
-                              btrfs_header_owner(path->nodes[level + 1]));
+               else if (root->root_key.objectid !=
+                        btrfs_header_owner(path->nodes[level + 1]))
+                       goto owner_mismatch;
        }
 
        btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
@@ -8779,6 +8820,11 @@ out:
        wc->refs[level] = 0;
        wc->flags[level] = 0;
        return 0;
+
+owner_mismatch:
+       btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
+                    btrfs_header_owner(eb), root->root_key.objectid);
+       return -EUCLEAN;
 }
 
 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -8832,6 +8878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
                        ret = walk_up_proc(trans, root, path, wc);
                        if (ret > 0)
                                return 0;
+                       if (ret < 0)
+                               return ret;
 
                        if (path->locks[level]) {
                                btrfs_tree_unlock_rw(path->nodes[level],
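The same BUG_ON-to-error conversion appears in both hunks: walk_up_proc() now reports an owner mismatch with -EUCLEAN instead of crashing, and walk_up_tree() grows the matching `if (ret < 0)` so the error actually propagates rather than being swallowed by the old `if (ret > 0)` shortcut. The pattern in miniature:

        ret = walk_up_proc(trans, root, path, wc);
        if (ret > 0)
                return 0;       /* level fully processed, keep walking */
        if (ret < 0)
                return ret;     /* new: surface -EUCLEAN to the caller */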
@@ -8875,7 +8923,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        int level;
        bool root_dropped = false;
 
-       btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
+       btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
 
        path = btrfs_alloc_path();
        if (!path) {
@@ -9613,6 +9661,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
 
                block_group = btrfs_lookup_first_block_group(info, last);
                while (block_group) {
+                       wait_block_group_cache_done(block_group);
                        spin_lock(&block_group->lock);
                        if (block_group->iref)
                                break;
@@ -10074,15 +10123,19 @@ error:
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
-       struct btrfs_block_group_cache *block_group, *tmp;
+       struct btrfs_block_group_cache *block_group;
        struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_block_group_item item;
        struct btrfs_key key;
        int ret = 0;
-       bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 
-       trans->can_flush_pending_bgs = false;
-       list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+       if (!trans->can_flush_pending_bgs)
+               return;
+
+       while (!list_empty(&trans->new_bgs)) {
+               block_group = list_first_entry(&trans->new_bgs,
+                                              struct btrfs_block_group_cache,
+                                              bg_list);
                if (ret)
                        goto next;
 
@@ -10103,7 +10156,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 next:
                list_del_init(&block_group->bg_list);
        }
-       trans->can_flush_pending_bgs = can_flush_pending_bgs;
+       btrfs_trans_release_chunk_metadata(trans);
 }
 
 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
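list_for_each_entry_safe() is replaced with a drain loop because inserting a block group item can itself allocate a new chunk, which appends more entries to trans->new_bgs while we are walking it; re-testing list_empty() on every pass picks those up. The shape of the pattern:

        while (!list_empty(&trans->new_bgs)) {
                block_group = list_first_entry(&trans->new_bgs,
                                               struct btrfs_block_group_cache,
                                               bg_list);
                /* ... insert the item; this may append to new_bgs ... */
                list_del_init(&block_group->bg_list);
        }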
@@ -10753,14 +10806,16 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
  * We don't want a transaction for this since the discard may take a
  * substantial amount of time.  We don't require that a transaction be
  * running, but we do need to take a running transaction into account
- * to ensure that we're not discarding chunks that were released in
- * the current transaction.
+ * to ensure that we're not discarding chunks that were released or
+ * allocated in the current transaction.
  *
  * Holding the chunks lock will prevent other threads from allocating
  * or releasing chunks, but it won't prevent a running transaction
  * from committing and releasing the memory that the pending chunks
  * list head uses.  For that, we need to take a reference to the
- * transaction.
+ * transaction and hold the commit root sem.  We only need to hold
+ * it while performing the free space search since we have already
+ * held back allocations.
  */
 static int btrfs_trim_free_extents(struct btrfs_device *device,
                                   u64 minlen, u64 *trimmed)
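The hunks below rework the locking so that commit_root_sem is held only for as long as the free-space search needs it (dropped before the search when no transaction is running, right after it otherwise), and the semaphore acquisition becomes killable so a fatal signal stops the loop. The per-iteration sequence, condensed from the hunks that follow:

        ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
        if (ret)
                break;
        ret = down_read_killable(&fs_info->commit_root_sem);
        if (ret) {
                mutex_unlock(&fs_info->chunk_mutex);
                break;
        }
        /* grab a reference to the running transaction, if any */
        if (!trans)
                up_read(&fs_info->commit_root_sem);
        ret = find_free_dev_extent_start(trans, device, minlen, start,
                                         &start, &len);
        if (trans) {
                up_read(&fs_info->commit_root_sem); /* search done */
                btrfs_put_transaction(trans);
        }
        /* ... */
        ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
        mutex_unlock(&fs_info->chunk_mutex);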
@@ -10770,6 +10825,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
 
        *trimmed = 0;
 
+       /* Discard not supported = nothing to do. */
+       if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+               return 0;
+
        /* Not writeable = nothing to do. */
        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
                return 0;
@@ -10787,9 +10846,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
 
                ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
                if (ret)
-                       return ret;
+                       break;
 
-               down_read(&fs_info->commit_root_sem);
+               ret = down_read_killable(&fs_info->commit_root_sem);
+               if (ret) {
+                       mutex_unlock(&fs_info->chunk_mutex);
+                       break;
+               }
 
                spin_lock(&fs_info->trans_lock);
                trans = fs_info->running_transaction;
@@ -10797,13 +10860,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
                        refcount_inc(&trans->use_count);
                spin_unlock(&fs_info->trans_lock);
 
+               if (!trans)
+                       up_read(&fs_info->commit_root_sem);
+
                ret = find_free_dev_extent_start(trans, device, minlen, start,
                                                 &start, &len);
-               if (trans)
+               if (trans) {
+                       up_read(&fs_info->commit_root_sem);
                        btrfs_put_transaction(trans);
+               }
 
                if (ret) {
-                       up_read(&fs_info->commit_root_sem);
                        mutex_unlock(&fs_info->chunk_mutex);
                        if (ret == -ENOSPC)
                                ret = 0;
@@ -10811,7 +10878,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
                }
 
                ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
-               up_read(&fs_info->commit_root_sem);
                mutex_unlock(&fs_info->chunk_mutex);
 
                if (ret)
@@ -10831,6 +10897,15 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
        return ret;
 }
 
+/*
+ * Trim the whole filesystem by:
+ * 1) trimming the free space in each block group
+ * 2) trimming the unallocated space on each device
+ *
+ * This will also continue trimming even if a block group or device encounters
+ * an error.  The return value will be the last error, or 0 if nothing bad
+ * happens.
+ */
 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 {
        struct btrfs_block_group_cache *cache = NULL;
@@ -10840,18 +10915,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
        u64 start;
        u64 end;
        u64 trimmed = 0;
-       u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+       u64 bg_failed = 0;
+       u64 dev_failed = 0;
+       int bg_ret = 0;
+       int dev_ret = 0;
        int ret = 0;
 
-       /*
-        * try to trim all FS space, our block group may start from non-zero.
-        */
-       if (range->len == total_bytes)
-               cache = btrfs_lookup_first_block_group(fs_info, range->start);
-       else
-               cache = btrfs_lookup_block_group(fs_info, range->start);
-
-       while (cache) {
+       cache = btrfs_lookup_first_block_group(fs_info, range->start);
+       for (; cache; cache = next_block_group(fs_info, cache)) {
                if (cache->key.objectid >= (range->start + range->len)) {
                        btrfs_put_block_group(cache);
                        break;
@@ -10865,13 +10936,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
                        if (!block_group_cache_done(cache)) {
                                ret = cache_block_group(cache, 0);
                                if (ret) {
-                                       btrfs_put_block_group(cache);
-                                       break;
+                                       bg_failed++;
+                                       bg_ret = ret;
+                                       continue;
                                }
                                ret = wait_block_group_cache_done(cache);
                                if (ret) {
-                                       btrfs_put_block_group(cache);
-                                       break;
+                                       bg_failed++;
+                                       bg_ret = ret;
+                                       continue;
                                }
                        }
                        ret = btrfs_trim_block_group(cache,
@@ -10882,28 +10955,40 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 
                        trimmed += group_trimmed;
                        if (ret) {
-                               btrfs_put_block_group(cache);
-                               break;
+                               bg_failed++;
+                               bg_ret = ret;
+                               continue;
                        }
                }
-
-               cache = next_block_group(fs_info, cache);
        }
 
+       if (bg_failed)
+               btrfs_warn(fs_info,
+                       "failed to trim %llu block group(s), last error %d",
+                       bg_failed, bg_ret);
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       devices = &fs_info->fs_devices->alloc_list;
-       list_for_each_entry(device, devices, dev_alloc_list) {
+       devices = &fs_info->fs_devices->devices;
+       list_for_each_entry(device, devices, dev_list) {
                ret = btrfs_trim_free_extents(device, range->minlen,
                                              &group_trimmed);
-               if (ret)
+               if (ret) {
+                       dev_failed++;
+                       dev_ret = ret;
                        break;
+               }
 
                trimmed += group_trimmed;
        }
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
+       if (dev_failed)
+               btrfs_warn(fs_info,
+                       "failed to trim %llu device(s), last error %d",
+                       dev_failed, dev_ret);
        range->len = trimmed;
-       return ret;
+       if (bg_ret)
+               return bg_ret;
+       return dev_ret;
 }
 
 /*