Merge tag 'afs-next-20171113' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowel...
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 24cefde30e30b1ffe58284216a64c6f4ecec2096..7208ecef70889833ac2caa7d3d5d8b4b634a4ee0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/percpu_counter.h>
+#include <linux/lockdep.h>
 #include "hash.h"
 #include "tree-log.h"
 #include "disk-io.h"
@@ -38,6 +39,7 @@
 #include "math.h"
 #include "sysfs.h"
 #include "qgroup.h"
+#include "ref-verify.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -61,9 +63,6 @@ enum {
        CHUNK_ALLOC_FORCE = 2,
 };
 
-static int update_block_group(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info, u64 bytenr,
-                             u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                               struct btrfs_fs_info *fs_info,
                                struct btrfs_delayed_ref_node *node, u64 parent,
@@ -91,17 +90,8 @@ static int find_next_key(struct btrfs_path *path, int level,
 static void dump_space_info(struct btrfs_fs_info *fs_info,
                            struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
-static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
-                                   u64 ram_bytes, u64 num_bytes, int delalloc);
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
-                                    u64 num_bytes, int delalloc);
 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
                               u64 num_bytes);
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
-                                   struct btrfs_space_info *space_info,
-                                   u64 orig_bytes,
-                                   enum btrfs_reserve_flush_enum flush,
-                                   bool system_chunk);
 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
                                     struct btrfs_space_info *space_info,
                                     u64 num_bytes);
@@ -652,7 +642,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        cache->cached = BTRFS_CACHE_FAST;
        spin_unlock(&cache->lock);
 
-       if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+       if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
                mutex_lock(&caching_ctl->mutex);
                ret = load_free_space_cache(fs_info, cache);
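
The open-coded flag test above is replaced with the existing btrfs_test_opt()
helper. For reference, a mount-option macro of this shape typically expands to
a simple mask test (illustrative sketch; the real definition lives in btrfs's
ctree.h):

	#define btrfs_test_opt(fs_info, opt) \
		((fs_info)->mount_opt & BTRFS_MOUNT_##opt)
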
 
@@ -923,7 +913,7 @@ search_again:
        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
        if (head) {
                if (!mutex_trylock(&head->mutex)) {
-                       refcount_inc(&head->node.refs);
+                       refcount_inc(&head->refs);
                        spin_unlock(&delayed_refs->lock);
 
                        btrfs_release_path(path);
@@ -934,7 +924,7 @@ search_again:
                         */
                        mutex_lock(&head->mutex);
                        mutex_unlock(&head->mutex);
-                       btrfs_put_delayed_ref(&head->node);
+                       btrfs_put_delayed_ref_head(head);
                        goto search_again;
                }
                spin_lock(&head->lock);
@@ -943,7 +933,7 @@ search_again:
                else
                        BUG_ON(num_refs == 0);
 
-               num_refs += head->node.ref_mod;
+               num_refs += head->ref_mod;
                spin_unlock(&head->lock);
                mutex_unlock(&head->mutex);
        }
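
The lookup path pins the head across the lock drop exactly as before; only the
refcount moved from the embedded node (head->node.refs) into the head itself.
The idiom, as used in this hunk:

	refcount_inc(&head->refs);        /* pin so the head cannot be freed */
	spin_unlock(&delayed_refs->lock);
	btrfs_release_path(path);
	mutex_lock(&head->mutex);         /* block until the holder is done, */
	mutex_unlock(&head->mutex);       /* purely to serialize, then retry */
	btrfs_put_delayed_ref_head(head);
	goto search_again;
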
@@ -2189,16 +2179,20 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
 
 /* Can return -ENOMEM */
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-                        struct btrfs_fs_info *fs_info,
+                        struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
                         u64 root_objectid, u64 owner, u64 offset)
 {
+       struct btrfs_fs_info *fs_info = root->fs_info;
        int old_ref_mod, new_ref_mod;
        int ret;
 
        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
               root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
+       btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
+                          owner, offset, BTRFS_ADD_DELAYED_REF);
+
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                                 num_bytes, parent,
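
btrfs_inc_extent_ref() now takes the root rather than fs_info so it can feed
the new ref-verify machinery (note the ref-verify.h include added above).
Presumably the hook compiles to nothing when the verifier is not built in; a
hypothetical no-op stub matching this call site:

	/* Hypothetical stub, assuming a config-gated verifier; with
	 * verification disabled the fast path pays nothing. */
	static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr,
					     u64 num_bytes, u64 parent,
					     u64 root_objectid, u64 owner,
					     u64 offset, int action)
	{
		return 0;
	}
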
@@ -2344,7 +2338,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
 
 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
                                 struct btrfs_fs_info *fs_info,
-                                struct btrfs_delayed_ref_node *node,
+                                struct btrfs_delayed_ref_head *head,
                                 struct btrfs_delayed_extent_op *extent_op)
 {
        struct btrfs_key key;
@@ -2366,14 +2360,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
 
-       key.objectid = node->bytenr;
+       key.objectid = head->bytenr;
 
        if (metadata) {
                key.type = BTRFS_METADATA_ITEM_KEY;
                key.offset = extent_op->level;
        } else {
                key.type = BTRFS_EXTENT_ITEM_KEY;
-               key.offset = node->num_bytes;
+               key.offset = head->num_bytes;
        }
 
 again:
@@ -2390,17 +2384,17 @@ again:
                                path->slots[0]--;
                                btrfs_item_key_to_cpu(path->nodes[0], &key,
                                                      path->slots[0]);
-                               if (key.objectid == node->bytenr &&
+                               if (key.objectid == head->bytenr &&
                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
-                                   key.offset == node->num_bytes)
+                                   key.offset == head->num_bytes)
                                        ret = 0;
                        }
                        if (ret > 0) {
                                btrfs_release_path(path);
                                metadata = 0;
 
-                               key.objectid = node->bytenr;
-                               key.offset = node->num_bytes;
+                               key.objectid = head->bytenr;
+                               key.offset = head->num_bytes;
                                key.type = BTRFS_EXTENT_ITEM_KEY;
                                goto again;
                        }
@@ -2507,44 +2501,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                return 0;
        }
 
-       if (btrfs_delayed_ref_is_head(node)) {
-               struct btrfs_delayed_ref_head *head;
-               /*
-                * we've hit the end of the chain and we were supposed
-                * to insert this extent into the tree.  But, it got
-                * deleted before we ever needed to insert it, so all
-                * we have to do is clean up the accounting
-                */
-               BUG_ON(extent_op);
-               head = btrfs_delayed_node_to_head(node);
-               trace_run_delayed_ref_head(fs_info, node, head, node->action);
-
-               if (head->total_ref_mod < 0) {
-                       struct btrfs_block_group_cache *cache;
-
-                       cache = btrfs_lookup_block_group(fs_info, node->bytenr);
-                       ASSERT(cache);
-                       percpu_counter_add(&cache->space_info->total_bytes_pinned,
-                                          -node->num_bytes);
-                       btrfs_put_block_group(cache);
-               }
-
-               if (insert_reserved) {
-                       btrfs_pin_extent(fs_info, node->bytenr,
-                                        node->num_bytes, 1);
-                       if (head->is_data) {
-                               ret = btrfs_del_csums(trans, fs_info,
-                                                     node->bytenr,
-                                                     node->num_bytes);
-                       }
-               }
-
-               /* Also free its reserved qgroup space */
-               btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
-                                             head->qgroup_reserved);
-               return ret;
-       }
-
        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
                ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
@@ -2563,7 +2519,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
        struct btrfs_delayed_ref_node *ref;
 
-       if (list_empty(&head->ref_list))
+       if (RB_EMPTY_ROOT(&head->ref_tree))
                return NULL;
 
        /*
@@ -2576,12 +2532,114 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
                return list_first_entry(&head->ref_add_list,
                                struct btrfs_delayed_ref_node, add_list);
 
-       ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
-                              list);
+       ref = rb_entry(rb_first(&head->ref_tree),
+                      struct btrfs_delayed_ref_node, ref_node);
        ASSERT(list_empty(&ref->add_list));
        return ref;
 }
 
+static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+                                     struct btrfs_delayed_ref_head *head)
+{
+       spin_lock(&delayed_refs->lock);
+       head->processing = 0;
+       delayed_refs->num_heads_ready++;
+       spin_unlock(&delayed_refs->lock);
+       btrfs_delayed_ref_unlock(head);
+}
+
+static int cleanup_extent_op(struct btrfs_trans_handle *trans,
+                            struct btrfs_fs_info *fs_info,
+                            struct btrfs_delayed_ref_head *head)
+{
+       struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+       int ret;
+
+       if (!extent_op)
+               return 0;
+       head->extent_op = NULL;
+       if (head->must_insert_reserved) {
+               btrfs_free_delayed_extent_op(extent_op);
+               return 0;
+       }
+       spin_unlock(&head->lock);
+       ret = run_delayed_extent_op(trans, fs_info, head, extent_op);
+       btrfs_free_delayed_extent_op(extent_op);
+       return ret ? ret : 1;
+}
+
+static int cleanup_ref_head(struct btrfs_trans_handle *trans,
+                           struct btrfs_fs_info *fs_info,
+                           struct btrfs_delayed_ref_head *head)
+{
+       struct btrfs_delayed_ref_root *delayed_refs;
+       int ret;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+
+       ret = cleanup_extent_op(trans, fs_info, head);
+       if (ret < 0) {
+               unselect_delayed_ref_head(delayed_refs, head);
+               btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
+               return ret;
+       } else if (ret) {
+               return ret;
+       }
+
+       /*
+        * Need to drop our head ref lock and re-acquire the delayed ref lock
+        * and then re-check to make sure nobody got added.
+        */
+       spin_unlock(&head->lock);
+       spin_lock(&delayed_refs->lock);
+       spin_lock(&head->lock);
+       if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
+               spin_unlock(&head->lock);
+               spin_unlock(&delayed_refs->lock);
+               return 1;
+       }
+       delayed_refs->num_heads--;
+       rb_erase(&head->href_node, &delayed_refs->href_root);
+       RB_CLEAR_NODE(&head->href_node);
+       spin_unlock(&delayed_refs->lock);
+       spin_unlock(&head->lock);
+       atomic_dec(&delayed_refs->num_entries);
+
+       trace_run_delayed_ref_head(fs_info, head, 0);
+
+       if (head->total_ref_mod < 0) {
+               struct btrfs_block_group_cache *cache;
+
+               cache = btrfs_lookup_block_group(fs_info, head->bytenr);
+               ASSERT(cache);
+               percpu_counter_add(&cache->space_info->total_bytes_pinned,
+                                  -head->num_bytes);
+               btrfs_put_block_group(cache);
+
+               if (head->is_data) {
+                       spin_lock(&delayed_refs->lock);
+                       delayed_refs->pending_csums -= head->num_bytes;
+                       spin_unlock(&delayed_refs->lock);
+               }
+       }
+
+       if (head->must_insert_reserved) {
+               btrfs_pin_extent(fs_info, head->bytenr,
+                                head->num_bytes, 1);
+               if (head->is_data) {
+                       ret = btrfs_del_csums(trans, fs_info, head->bytenr,
+                                             head->num_bytes);
+               }
+       }
+
+       /* Also free its reserved qgroup space */
+       btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
+                                     head->qgroup_reserved);
+       btrfs_delayed_ref_unlock(head);
+       btrfs_put_delayed_ref_head(head);
+       return 0;
+}
+
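
cleanup_extent_op() and cleanup_ref_head() absorb the head-processing logic
that run_one_delayed_ref() and __btrfs_run_delayed_refs() used to open-code.
The tri-state return is the subtle part; the caller pattern, as used in
__btrfs_run_delayed_refs() below:

	/* ret < 0:  error, the head was already unselected; propagate it.
	 * ret > 0:  head->lock was dropped (an extent_op ran, or new refs
	 *           arrived while the locks were being reacquired); loop.
	 * ret == 0: head fully processed, unlocked and freed.
	 */
	ret = cleanup_ref_head(trans, fs_info, locked_ref);
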
 /*
  * Returns 0 on success or if called with an already aborted transaction.
  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
@@ -2655,11 +2713,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                if (ref && ref->seq &&
                    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
                        spin_unlock(&locked_ref->lock);
-                       spin_lock(&delayed_refs->lock);
-                       locked_ref->processing = 0;
-                       delayed_refs->num_heads_ready++;
-                       spin_unlock(&delayed_refs->lock);
-                       btrfs_delayed_ref_unlock(locked_ref);
+                       unselect_delayed_ref_head(delayed_refs, locked_ref);
                        locked_ref = NULL;
                        cond_resched();
                        count++;
@@ -2667,102 +2721,55 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                }
 
                /*
-                * record the must insert reserved flag before we
-                * drop the spin lock.
+                * We're done processing refs in this ref_head, clean everything
+                * up and move on to the next ref_head.
                 */
-               must_insert_reserved = locked_ref->must_insert_reserved;
-               locked_ref->must_insert_reserved = 0;
-
-               extent_op = locked_ref->extent_op;
-               locked_ref->extent_op = NULL;
-
                if (!ref) {
-
-
-                       /* All delayed refs have been processed, Go ahead
-                        * and send the head node to run_one_delayed_ref,
-                        * so that any accounting fixes can happen
-                        */
-                       ref = &locked_ref->node;
-
-                       if (extent_op && must_insert_reserved) {
-                               btrfs_free_delayed_extent_op(extent_op);
-                               extent_op = NULL;
-                       }
-
-                       if (extent_op) {
-                               spin_unlock(&locked_ref->lock);
-                               ret = run_delayed_extent_op(trans, fs_info,
-                                                           ref, extent_op);
-                               btrfs_free_delayed_extent_op(extent_op);
-
-                               if (ret) {
-                                       /*
-                                        * Need to reset must_insert_reserved if
-                                        * there was an error so the abort stuff
-                                        * can cleanup the reserved space
-                                        * properly.
-                                        */
-                                       if (must_insert_reserved)
-                                               locked_ref->must_insert_reserved = 1;
-                                       spin_lock(&delayed_refs->lock);
-                                       locked_ref->processing = 0;
-                                       delayed_refs->num_heads_ready++;
-                                       spin_unlock(&delayed_refs->lock);
-                                       btrfs_debug(fs_info,
-                                                   "run_delayed_extent_op returned %d",
-                                                   ret);
-                                       btrfs_delayed_ref_unlock(locked_ref);
-                                       return ret;
-                               }
+                       ret = cleanup_ref_head(trans, fs_info, locked_ref);
+                       if (ret > 0) {
+                               /* We dropped our lock, we need to loop. */
+                               ret = 0;
                                continue;
+                       } else if (ret) {
+                               return ret;
                        }
+                       locked_ref = NULL;
+                       count++;
+                       continue;
+               }

-                       /*
-                        * Need to drop our head ref lock and re-acquire the
-                        * delayed ref lock and then re-check to make sure
-                        * nobody got added.
-                        */
-                       spin_unlock(&locked_ref->lock);
-                       spin_lock(&delayed_refs->lock);
-                       spin_lock(&locked_ref->lock);
-                       if (!list_empty(&locked_ref->ref_list) ||
-                           locked_ref->extent_op) {
-                               spin_unlock(&locked_ref->lock);
-                               spin_unlock(&delayed_refs->lock);
-                               continue;
-                       }
-                       ref->in_tree = 0;
-                       delayed_refs->num_heads--;
-                       rb_erase(&locked_ref->href_node,
-                                &delayed_refs->href_root);
-                       spin_unlock(&delayed_refs->lock);
-               } else {
-                       actual_count++;
-                       ref->in_tree = 0;
-                       list_del(&ref->list);
-                       if (!list_empty(&ref->add_list))
-                               list_del(&ref->add_list);
+               actual_count++;
+               ref->in_tree = 0;
+               rb_erase(&ref->ref_node, &locked_ref->ref_tree);
+               RB_CLEAR_NODE(&ref->ref_node);
+               if (!list_empty(&ref->add_list))
+                       list_del(&ref->add_list);
+               /*
+                * When we play the delayed ref, also correct the ref_mod on
+                * the head.
+                */
+               switch (ref->action) {
+               case BTRFS_ADD_DELAYED_REF:
+               case BTRFS_ADD_DELAYED_EXTENT:
+                       locked_ref->ref_mod -= ref->ref_mod;
+                       break;
+               case BTRFS_DROP_DELAYED_REF:
+                       locked_ref->ref_mod += ref->ref_mod;
+                       break;
+               default:
+                       WARN_ON(1);
                }
                atomic_dec(&delayed_refs->num_entries);
 
-               if (!btrfs_delayed_ref_is_head(ref)) {
-                       /*
-                        * when we play the delayed ref, also correct the
-                        * ref_mod on head
-                        */
-                       switch (ref->action) {
-                       case BTRFS_ADD_DELAYED_REF:
-                       case BTRFS_ADD_DELAYED_EXTENT:
-                               locked_ref->node.ref_mod -= ref->ref_mod;
-                               break;
-                       case BTRFS_DROP_DELAYED_REF:
-                               locked_ref->node.ref_mod += ref->ref_mod;
-                               break;
-                       default:
-                               WARN_ON(1);
-                       }
-               }
+               /*
+                * Record the must_insert_reserved flag before we drop the spin
+                * lock.
+                */
+               must_insert_reserved = locked_ref->must_insert_reserved;
+               locked_ref->must_insert_reserved = 0;
+
+               extent_op = locked_ref->extent_op;
+               locked_ref->extent_op = NULL;
                spin_unlock(&locked_ref->lock);
 
                ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
@@ -2770,33 +2777,13 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 
                btrfs_free_delayed_extent_op(extent_op);
                if (ret) {
-                       spin_lock(&delayed_refs->lock);
-                       locked_ref->processing = 0;
-                       delayed_refs->num_heads_ready++;
-                       spin_unlock(&delayed_refs->lock);
-                       btrfs_delayed_ref_unlock(locked_ref);
+                       unselect_delayed_ref_head(delayed_refs, locked_ref);
                        btrfs_put_delayed_ref(ref);
                        btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
                                    ret);
                        return ret;
                }
 
-               /*
-                * If this node is a head, that means all the refs in this head
-                * have been dealt with, and we will pick the next head to deal
-                * with, so we must unlock the head and drop it from the cluster
-                * list before we release it.
-                */
-               if (btrfs_delayed_ref_is_head(ref)) {
-                       if (locked_ref->is_data &&
-                           locked_ref->total_ref_mod < 0) {
-                               spin_lock(&delayed_refs->lock);
-                               delayed_refs->pending_csums -= ref->num_bytes;
-                               spin_unlock(&delayed_refs->lock);
-                       }
-                       btrfs_delayed_ref_unlock(locked_ref);
-                       locked_ref = NULL;
-               }
                btrfs_put_delayed_ref(ref);
                count++;
                cond_resched();
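
Folding each node's ref_mod back out of the head as the node is played keeps
head->ref_mod consistent with what is still queued. A worked example
(illustrative): a head queued with two ADDs (+1 each) and one DROP (-1) starts
at head->ref_mod = +1; then

	run ADD  (ref_mod 1):  head->ref_mod -= 1;   /* ->  0 */
	run ADD  (ref_mod 1):  head->ref_mod -= 1;   /* -> -1 */
	run DROP (ref_mod 1):  head->ref_mod += 1;   /* ->  0 */

so once ref_tree is empty the head carries no stale modification.
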
@@ -3100,33 +3087,16 @@ again:
                        spin_unlock(&delayed_refs->lock);
                        goto out;
                }
+               head = rb_entry(node, struct btrfs_delayed_ref_head,
+                               href_node);
+               refcount_inc(&head->refs);
+       spin_unlock(&delayed_refs->lock);

-               while (node) {
-                       head = rb_entry(node, struct btrfs_delayed_ref_head,
-                                       href_node);
-                       if (btrfs_delayed_ref_is_head(&head->node)) {
-                               struct btrfs_delayed_ref_node *ref;
-
-                               ref = &head->node;
-                               refcount_inc(&ref->refs);
-
-                               spin_unlock(&delayed_refs->lock);
-                               /*
-                                * Mutex was contended, block until it's
-                                * released and try again
-                                */
-                               mutex_lock(&head->mutex);
-                               mutex_unlock(&head->mutex);
+               /* Mutex was contended, block until it's released and retry. */
+               mutex_lock(&head->mutex);
+       mutex_unlock(&head->mutex);

-                               btrfs_put_delayed_ref(ref);
-                               cond_resched();
-                               goto again;
-                       } else {
-                               WARN_ON(1);
-                       }
-                       node = rb_next(node);
-               }
-               spin_unlock(&delayed_refs->lock);
+               btrfs_put_delayed_ref_head(head);
                cond_resched();
                goto again;
        }
@@ -3169,6 +3139,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
        struct btrfs_delayed_data_ref *data_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_transaction *cur_trans;
+       struct rb_node *node;
        int ret = 0;
 
        cur_trans = root->fs_info->running_transaction;
@@ -3184,7 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
        }
 
        if (!mutex_trylock(&head->mutex)) {
-               refcount_inc(&head->node.refs);
+               refcount_inc(&head->refs);
                spin_unlock(&delayed_refs->lock);
 
                btrfs_release_path(path);
@@ -3195,13 +3166,18 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
                 */
                mutex_lock(&head->mutex);
                mutex_unlock(&head->mutex);
-               btrfs_put_delayed_ref(&head->node);
+               btrfs_put_delayed_ref_head(head);
                return -EAGAIN;
        }
        spin_unlock(&delayed_refs->lock);
 
        spin_lock(&head->lock);
-       list_for_each_entry(ref, &head->ref_list, list) {
+       /*
+        * XXX: We should replace this with a proper search function in the
+        * future.
+        */
+       for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+               ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
                /* If it's a shared ref we know a cross reference exists */
                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
                        ret = 1;
@@ -3351,7 +3327,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *,
-                           struct btrfs_fs_info *,
+                           struct btrfs_root *,
                            u64, u64, u64, u64, u64, u64);
 
 
@@ -3391,7 +3367,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 
                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
                        key.offset -= btrfs_file_extent_offset(buf, fi);
-                       ret = process_func(trans, fs_info, bytenr, num_bytes,
+                       ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
                                           key.offset);
                        if (ret)
@@ -3399,7 +3375,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = fs_info->nodesize;
-                       ret = process_func(trans, fs_info, bytenr, num_bytes,
+                       ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, level - 1, 0);
                        if (ret)
                                goto fail;
@@ -4836,7 +4812,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                            u64 orig, bool wait_ordered)
 {
-       struct btrfs_block_rsv *block_rsv;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
@@ -4852,8 +4827,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
        to_reclaim = items * EXTENT_SIZE_PER_ITEM;
 
        trans = (struct btrfs_trans_handle *)current->journal_info;
-       block_rsv = &fs_info->delalloc_block_rsv;
-       space_info = block_rsv->space_info;
+       space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 
        delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
@@ -4912,6 +4886,13 @@ skip_async:
        }
 }
 
+struct reserve_ticket {
+       u64 bytes;
+       int error;
+       struct list_head list;
+       wait_queue_head_t wait;
+};
+
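
struct reserve_ticket moves up here so may_commit_transaction() can peek at
the head of the ticket queues. For orientation, the ticket flow looks roughly
like this (illustrative sketch; the actual queueing lives in the reservation
path, which this diff does not touch):

	/* A reservation that cannot be satisfied immediately queues a ticket
	 * on space_info->tickets (or ->priority_tickets) and sleeps; the
	 * flusher fills tickets in order and wakes their owners. */
	struct reserve_ticket ticket = { .bytes = orig_bytes };

	init_waitqueue_head(&ticket.wait);
	list_add_tail(&ticket.list, &space_info->tickets);
	wait_event(ticket.wait, ticket.bytes == 0 || ticket.error);
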
 /**
  * maybe_commit_transaction - possibly commit the transaction if its ok to
  * @root - the root we're allocating for
@@ -4923,18 +4904,29 @@ skip_async:
  * will return -ENOSPC.
  */
 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
-                                 struct btrfs_space_info *space_info,
-                                 u64 bytes, int force)
+                                 struct btrfs_space_info *space_info)
 {
+       struct reserve_ticket *ticket = NULL;
        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
        struct btrfs_trans_handle *trans;
+       u64 bytes;
 
        trans = (struct btrfs_trans_handle *)current->journal_info;
        if (trans)
                return -EAGAIN;
 
-       if (force)
-               goto commit;
+       spin_lock(&space_info->lock);
+       if (!list_empty(&space_info->priority_tickets))
+               ticket = list_first_entry(&space_info->priority_tickets,
+                                         struct reserve_ticket, list);
+       else if (!list_empty(&space_info->tickets))
+               ticket = list_first_entry(&space_info->tickets,
+                                         struct reserve_ticket, list);
+       bytes = (ticket) ? ticket->bytes : 0;
+       spin_unlock(&space_info->lock);
+
+       if (!bytes)
+               return 0;
 
        /* See if there is enough pinned space to make this reservation */
        if (percpu_counter_compare(&space_info->total_bytes_pinned,
@@ -4949,8 +4941,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
                return -ENOSPC;
 
        spin_lock(&delayed_rsv->lock);
+       if (delayed_rsv->size > bytes)
+               bytes = 0;
+       else
+               bytes -= delayed_rsv->size;
        if (percpu_counter_compare(&space_info->total_bytes_pinned,
-                                  bytes - delayed_rsv->size) < 0) {
+                                  bytes) < 0) {
                spin_unlock(&delayed_rsv->lock);
                return -ENOSPC;
        }
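
Instead of the caller passing a byte count plus a force flag, the helper now
derives the shortfall from the first queued ticket. Worked through with
illustrative numbers:

	/* want = first ticket's bytes,        e.g. 1 MiB
	 * delayed_rsv is released by a transaction commit, so:
	 *     need = want > delayed_rsv->size ? want - delayed_rsv->size : 0
	 *          = 1 MiB - 256 KiB = 768 KiB  (for delayed_rsv->size = 256 KiB)
	 * and committing is only worth it if total_bytes_pinned >= need,
	 * since pinned bytes become usable free space only after a commit.
	 */
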
@@ -4964,13 +4960,6 @@ commit:
        return btrfs_commit_transaction(trans);
 }
 
-struct reserve_ticket {
-       u64 bytes;
-       int error;
-       struct list_head list;
-       wait_queue_head_t wait;
-};
-
 /*
  * Try to flush some data based on policy set by @state. This is only advisory
  * and may fail for various reasons. The caller is supposed to examine the
@@ -5020,8 +5009,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                        ret = 0;
                break;
        case COMMIT_TRANS:
-               ret = may_commit_transaction(fs_info, space_info,
-                                            num_bytes, 0);
+               ret = may_commit_transaction(fs_info, space_info);
                break;
        default:
                ret = -ENOSPC;
@@ -5575,11 +5563,12 @@ again:
        }
 }
 
-static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_block_rsv *block_rsv,
                                    struct btrfs_block_rsv *dest, u64 num_bytes)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
+       u64 ret;
 
        spin_lock(&block_rsv->lock);
        if (num_bytes == (u64)-1)
@@ -5594,6 +5583,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
        }
        spin_unlock(&block_rsv->lock);
 
+       ret = num_bytes;
        if (num_bytes > 0) {
                if (dest) {
                        spin_lock(&dest->lock);
@@ -5613,6 +5603,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
                        space_info_add_old_bytes(fs_info, space_info,
                                                 num_bytes);
        }
+       return ret;
 }
 
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
@@ -5636,6 +5627,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
        rsv->type = type;
 }
 
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+                                  struct btrfs_block_rsv *rsv,
+                                  unsigned short type)
+{
+       btrfs_init_block_rsv(rsv, type);
+       rsv->space_info = __find_space_info(fs_info,
+                                           BTRFS_BLOCK_GROUP_METADATA);
+}
+
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
                                              unsigned short type)
 {
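
The new btrfs_init_metadata_block_rsv() lets an rsv embedded in another
structure be initialized in place rather than heap-allocated through
btrfs_alloc_block_rsv(). A hypothetical caller (names illustrative, in the
spirit of the per-inode rsv this patch introduces):

	/* Wire an embedded rsv to the metadata space, no allocation needed. */
	btrfs_init_metadata_block_rsv(fs_info, &inode->block_rsv,
				      BTRFS_BLOCK_RSV_DELALLOC);
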
@@ -5645,9 +5645,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
        if (!block_rsv)
                return NULL;
 
-       btrfs_init_block_rsv(block_rsv, type);
-       block_rsv->space_info = __find_space_info(fs_info,
-                                                 BTRFS_BLOCK_GROUP_METADATA);
+       btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
        return block_rsv;
 }
 
@@ -5730,6 +5728,66 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
        return ret;
 }
 
+/**
+ * btrfs_inode_rsv_refill - refill the inode block rsv.
+ * @inode - the inode we are refilling.
+ * @flush - the flushing restriction.
+ *
+ * Essentially the same as btrfs_block_rsv_refill, except it uses the
+ * block_rsv->size as the minimum size.  We'll either refill the missing amount
+ * or return if we already have enough space.  This will also handle the reserve
+ * tracepoint for the reserved amount.
+ */
+int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
+                          enum btrfs_reserve_flush_enum flush)
+{
+       struct btrfs_root *root = inode->root;
+       struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+       u64 num_bytes = 0;
+       int ret = -ENOSPC;
+
+       spin_lock(&block_rsv->lock);
+       if (block_rsv->reserved < block_rsv->size)
+               num_bytes = block_rsv->size - block_rsv->reserved;
+       spin_unlock(&block_rsv->lock);
+
+       if (num_bytes == 0)
+               return 0;
+
+       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+       if (!ret) {
+               block_rsv_add_bytes(block_rsv, num_bytes, 0);
+               trace_btrfs_space_reservation(root->fs_info, "delalloc",
+                                             btrfs_ino(inode), num_bytes, 1);
+       }
+       return ret;
+}
+
+/**
+ * btrfs_inode_rsv_release - release any excessive reservation.
+ * @inode - the inode we need to release from.
+ *
+ * This is the same as btrfs_block_rsv_release, except that it handles the
+ * tracepoint for the reservation.
+ */
+void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+{
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+       u64 released = 0;
+
+       /*
+        * Since block_rsv->size is set statically, pass num_bytes == 0 here
+        * so that block_rsv_release_bytes() frees only whatever is reserved
+        * in excess of that size.
+        */
+       released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
+       if (released > 0)
+               trace_btrfs_space_reservation(fs_info, "delalloc",
+                                             btrfs_ino(inode), released, 0);
+}
+
 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
                             struct btrfs_block_rsv *block_rsv,
                             u64 num_bytes)
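
btrfs_inode_rsv_refill() and btrfs_inode_rsv_release() together replace the
old hand-rolled delalloc accounting: block_rsv->size is a computed target,
refill tops reserved up to it, and release returns the excess. The lifecycle,
sketched from the code above:

	/* 1) dirtying pages: btrfs_delalloc_reserve_metadata()
	 *    - bumps outstanding_extents / csum_bytes under inode->lock
	 *    - recomputes block_rsv->size
	 *    - btrfs_inode_rsv_refill() reserves (size - reserved) if short
	 * 2) writeback / freeing: the counters drop, size is recomputed, and
	 *    btrfs_inode_rsv_release() hands (reserved - size) back to the
	 *    global rsv via block_rsv_release_bytes(..., num_bytes == 0);
	 *    that is why that function now returns the amount released.
	 */
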
@@ -5801,7 +5859,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 
        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        fs_info->global_block_rsv.space_info = space_info;
-       fs_info->delalloc_block_rsv.space_info = space_info;
        fs_info->trans_block_rsv.space_info = space_info;
        fs_info->empty_block_rsv.space_info = space_info;
        fs_info->delayed_block_rsv.space_info = space_info;
@@ -5821,8 +5878,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
        block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
                                (u64)-1);
-       WARN_ON(fs_info->delalloc_block_rsv.size > 0);
-       WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
        WARN_ON(fs_info->trans_block_rsv.size > 0);
        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
        WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -5834,12 +5889,15 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                  struct btrfs_fs_info *fs_info)
 {
-       if (!trans->block_rsv)
+       if (!trans->block_rsv) {
+               ASSERT(!trans->bytes_reserved);
                return;
+       }
 
        if (!trans->bytes_reserved)
                return;
 
+       ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
        trace_btrfs_space_reservation(fs_info, "transaction",
                                      trans->transid, trans->bytes_reserved, 0);
        btrfs_block_rsv_release(fs_info, trans->block_rsv,
@@ -5961,104 +6019,37 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
        btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
 }
 
-/**
- * drop_outstanding_extent - drop an outstanding extent
- * @inode: the inode we're dropping the extent for
- * @num_bytes: the number of bytes we're releasing.
- *
- * This is called when we are freeing up an outstanding extent, either called
- * after an error or after an extent is written.  This will return the number of
- * reserved extents that need to be freed.  This must be called with
- * BTRFS_I(inode)->lock held.
- */
-static unsigned drop_outstanding_extent(struct btrfs_inode *inode,
-               u64 num_bytes)
-{
-       unsigned drop_inode_space = 0;
-       unsigned dropped_extents = 0;
-       unsigned num_extents;
-
-       num_extents = count_max_extents(num_bytes);
-       ASSERT(num_extents);
-       ASSERT(inode->outstanding_extents >= num_extents);
-       inode->outstanding_extents -= num_extents;
-
-       if (inode->outstanding_extents == 0 &&
-           test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
-                              &inode->runtime_flags))
-               drop_inode_space = 1;
-
-       /*
-        * If we have more or the same amount of outstanding extents than we have
-        * reserved then we need to leave the reserved extents count alone.
-        */
-       if (inode->outstanding_extents >= inode->reserved_extents)
-               return drop_inode_space;
-
-       dropped_extents = inode->reserved_extents - inode->outstanding_extents;
-       inode->reserved_extents -= dropped_extents;
-       return dropped_extents + drop_inode_space;
-}
-
-/**
- * calc_csum_metadata_size - return the amount of metadata space that must be
- *     reserved/freed for the given bytes.
- * @inode: the inode we're manipulating
- * @num_bytes: the number of bytes in question
- * @reserve: 1 if we are reserving space, 0 if we are freeing space
- *
- * This adjusts the number of csum_bytes in the inode and then returns the
- * correct amount of metadata that must either be reserved or freed.  We
- * calculate how many checksums we can fit into one leaf and then divide the
- * number of bytes that will need to be checksumed by this value to figure out
- * how many checksums will be required.  If we are adding bytes then the number
- * may go up and we will return the number of additional bytes that must be
- * reserved.  If it is going down we will return the number of bytes that must
- * be freed.
- *
- * This must be called with BTRFS_I(inode)->lock held.
- */
-static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
-                                  int reserve)
+static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
+                                                struct btrfs_inode *inode)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-       u64 old_csums, num_csums;
-
-       if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
-               return 0;
-
-       old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
-       if (reserve)
-               inode->csum_bytes += num_bytes;
-       else
-               inode->csum_bytes -= num_bytes;
-       num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
-
-       /* No change, no need to reserve more */
-       if (old_csums == num_csums)
-               return 0;
+       struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+       u64 reserve_size = 0;
+       u64 csum_leaves;
+       unsigned outstanding_extents;

-       if (reserve)
-               return btrfs_calc_trans_metadata_size(fs_info,
-                                                     num_csums - old_csums);
+       lockdep_assert_held(&inode->lock);
+       outstanding_extents = inode->outstanding_extents;
+       if (outstanding_extents)
+               reserve_size = btrfs_calc_trans_metadata_size(fs_info,
+                                               outstanding_extents + 1);
+       csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
+                                                inode->csum_bytes);
+       reserve_size += btrfs_calc_trans_metadata_size(fs_info,
+                                                      csum_leaves);

-       return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
+       spin_lock(&block_rsv->lock);
+       block_rsv->size = reserve_size;
+       spin_unlock(&block_rsv->lock);
 }
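
btrfs_calculate_inode_block_rsv_size() collapses the old incremental
to_reserve / calc_csum_metadata_size() bookkeeping into one idempotent
formula. A worked instance (illustrative):

	/* An inode with 2 outstanding extents whose dirty range needs one
	 * checksum leaf:
	 *     reserve_size = btrfs_calc_trans_metadata_size(fs_info, 2 + 1)
	 *                  + btrfs_calc_trans_metadata_size(fs_info, 1);
	 * the "+ 1" keeps a slot for updating the inode item itself, and an
	 * inode with no outstanding extents reserves only for its csum bytes.
	 */
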
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
        struct btrfs_root *root = inode->root;
-       struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
-       u64 to_reserve = 0;
-       u64 csum_bytes;
        unsigned nr_extents;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret = 0;
        bool delalloc_lock = true;
-       u64 to_free = 0;
-       unsigned dropped;
-       bool release_extra = false;
 
        /* If we are a free space inode we need to not flush since we will be in
         * the middle of a transaction commit.  We also don't need the delalloc
 
        /* If we are a free space inode we need to not flush since we will be in
         * the middle of a transaction commit.  We also don't need the delalloc
@@ -6084,19 +6075,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 
        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 
 
        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 
+       /* Add our new extents and calculate the new rsv size. */
        spin_lock(&inode->lock);
        nr_extents = count_max_extents(num_bytes);
        spin_lock(&inode->lock);
        nr_extents = count_max_extents(num_bytes);
-       inode->outstanding_extents += nr_extents;
-
-       nr_extents = 0;
-       if (inode->outstanding_extents > inode->reserved_extents)
-               nr_extents += inode->outstanding_extents -
-                       inode->reserved_extents;
-
-       /* We always want to reserve a slot for updating the inode. */
-       to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1);
-       to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
-       csum_bytes = inode->csum_bytes;
+       btrfs_mod_outstanding_extents(inode, nr_extents);
+       inode->csum_bytes += num_bytes;
+       btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
@@ -6106,92 +6090,26 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
                        goto out_fail;
        }
 
-       ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
+       ret = btrfs_inode_rsv_refill(inode, flush);
        if (unlikely(ret)) {
                btrfs_qgroup_free_meta(root,
                                       nr_extents * fs_info->nodesize);
                goto out_fail;
        }
 
-       spin_lock(&inode->lock);
-       if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
-                            &inode->runtime_flags)) {
-               to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
-               release_extra = true;
-       }
-       inode->reserved_extents += nr_extents;
-       spin_unlock(&inode->lock);
-
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
-
-       if (to_reserve)
-               trace_btrfs_space_reservation(fs_info, "delalloc",
-                                             btrfs_ino(inode), to_reserve, 1);
-       if (release_extra)
-               btrfs_block_rsv_release(fs_info, block_rsv,
-                               btrfs_calc_trans_metadata_size(fs_info, 1));
        return 0;
 
 out_fail:
        spin_lock(&inode->lock);
-       dropped = drop_outstanding_extent(inode, num_bytes);
-       /*
-        * If the inodes csum_bytes is the same as the original
-        * csum_bytes then we know we haven't raced with any free()ers
-        * so we can just reduce our inodes csum bytes and carry on.
-        */
-       if (inode->csum_bytes == csum_bytes) {
-               calc_csum_metadata_size(inode, num_bytes, 0);
-       } else {
-               u64 orig_csum_bytes = inode->csum_bytes;
-               u64 bytes;
-
-               /*
-                * This is tricky, but first we need to figure out how much we
-                * freed from any free-ers that occurred during this
-                * reservation, so we reset ->csum_bytes to the csum_bytes
-                * before we dropped our lock, and then call the free for the
-                * number of bytes that were freed while we were trying our
-                * reservation.
-                */
-               bytes = csum_bytes - inode->csum_bytes;
-               inode->csum_bytes = csum_bytes;
-               to_free = calc_csum_metadata_size(inode, bytes, 0);
-
-
-               /*
-                * Now we need to see how much we would have freed had we not
-                * been making this reservation and our ->csum_bytes were not
-                * artificially inflated.
-                */
-               inode->csum_bytes = csum_bytes - num_bytes;
-               bytes = csum_bytes - orig_csum_bytes;
-               bytes = calc_csum_metadata_size(inode, bytes, 0);
-
-               /*
-                * Now reset ->csum_bytes to what it should be.  If bytes is
-                * more than to_free then we would have freed more space had we
-                * not had an artificially high ->csum_bytes, so we need to free
-                * the remainder.  If bytes is the same or less then we don't
-                * need to do anything, the other free-ers did the correct
-                * thing.
-                */
-               inode->csum_bytes = orig_csum_bytes - num_bytes;
-               if (bytes > to_free)
-                       to_free = bytes - to_free;
-               else
-                       to_free = 0;
-       }
+       nr_extents = count_max_extents(num_bytes);
+       btrfs_mod_outstanding_extents(inode, -nr_extents);
+       inode->csum_bytes -= num_bytes;
+       btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
-       if (dropped)
-               to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
 
-       if (to_free) {
-               btrfs_block_rsv_release(fs_info, block_rsv, to_free);
-               trace_btrfs_space_reservation(fs_info, "delalloc",
-                                             btrfs_ino(inode), to_free, 0);
-       }
+       btrfs_inode_rsv_release(inode);
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
        return ret;
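[Note: the failure path above is now just the inverse of the reservation: drop the extents we added, pull csum_bytes back down, recompute the rsv size, and let btrfs_inode_rsv_release() hand back whatever the rsv holds beyond that size. The sizing helper this patch adds earlier in the file reduces to roughly the following; a sketch inferred from its callers here, not a verbatim copy:

        /*
         * Sketch: the per-inode rsv must cover a metadata update for every
         * outstanding extent plus one for the inode item itself, plus enough
         * btree leaves to hold the pending checksums.
         */
        static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
                                                         struct btrfs_inode *inode)
        {
                struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
                u64 reserve_size = 0;

                if (inode->outstanding_extents)
                        reserve_size = btrfs_calc_trans_metadata_size(fs_info,
                                        inode->outstanding_extents + 1);
                reserve_size += btrfs_calc_trans_metadata_size(fs_info,
                                btrfs_csum_bytes_to_leaves(fs_info,
                                                           inode->csum_bytes));

                spin_lock(&block_rsv->lock);
                block_rsv->size = reserve_size;
                spin_unlock(&block_rsv->lock);
        }
]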
@@ -6199,36 +6117,55 @@ out_fail:
 
 /**
  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
- * @inode: the inode to release the reservation for
- * @num_bytes: the number of bytes we're releasing
+ * @inode: the inode to release the reservation for.
+ * @num_bytes: the number of bytes we are releasing.
  *
  * This will release the metadata reservation for an inode.  This can be called
  * once we complete IO for a given set of bytes to release their metadata
- * reservations.
+ * reservations, or on error for the same reason.
  */
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-       u64 to_free = 0;
-       unsigned dropped;
 
        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
        spin_lock(&inode->lock);
-       dropped = drop_outstanding_extent(inode, num_bytes);
-
-       if (num_bytes)
-               to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+       inode->csum_bytes -= num_bytes;
+       btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
-       if (dropped > 0)
-               to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
 
        if (btrfs_is_testing(fs_info))
                return;
 
-       trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
-                                     to_free, 0);
+       btrfs_inode_rsv_release(inode);
+}
+
+/**
+ * btrfs_delalloc_release_extents - release our outstanding_extents
+ * @inode: the inode to balance the reservation for.
+ * @num_bytes: the number of bytes we originally reserved with
+ *
+ * When we reserve space we increase outstanding_extents for the extents we may
+ * add.  Once we've set the range as delalloc or created our ordered extents we
+ * have outstanding_extents to track the real usage, so we use this to free our
+ * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
+ * with btrfs_delalloc_reserve_metadata.
+ */
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+       unsigned num_extents;
+
+       spin_lock(&inode->lock);
+       num_extents = count_max_extents(num_bytes);
+       btrfs_mod_outstanding_extents(inode, -num_extents);
+       btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+       spin_unlock(&inode->lock);
+
+       if (btrfs_is_testing(fs_info))
+               return;
 
-       btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
+       btrfs_inode_rsv_release(inode);
 }
 
 /**
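[Note: the split above is the heart of the new API: btrfs_delalloc_release_metadata() gives bytes back (IO completed, or the range never became delalloc), while btrfs_delalloc_release_extents() only drops the temporary outstanding_extents bump once the range has turned into real delalloc or ordered extents. A hedged sketch of the calling pattern the comment describes; the write step is a hypothetical stand-in, not an actual call site from this patch:

        ret = btrfs_delalloc_reserve_metadata(inode, len);
        if (ret)
                return ret;

        ret = set_range_delalloc(inode, pos, len);      /* hypothetical step */
        if (ret)
                /* nothing became delalloc: return the metadata bytes too */
                btrfs_delalloc_release_metadata(inode, len);

        /* success or failure: drop the extents reserve_metadata() added */
        btrfs_delalloc_release_extents(inode, len);
]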
@@ -6275,10 +6212,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
  * @inode: inode we're releasing space for
  * @start: start position of the space already reserved
  * @len: the len of the space already reserved
- *
- * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
- * called in the case that we don't need the metadata AND data reservations
- * anymore.  So if there is an error or we insert an inline extent.
+ * @release_bytes: the len of the space we consumed or didn't use
  *
  * This function will release the metadata space that was not used and will
  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
@@ -6286,7 +6220,8 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
  * Also it will handle the qgroup reserved space.
  */
 void btrfs_delalloc_release_space(struct inode *inode,
-                       struct extent_changeset *reserved, u64 start, u64 len)
+                                 struct extent_changeset *reserved,
+                                 u64 start, u64 len)
 {
        btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
        btrfs_free_reserved_data_space(inode, reserved, start, len);
@@ -6944,7 +6879,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        BUG_ON(!is_data && refs_to_drop != 1);
 
        if (is_data)
-               skinny_metadata = 0;
+               skinny_metadata = false;
 
        ret = lookup_extent_backref(trans, info, path, &iref,
                                    bytenr, num_bytes, parent,
@@ -7199,7 +7134,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
                goto out_delayed_unlock;
 
        spin_lock(&head->lock);
-       if (!list_empty(&head->ref_list))
+       if (!RB_EMPTY_ROOT(&head->ref_tree))
                goto out;
 
        if (head->extent_op) {
@@ -7220,9 +7155,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
         * at this point we have a head with no other entries.  Go
         * ahead and process it.
         */
-       head->node.in_tree = 0;
        rb_erase(&head->href_node, &delayed_refs->href_root);
-
+       RB_CLEAR_NODE(&head->href_node);
        atomic_dec(&delayed_refs->num_entries);
 
        /*
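[Note: one detail worth flagging in this hunk: delayed-ref heads now keep their refs in an rbtree (ref_tree) rather than a list, and the head itself is unlinked with the standard rbtree idiom of erase-then-clear so later membership checks see a detached node:

        /* <linux/rbtree.h> idiom used above: detach the node, then mark it
         * unlinked so RB_EMPTY_NODE(&head->href_node) reports true after. */
        rb_erase(&head->href_node, &delayed_refs->href_root);
        RB_CLEAR_NODE(&head->href_node);
]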
@@ -7241,7 +7175,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
                ret = 1;
 
        mutex_unlock(&head->mutex);
-       btrfs_put_delayed_ref(&head->node);
+       btrfs_put_delayed_ref_head(head);
        return ret;
 out:
        spin_unlock(&head->lock);
@@ -7263,6 +7197,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                int old_ref_mod, new_ref_mod;
 
+               btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
+                                  root->root_key.objectid,
+                                  btrfs_header_level(buf), 0,
+                                  BTRFS_DROP_DELAYED_REF);
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
                                                 buf->len, parent,
                                                 root->root_key.objectid,
@@ -7315,16 +7253,21 @@ out:
 
 /* Can return -ENOMEM */
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
-                     struct btrfs_fs_info *fs_info,
+                     struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
                      u64 owner, u64 offset)
 {
+       struct btrfs_fs_info *fs_info = root->fs_info;
        int old_ref_mod, new_ref_mod;
        int ret;
 
        if (btrfs_is_testing(fs_info))
                return 0;
 
+       if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
+               btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
+                                  root_objectid, owner, offset,
+                                  BTRFS_DROP_DELAYED_REF);
 
        /*
         * tree log blocks never actually go into the extent allocation
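[Note: the btrfs_ref_tree_mod() hooks added through these hunks belong to the new ref-verify machinery (see the ref-verify.h include at the top of the patch). They record every ref modification before the delayed ref is queued and cost nothing in normal builds; ref-verify.h presumably follows the usual kernel pattern of a real declaration under the debug Kconfig option and an inline stub otherwise, along these lines (a sketch matching the call sites here, not a verbatim copy of the header):

        #ifdef CONFIG_BTRFS_FS_REF_VERIFY
        int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr,
                               u64 num_bytes, u64 parent, u64 ref_root,
                               u64 owner, u64 offset, int action);
        #else
        static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr,
                                             u64 num_bytes, u64 parent,
                                             u64 ref_root, u64 owner,
                                             u64 offset, int action)
        {
                return 0;
        }
        #endif
]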
@@ -8292,17 +8235,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
-                                    u64 root_objectid, u64 owner,
+                                    struct btrfs_root *root, u64 owner,
                                     u64 offset, u64 ram_bytes,
                                     struct btrfs_key *ins)
 {
-       struct btrfs_fs_info *fs_info = trans->fs_info;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
 
-       BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+       BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+
+       btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
+                          root->root_key.objectid, owner, offset,
+                          BTRFS_ADD_DELAYED_EXTENT);
 
        ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
-                                        ins->offset, 0, root_objectid, owner,
+                                        ins->offset, 0,
+                                        root->root_key.objectid, owner,
                                         offset, ram_bytes,
                                         BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
        return ret;
@@ -8524,6 +8472,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                extent_op->is_data = false;
                extent_op->level = level;
 
+               btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
+                                  root_objectid, level, 0,
+                                  BTRFS_ADD_DELAYED_EXTENT);
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
                                                 ins.offset, parent,
                                                 root_objectid, level,
@@ -8880,7 +8831,7 @@ skip:
                                             ret);
                        }
                }
-               ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize,
+               ret = btrfs_free_extent(trans, root, bytenr, blocksize,
                                        parent, root->root_key.objectid,
                                        level - 1, 0);
                if (ret)
@@ -9297,7 +9248,7 @@ out:
         * don't have it in the radix (like when we recover after a power fail
         * or unmount) so we don't leak memory.
         */
-       if (!for_reloc && root_dropped == false)
+       if (!for_reloc && !root_dropped)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
                btrfs_handle_fs_error(fs_info, err, NULL);
@@ -9954,9 +9905,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
        return 0;
 }
 
-static void __link_block_group(struct btrfs_space_info *space_info,
-                              struct btrfs_block_group_cache *cache)
+static void link_block_group(struct btrfs_block_group_cache *cache)
 {
+       struct btrfs_space_info *space_info = cache->space_info;
        int index = get_block_group_index(cache);
        bool first = false;
 
@@ -10164,7 +10115,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 
                cache->space_info = space_info;
 
-               __link_block_group(space_info, cache);
+               link_block_group(cache);
 
                set_avail_alloc_bits(info, cache->flags);
                if (btrfs_chunk_readonly(info, cache->key.objectid)) {
@@ -10323,7 +10274,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                                cache->bytes_super, &cache->space_info);
        update_global_block_rsv(fs_info);
 
-       __link_block_group(cache->space_info, cache);
+       link_block_group(cache);
 
        list_add_tail(&cache->bg_list, &trans->new_bgs);
 
@@ -10373,6 +10324,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * remove it.
         */
        free_excluded_extents(fs_info, block_group);
+       btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
+                                 block_group->key.offset);
 
        memcpy(&key, &block_group->key, sizeof(key));
        index = get_block_group_index(block_group);