Btrfs: try to free metadata pages when we free btree blocks
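When a btree block's extent is freed, its pages can still sit in the btree inode's page cache doing nothing useful. The diff below makes __free_extent() call invalidate_mapping_pages() over that range for metadata extents, and in support of that it reworks delayed reference processing into clusters (btrfs_find_ref_cluster()/run_clustered_refs()), adds check_ref_cleanup() so a head ref whose extent dies before any of its refs run can be dropped early, hands pinned_mutex release off to btrfs_update_pinned_extents(), and sets path->leave_spinning around the extent-tree updates.

As a rough, standalone illustration of the page-range arithmetic the new invalidate_mapping_pages() call relies on (PAGE_CACHE_SHIFT of 12, i.e. 4K pages, is an assumption here, and the helper name is made up for the example):

#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SHIFT 12     /* assumed 4K pages; not taken from the patch */

/* Compute the page-index range covering [bytenr, bytenr + num_bytes), the
 * same arithmetic the patch feeds to invalidate_mapping_pages(). */
static void freed_extent_page_range(uint64_t bytenr, uint64_t num_bytes,
                                    uint64_t *first, uint64_t *last)
{
        *first = bytenr >> PAGE_CACHE_SHIFT;
        *last = (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT;
}

int main(void)
{
        uint64_t first, last;

        /* one 16K btree block starting at byte 16384 */
        freed_extent_page_range(16384, 16384, &first, &last);
        printf("invalidate pages %llu..%llu\n",
               (unsigned long long)first, (unsigned long long)last);
        return 0;
}

For a 16K block at byte 16384 this prints pages 4..7, the same span handed to invalidate_mapping_pages() in the __free_extent() hunk further down.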
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9b5da2b013e41598f56f849701c78bde34255057..f5e7cae63d80f8828f711833b58600a8bce71df9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -56,9 +56,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
                                         int ref_mod);
 static int update_reserved_extents(struct btrfs_root *root,
                                   u64 bytenr, u64 num, int reserve);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root,
-                         u64 bytenr, u64 num_bytes, int is_data);
 static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc,
@@ -618,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
        } else {
                goto out;
        }
+       btrfs_unlock_up_safe(path, 1);
        btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
        btrfs_release_path(root, path);
@@ -760,6 +758,7 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
                return -ENOMEM;
 
        path->reada = 1;
+       path->leave_spinning = 1;
        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;
@@ -767,8 +766,10 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
        /* first find the extent item and update its reference count */
        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
                                path, 0, 1);
-       if (ret < 0)
+       if (ret < 0) {
+               btrfs_set_path_blocking(path);
                return ret;
+       }
 
        if (ret > 0) {
                WARN_ON(1);
@@ -791,11 +792,15 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
 
        refs = btrfs_extent_refs(l, item);
        btrfs_set_extent_refs(l, item, refs + refs_to_add);
+       btrfs_unlock_up_safe(path, 1);
+
        btrfs_mark_buffer_dirty(path->nodes[0]);
 
        btrfs_release_path(root->fs_info->extent_root, path);
 
        path->reada = 1;
+       path->leave_spinning = 1;
+
        /* now insert the actual backref */
        ret = insert_extent_backref(trans, root->fs_info->extent_root,
                                    path, bytenr, parent,
@@ -926,52 +931,41 @@ again:
        return NULL;
 }
 
-/*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far.  count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
- */
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, unsigned long count)
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct list_head *cluster)
 {
-       struct rb_node *node;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        int ret;
+       int count = 0;
        int must_insert_reserved = 0;
-       int run_all = count == (unsigned long)-1;
-
-       if (root == root->fs_info->extent_root)
-               root = root->fs_info->tree_root;
 
        delayed_refs = &trans->transaction->delayed_refs;
-again:
-       spin_lock(&delayed_refs->lock);
-       if (count == 0)
-               count = delayed_refs->num_entries;
        while (1) {
                if (!locked_ref) {
-                       /*
-                        * no locked ref, go find something we can
-                        * process in the rbtree.  We start at
-                        * the beginning of the tree, there may be less
-                        * lock contention if we do something smarter here.
-                        */
-                       node = rb_first(&delayed_refs->root);
-                       if (!node) {
-                               spin_unlock(&delayed_refs->lock);
+                       /* pick a new head ref from the cluster list */
+                       if (list_empty(cluster))
                                break;
-                       }
 
-                       ref = rb_entry(node, struct btrfs_delayed_ref_node,
-                                      rb_node);
-                       ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref);
-                       if (ret) {
-                               spin_unlock(&delayed_refs->lock);
-                               break;
+                       locked_ref = list_entry(cluster->next,
+                                    struct btrfs_delayed_ref_head, cluster);
+
+                       /* grab the lock that says we are going to process
+                        * all the refs for this head */
+                       ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+                       /*
+                        * we may have dropped the spin lock to get the head
+                        * mutex lock, and that might have given someone else
+                        * time to free the head.  If that's true, it has been
+                        * removed from our list and we can move on.
+                        */
+                       if (ret == -EAGAIN) {
+                               locked_ref = NULL;
+                               count++;
+                               continue;
                        }
                }
 
@@ -986,7 +980,6 @@ again:
                 * locked_ref is the head node, so we have to go one
                 * node back for any delayed ref updates
                 */
-
                ref = select_delayed_ref(locked_ref);
                if (!ref) {
                        /* All delayed refs have been processed, Go ahead
@@ -994,6 +987,7 @@ again:
                         * so that any accounting fixes can happen
                         */
                        ref = &locked_ref->node;
+                       list_del_init(&locked_ref->cluster);
                        locked_ref = NULL;
                }
 
@@ -1007,29 +1001,72 @@ again:
                BUG_ON(ret);
                btrfs_put_delayed_ref(ref);
 
-               /* once we lock the head ref, we have to process all the
-                * entries for it.  So, we might end up doing more entries
-                * that count was asking us to do.
-                */
-               if (count > 0)
-                       count--;
+               count++;
+               cond_resched();
+               spin_lock(&delayed_refs->lock);
+       }
+       return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.  count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, unsigned long count)
+{
+       struct rb_node *node;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       struct list_head cluster;
+       int ret;
+       int run_all = count == (unsigned long)-1;
+       int run_most = 0;
+
+       if (root == root->fs_info->extent_root)
+               root = root->fs_info->tree_root;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       INIT_LIST_HEAD(&cluster);
+again:
+       spin_lock(&delayed_refs->lock);
+       if (count == 0) {
+               count = delayed_refs->num_entries * 2;
+               run_most = 1;
+       }
+       while (1) {
+               if (!(run_all || run_most) &&
+                   delayed_refs->num_heads_ready < 64)
+                       break;
 
                /*
-                * we set locked_ref to null above if we're all done
-                * with this bytenr
+                * go find something we can process in the rbtree.  We start at
+                * the beginning of the tree, and then build a cluster
+                * of refs to process starting at the first one we are able to
+                * lock
                 */
-               if (!locked_ref && count == 0)
+               ret = btrfs_find_ref_cluster(trans, &cluster,
+                                            delayed_refs->run_delayed_start);
+               if (ret)
                        break;
 
-               spin_lock(&delayed_refs->lock);
+               ret = run_clustered_refs(trans, root, &cluster);
+               BUG_ON(ret < 0);
+
+               count -= min_t(unsigned long, ret, count);
+
+               if (count == 0)
+                       break;
        }
+
        if (run_all) {
-               spin_lock(&delayed_refs->lock);
                node = rb_first(&delayed_refs->root);
-               if (!node) {
-                       spin_unlock(&delayed_refs->lock);
+               if (!node)
                        goto out;
-               }
+               count = (unsigned long)-1;
 
                while (node) {
                        ref = rb_entry(node, struct btrfs_delayed_ref_node,
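The two hunks above split the old single-pass rbtree walk into a driver, btrfs_run_delayed_refs(), that repeatedly asks btrfs_find_ref_cluster() for a batch of head refs, and a worker, run_clustered_refs(), that locks and processes each head in that batch and returns how many entries it handled. A rough userspace model of that driver/worker split follows; every name in it is illustrative, not the btrfs API, and the locking and ref-counting details are omitted.

#include <stdio.h>

#define NQUEUED 10
#define CLUSTER 4                       /* batch size picked for the example */

static int queued[NQUEUED];             /* stands in for the delayed-ref rbtree */
static int next_head;                   /* stands in for run_delayed_start */

/* grab up to CLUSTER pending heads; returns how many were found */
static int find_cluster(int *cluster)
{
        int n = 0;

        while (n < CLUSTER && next_head < NQUEUED)
                cluster[n++] = queued[next_head++];
        return n;
}

/* process one batch; returns the number of entries handled */
static int run_cluster(int *cluster, int n)
{
        int i;

        for (i = 0; i < n; i++)
                printf("running delayed ref %d\n", cluster[i]);
        return n;
}

int main(void)
{
        int cluster[CLUSTER];
        unsigned long count = NQUEUED;  /* "everything queued at the start of the run" */
        int i;

        for (i = 0; i < NQUEUED; i++)
                queued[i] = i;

        while (count) {
                int n = find_cluster(cluster);

                if (!n)
                        break;
                n = run_cluster(cluster, n);
                count -= (unsigned long)n < count ? (unsigned long)n : count;
        }
        return 0;
}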
@@ -1045,16 +1082,17 @@ again:
                                mutex_unlock(&head->mutex);
 
                                btrfs_put_delayed_ref(ref);
+                               cond_resched();
                                goto again;
                        }
                        node = rb_next(node);
                }
                spin_unlock(&delayed_refs->lock);
-               count = (unsigned long)-1;
                schedule_timeout(1);
                goto again;
        }
 out:
+       spin_unlock(&delayed_refs->lock);
        return 0;
 }
 
@@ -2017,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
                clear_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
        }
+       mutex_unlock(&root->fs_info->pinned_mutex);
+
        while (num > 0) {
                cache = btrfs_lookup_block_group(fs_info, bytenr);
                BUG_ON(!cache);
@@ -2108,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
        u64 end;
        int ret;
 
-       mutex_lock(&root->fs_info->pinned_mutex);
        while (1) {
+               mutex_lock(&root->fs_info->pinned_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
@@ -2117,14 +2157,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
                ret = btrfs_discard_extent(root, start, end + 1 - start);
 
+               /* unlocks the pinned mutex */
                btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
                clear_extent_dirty(unpin, start, end, GFP_NOFS);
 
-               if (need_resched()) {
-                       mutex_unlock(&root->fs_info->pinned_mutex);
-                       cond_resched();
-                       mutex_lock(&root->fs_info->pinned_mutex);
-               }
+               cond_resched();
        }
        mutex_unlock(&root->fs_info->pinned_mutex);
        return ret;
@@ -2132,7 +2169,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
-                         u64 bytenr, u64 num_bytes, int is_data)
+                         struct btrfs_path *path,
+                         u64 bytenr, u64 num_bytes, int is_data,
+                         struct extent_buffer **must_clean)
 {
        int err = 0;
        struct extent_buffer *buf;
@@ -2158,15 +2197,16 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
                    header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
                    header_transid == trans->transid &&
                    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-                       clean_tree_block(NULL, root, buf);
-                       btrfs_tree_unlock(buf);
-                       free_extent_buffer(buf);
+                       *must_clean = buf;
                        return 1;
                }
                btrfs_tree_unlock(buf);
        }
        free_extent_buffer(buf);
 pinit:
+       btrfs_set_path_blocking(path);
+       mutex_lock(&root->fs_info->pinned_mutex);
+       /* unlocks the pinned mutex */
        btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 
        BUG_ON(err < 0);
@@ -2203,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                return -ENOMEM;
 
        path->reada = 1;
+       path->leave_spinning = 1;
        ret = lookup_extent_backref(trans, extent_root, path,
                                    bytenr, parent, root_objectid,
                                    ref_generation, owner_objectid, 1);
@@ -2228,6 +2269,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                                                    refs_to_drop);
                        BUG_ON(ret);
                        btrfs_release_path(extent_root, path);
+                       path->leave_spinning = 1;
                        ret = btrfs_search_slot(trans, extent_root,
                                                &key, path, -1, 1);
                        if (ret) {
@@ -2285,6 +2327,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                /* if refs are 0, we need to setup the path for deletion */
                if (refs == 0) {
                        btrfs_release_path(extent_root, path);
+                       path->leave_spinning = 1;
                        ret = btrfs_search_slot(trans, extent_root, &key, path,
                                                -1, 1);
                        BUG_ON(ret);
@@ -2294,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
        if (refs == 0) {
                u64 super_used;
                u64 root_used;
+               struct extent_buffer *must_clean = NULL;
 
                if (pin) {
-                       mutex_lock(&root->fs_info->pinned_mutex);
-                       ret = pin_down_bytes(trans, root, bytenr, num_bytes,
-                               owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
-                       mutex_unlock(&root->fs_info->pinned_mutex);
+                       ret = pin_down_bytes(trans, root, path,
+                               bytenr, num_bytes,
+                               owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
+                               &must_clean);
                        if (ret > 0)
                                mark_free = 1;
                        BUG_ON(ret < 0);
                }
+
                /* block accounting for super block */
                spin_lock(&info->delalloc_lock);
                super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2315,14 +2360,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                btrfs_set_root_used(&root->root_item,
                                           root_used - num_bytes);
                spin_unlock(&info->delalloc_lock);
+
+               /*
+                * it is going to be very rare for someone to be waiting
+                * on the block we're freeing.  del_items might need to
+                * schedule, so rather than get fancy, just force it
+                * to blocking here
+                */
+               if (must_clean)
+                       btrfs_set_lock_blocking(must_clean);
+
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
                BUG_ON(ret);
                btrfs_release_path(extent_root, path);
 
+               if (must_clean) {
+                       clean_tree_block(NULL, root, must_clean);
+                       btrfs_tree_unlock(must_clean);
+                       free_extent_buffer(must_clean);
+               }
+
                if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
                        BUG_ON(ret);
+               } else {
+                       invalidate_mapping_pages(info->btree_inode->i_mapping,
+                            bytenr >> PAGE_CACHE_SHIFT,
+                            (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
                }
 
                ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2361,6 +2426,74 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                            owner_objectid, pin, pin == 0, refs_to_drop);
 }
 
+/*
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well.  This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
+ */
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root, u64 bytenr)
+{
+       struct btrfs_delayed_ref_head *head;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       struct rb_node *node;
+       int ret;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       head = btrfs_find_delayed_ref_head(trans, bytenr);
+       if (!head)
+               goto out;
+
+       node = rb_prev(&head->node.rb_node);
+       if (!node)
+               goto out;
+
+       ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+
+       /* there are still entries for this ref, we can't drop it */
+       if (ref->bytenr == bytenr)
+               goto out;
+
+       /*
+        * waiting for the lock here would deadlock.  If someone else has it
+        * locked they are already in the process of dropping it anyway
+        */
+       if (!mutex_trylock(&head->mutex))
+               goto out;
+
+       /*
+        * at this point we have a head with no other entries.  Go
+        * ahead and process it.
+        */
+       head->node.in_tree = 0;
+       rb_erase(&head->node.rb_node, &delayed_refs->root);
+
+       delayed_refs->num_entries--;
+
+       /*
+        * we don't take a ref on the node because we're removing it from the
+        * tree, so we just steal the ref the tree was holding.
+        */
+       delayed_refs->num_heads--;
+       if (list_empty(&head->cluster))
+               delayed_refs->num_heads_ready--;
+
+       list_del_init(&head->cluster);
+       spin_unlock(&delayed_refs->lock);
+
+       ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+                                 &head->node, head->must_insert_reserved);
+       BUG_ON(ret);
+       btrfs_put_delayed_ref(&head->node);
+       return 0;
+out:
+       spin_unlock(&delayed_refs->lock);
+       return 0;
+}
+
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent,
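check_ref_cleanup() above uses mutex_trylock() on the head's mutex because blocking there while holding delayed_refs->lock could deadlock against a concurrent dropper; if the trylock fails, someone else is already tearing the head down and it is simply skipped. A minimal pthreads illustration of that try-then-skip pattern (userspace analogy only, not the kernel API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t head_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Skip the cleanup instead of blocking if another thread owns the lock;
 * the owner is assumed to be doing the same teardown anyway. */
static void try_cleanup(void)
{
        if (pthread_mutex_trylock(&head_mutex) != 0) {
                printf("busy, someone else is cleaning up\n");
                return;
        }
        printf("doing the cleanup ourselves\n");
        pthread_mutex_unlock(&head_mutex);
}

int main(void)
{
        try_cleanup();
        return 0;
}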
@@ -2379,8 +2512,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
            owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
                mutex_lock(&root->fs_info->pinned_mutex);
+
+               /* unlocks the pinned mutex */
                btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-               mutex_unlock(&root->fs_info->pinned_mutex);
                update_reserved_extents(root, bytenr, num_bytes, 0);
                ret = 0;
        } else {
@@ -2388,6 +2522,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                                       root_objectid, ref_generation,
                                       owner_objectid,
                                       BTRFS_DROP_DELAYED_REF, 1);
+               BUG_ON(ret);
+               ret = check_ref_cleanup(trans, root, bytenr);
+               BUG_ON(ret);
        }
        return ret;
 }
@@ -2827,6 +2964,7 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
 
+       path->leave_spinning = 1;
        ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
                                       sizes, 2);
        BUG_ON(ret);
@@ -3638,6 +3776,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
        struct btrfs_path *path;
        int i;
        int orig_level;
+       int update_count;
        struct btrfs_root_item *root_item = &root->root_item;
 
        WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -3679,6 +3818,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                }
        }
        while (1) {
+               unsigned long update;
                wret = walk_down_tree(trans, root, path, &level);
                if (wret > 0)
                        break;
@@ -3691,12 +3831,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                        break;
                if (wret < 0)
                        ret = wret;
-               if (trans->transaction->in_commit) {
+               if (trans->transaction->in_commit ||
+                   trans->transaction->delayed_refs.flushing) {
                        ret = -EAGAIN;
                        break;
                }
                atomic_inc(&root->fs_info->throttle_gen);
                wake_up(&root->fs_info->transaction_throttle);
+               for (update_count = 0; update_count < 16; update_count++) {
+                       update = trans->delayed_ref_updates;
+                       trans->delayed_ref_updates = 0;
+                       if (update)
+                               btrfs_run_delayed_refs(trans, root, update);
+                       else
+                               break;
+               }
        }
        for (i = 0; i <= orig_level; i++) {
                if (path->nodes[i]) {
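The drop-snapshot hunk above also drains trans->delayed_ref_updates in bounded rounds, so deleting a large snapshot does not let the delayed-ref backlog grow without limit while still capping how long each pass spends on it. A standalone model of that bounded-drain idiom (all names are illustrative, not the btrfs API):

#include <stdio.h>

static unsigned long pending;           /* stands in for trans->delayed_ref_updates */

static void run_refs(unsigned long n)   /* stands in for btrfs_run_delayed_refs() */
{
        printf("processing %lu queued updates\n", n);
}

static void drain_bounded(void)
{
        int round;

        /* at most 16 rounds, mirroring the loop in btrfs_drop_snapshot() */
        for (round = 0; round < 16; round++) {
                unsigned long n = pending;

                pending = 0;
                if (!n)
                        break;
                run_refs(n);            /* may queue more work into `pending` */
        }
}

int main(void)
{
        pending = 42;
        drain_bounded();
        return 0;
}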
@@ -5320,6 +5469,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
 
+       path->leave_spinning = 1;
        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
        if (ret)
                goto out;
@@ -5751,7 +5901,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
        extent_root = root->fs_info->extent_root;
 
-       root->fs_info->last_trans_new_blockgroup = trans->transid;
+       root->fs_info->last_trans_log_full_commit = trans->transid;
 
        cache = kzalloc(sizeof(*cache), GFP_NOFS);
        if (!cache)