btrfs: replace btrfs_set_lock_blocking_rw with appropriate helpers
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4e473a998219..d316df95bec4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1832,7 +1832,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
                        src_path->nodes[cur_level] = eb;
 
                        btrfs_tree_read_lock(eb);
-                       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                       btrfs_set_lock_blocking_read(eb);
                        src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
                }
 
@@ -1973,7 +1973,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans,
                dst_path->slots[cur_level] = 0;
 
                btrfs_tree_read_lock(eb);
-               btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+               btrfs_set_lock_blocking_read(eb);
                dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
                need_cleanup = true;
        }
@@ -2017,86 +2017,30 @@ out:
        return ret;
 }
 
-/*
- * Inform qgroup to trace subtree swap used in balance.
- *
- * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
- * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
- *
- * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
- * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
- * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
- * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
- * and skip all tree blocks whose generation is smaller than last_snapshot.
- *
- * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
- * which could be the cause of very slow balance if the file tree is large.
- *
- * @src_parent, @src_slot: pointer to src (file tree) eb.
- * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
- */
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
-                               struct btrfs_block_group_cache *bg_cache,
-                               struct extent_buffer *src_parent, int src_slot,
-                               struct extent_buffer *dst_parent, int dst_slot,
-                               u64 last_snapshot)
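+/*
+ * Trace the subtree swap for balance, walking the two subtree roots
+ * @src_eb and @dst_eb directly instead of re-reading them through their
+ * parent nodes.
+ */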
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+                               struct extent_buffer *src_eb,
+                               struct extent_buffer *dst_eb,
+                               u64 last_snapshot, bool trace_leaf)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_path *dst_path = NULL;
-       struct btrfs_key first_key;
-       struct extent_buffer *src_eb = NULL;
-       struct extent_buffer *dst_eb = NULL;
-       bool trace_leaf = false;
-       u64 child_gen;
-       u64 child_bytenr;
        int level;
        int ret;
 
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return 0;
 
-       /* Check parameter order */
-       if (btrfs_node_ptr_generation(src_parent, src_slot) >
-           btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+       /* Wrong parameter order */
+       if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
                btrfs_err_rl(fs_info,
                "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
-                       btrfs_node_ptr_generation(src_parent, src_slot),
-                       btrfs_node_ptr_generation(dst_parent, dst_slot));
+                            btrfs_header_generation(src_eb),
+                            btrfs_header_generation(dst_eb));
                return -EUCLEAN;
        }
 
-       /*
-        * Only trace leaf if we're relocating data block groups, this could
-        * reduce tons of data extents tracing for meta/sys bg relocation.
-        */
-       if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
-               trace_leaf = true;
-       /* Read out real @src_eb, pointed by @src_parent and @src_slot */
-       child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
-       child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
-       btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
-
-       src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
-                       btrfs_header_level(src_parent) - 1, &first_key);
-       if (IS_ERR(src_eb)) {
-               ret = PTR_ERR(src_eb);
-               goto out;
-       }
-
-       /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
-       child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
-       child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
-       btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
-
-       dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
-                       btrfs_header_level(dst_parent) - 1, &first_key);
-       if (IS_ERR(dst_eb)) {
-               ret = PTR_ERR(dst_eb);
-               goto out;
-       }
-
        if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
-               ret = -EINVAL;
+               ret = -EIO;
                goto out;
        }
 
@@ -2106,14 +2050,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
                ret = -ENOMEM;
                goto out;
        }
-
        /* For dst_path */
        extent_buffer_get(dst_eb);
        dst_path->nodes[level] = dst_eb;
        dst_path->slots[level] = 0;
        dst_path->locks[level] = 0;
 
-       /* Do the generation-aware breadth-first search */
+       /* Do the generation aware breadth-first search */
        ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
                                              level, last_snapshot, trace_leaf);
        if (ret < 0)
@@ -2121,8 +2064,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
        ret = 0;
 
 out:
-       free_extent_buffer(src_eb);
-       free_extent_buffer(dst_eb);
        btrfs_free_path(dst_path);
        if (ret < 0)
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2207,7 +2148,7 @@ walk_down:
                        path->slots[level] = 0;
 
                        btrfs_tree_read_lock(eb);
-                       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                       btrfs_set_lock_blocking_read(eb);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
 
                        ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
@@ -3783,3 +3724,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
        }
        extent_changeset_release(&changeset);
 }
+
+void btrfs_qgroup_init_swapped_blocks(
+       struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+       int i;
+
+       spin_lock_init(&swapped_blocks->lock);
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+               swapped_blocks->blocks[i] = RB_ROOT;
+       swapped_blocks->swapped = false;
+}
+
+/*
+ * Delete all swapped block records of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+       struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+       int i;
+
+       swapped_blocks = &root->swapped_blocks;
+
+       spin_lock(&swapped_blocks->lock);
+       if (!swapped_blocks->swapped)
+               goto out;
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+               struct rb_root *cur_root = &swapped_blocks->blocks[i];
+               struct btrfs_qgroup_swapped_block *entry;
+               struct btrfs_qgroup_swapped_block *next;
+
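+
+               /*
+                * Post-order iteration is safe here: @next is saved
+                * before @entry is handed to kfree().
+                */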
+               rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+                                                    node)
+                       kfree(entry);
+               swapped_blocks->blocks[i] = RB_ROOT;
+       }
+       swapped_blocks->swapped = false;
+out:
+       spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add a record of swapped subtree roots into @subvol_root.
+ *
+ * @subvol_root:       tree root of the subvolume tree that gets swapped
+ * @bg:                        block group under balance
+ * @subvol_parent/slot:        pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
+ *                     BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot:     last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+               struct btrfs_root *subvol_root,
+               struct btrfs_block_group_cache *bg,
+               struct extent_buffer *subvol_parent, int subvol_slot,
+               struct extent_buffer *reloc_parent, int reloc_slot,
+               u64 last_snapshot)
+{
+       struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+       struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+       struct btrfs_qgroup_swapped_block *block;
+       struct rb_node **cur;
+       struct rb_node *parent = NULL;
+       int level = btrfs_header_level(subvol_parent) - 1;
+       int ret = 0;
+
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+               return 0;
+
+       if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+           btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+               btrfs_err_rl(fs_info,
+               "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+                       __func__,
+                       btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+                       btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+               return -EUCLEAN;
+       }
+
+       block = kmalloc(sizeof(*block), GFP_NOFS);
+       if (!block) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * @reloc_parent/slot is still before swap, while @block is going to
+        * record the bytenr after swap, so we do the swap here.
+        */
+       block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+       block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+                                                            reloc_slot);
+       block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+       block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+                                                           subvol_slot);
+       block->last_snapshot = last_snapshot;
+       block->level = level;
+       if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+               block->trace_leaf = true;
+       else
+               block->trace_leaf = false;
+       btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+       /* Insert @block into @blocks */
+       spin_lock(&blocks->lock);
+       cur = &blocks->blocks[level].rb_node;
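+       /*
+        * The per-level tree is keyed by subvol_bytenr, the bytenr the
+        * subtree root will have after the swap, which is what
+        * btrfs_qgroup_trace_subtree_after_cow() looks up at COW time.
+        */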
+       while (*cur) {
+               struct btrfs_qgroup_swapped_block *entry;
+
+               parent = *cur;
+               entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+                                node);
+
+               if (entry->subvol_bytenr < block->subvol_bytenr) {
+                       cur = &(*cur)->rb_left;
+               } else if (entry->subvol_bytenr > block->subvol_bytenr) {
+                       cur = &(*cur)->rb_right;
+               } else {
+                       if (entry->subvol_generation !=
+                                       block->subvol_generation ||
+                           entry->reloc_bytenr != block->reloc_bytenr ||
+                           entry->reloc_generation !=
+                                       block->reloc_generation) {
+                               /*
+                                * Duplicate but mismatched entry found.
+                                * Shouldn't happen.
+                                *
+                                * Marking qgroup inconsistent should be enough
+                                * for end users.
+                                */
+                               WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+                               ret = -EEXIST;
+                       }
+                       kfree(block);
+                       goto out_unlock;
+               }
+       }
+       rb_link_node(&block->node, parent, cur);
+       rb_insert_color(&block->node, &blocks->blocks[level]);
+       blocks->swapped = true;
+out_unlock:
+       spin_unlock(&blocks->lock);
+out:
+       if (ret < 0)
+               fs_info->qgroup_flags |=
+                       BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+       return ret;
+}
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct extent_buffer *subvol_eb)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+       struct btrfs_qgroup_swapped_block *block;
+       struct extent_buffer *reloc_eb = NULL;
+       struct rb_node *node;
+       bool found = false;
+       bool swapped = false;
+       int level = btrfs_header_level(subvol_eb);
+       int ret = 0;
+       int i;
+
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+               return 0;
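+       /*
+        * Only subvolume trees currently under relocation (i.e. with a
+        * reloc root) can have swapped block records.
+        */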
+       if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+               return 0;
+
+       spin_lock(&blocks->lock);
+       if (!blocks->swapped) {
+               spin_unlock(&blocks->lock);
+               return 0;
+       }
+       node = blocks->blocks[level].rb_node;
+
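+       /* Look up the record keyed by this block's bytenr */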
+       while (node) {
+               block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+               if (block->subvol_bytenr < subvol_eb->start) {
+                       node = node->rb_left;
+               } else if (block->subvol_bytenr > subvol_eb->start) {
+                       node = node->rb_right;
+               } else {
+                       found = true;
+                       break;
+               }
+       }
+       if (!found) {
+               spin_unlock(&blocks->lock);
+               goto out;
+       }
+       /* Found one, remove it from @blocks first and update blocks->swapped */
+       rb_erase(&block->node, &blocks->blocks[level]);
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+               if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+                       swapped = true;
+                       break;
+               }
+       }
+       blocks->swapped = swapped;
+       spin_unlock(&blocks->lock);
+
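+       /*
+        * @block is detached from the rbtree and exclusively ours now.
+        * Reading the reloc root below may sleep, so it must be done
+        * outside blocks->lock.
+        */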
+       /* Read out reloc subtree root */
+       reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+                                  block->reloc_generation, block->level,
+                                  &block->first_key);
+       if (IS_ERR(reloc_eb)) {
+               ret = PTR_ERR(reloc_eb);
+               reloc_eb = NULL;
+               goto free_out;
+       }
+       if (!extent_buffer_uptodate(reloc_eb)) {
+               ret = -EIO;
+               goto free_out;
+       }
+
+       ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+                       block->last_snapshot, block->trace_leaf);
+free_out:
+       kfree(block);
+       free_extent_buffer(reloc_eb);
+out:
+       if (ret < 0) {
+               btrfs_err_rl(fs_info,
+                            "failed to account subtree at bytenr %llu: %d",
+                            subvol_eb->start, ret);
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+       }
+       return ret;
+}