btrfs: move ulist allocation out of transaction in quota enable
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4e473a9982191b0bc0f2fd9036cfdb88fe51194b..eb680b715dd6b2f50e7d98026a8dcc256797eacc 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -894,6 +894,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
        if (fs_info->quota_root)
                goto out;
 
+       fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
+       if (!fs_info->qgroup_ulist) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
        /*
         * 1 for quota root item
         * 1 for BTRFS_QGROUP_STATUS item
@@ -909,13 +915,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
                goto out;
        }
 
-       fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
-       if (!fs_info->qgroup_ulist) {
-               ret = -ENOMEM;
-               btrfs_abort_transaction(trans, ret);
-               goto out;
-       }
-
        /*
         * initially create the quota tree
         */
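
The two hunks above move the qgroup_ulist allocation in front of the transaction: an allocation failure now returns -ENOMEM before anything has been reserved, instead of hitting btrfs_abort_transaction() and forcing the filesystem read-only. A minimal sketch of the resulting order in btrfs_quota_enable(); the transaction-start lines are assumed from context and are not part of the hunks above:

        fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
        if (!fs_info->qgroup_ulist) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * 1 for quota root item
         * 1 for BTRFS_QGROUP_STATUS item
         */
        trans = btrfs_start_transaction(fs_info->tree_root, 2);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }
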
@@ -1546,12 +1545,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
                parent_node = *p;
                entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
                                 node);
-               if (bytenr < entry->bytenr)
+               if (bytenr < entry->bytenr) {
                        p = &(*p)->rb_left;
-               else if (bytenr > entry->bytenr)
+               } else if (bytenr > entry->bytenr) {
                        p = &(*p)->rb_right;
-               else
+               } else {
+                       if (record->data_rsv && !entry->data_rsv) {
+                               entry->data_rsv = record->data_rsv;
+                               entry->data_rsv_refroot =
+                                       record->data_rsv_refroot;
+                       }
                        return 1;
+               }
        }
 
        rb_link_node(&record->node, parent_node, p);
@@ -1597,7 +1602,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
            || bytenr == 0 || num_bytes == 0)
                return 0;
-       record = kmalloc(sizeof(*record), gfp_flag);
+       record = kzalloc(sizeof(*record), gfp_flag);
        if (!record)
                return -ENOMEM;
 
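
The kmalloc() to kzalloc() switch backs the rb-tree change further up: when btrfs_qgroup_trace_extent_nolock() finds an existing record for the same bytenr it returns 1, the caller discards the new record, and any data reservation it carried is handed over to the surviving entry. That only works if a record created without a reservation reads data_rsv == 0, hence the zero-initialisation. An abbreviated sketch of the record with the two new members; the field names come from the hunks above, the exact layout in the real header is an assumption:

struct btrfs_qgroup_extent_record {
        struct rb_node node;
        u64 bytenr;
        u64 num_bytes;

        /*
         * Added by this series: data space reserved for this extent and the
         * root that reserved it, released through btrfs_qgroup_free_refroot()
         * once the extent has been accounted.
         */
        u64 data_rsv_refroot;
        u64 data_rsv;

        struct ulist *old_roots;
};
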
@@ -1832,7 +1837,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
                        src_path->nodes[cur_level] = eb;
 
                        btrfs_tree_read_lock(eb);
-                       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                       btrfs_set_lock_blocking_read(eb);
                        src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
                }
 
@@ -1973,7 +1978,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
                dst_path->slots[cur_level] = 0;
 
                btrfs_tree_read_lock(eb);
-               btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+               btrfs_set_lock_blocking_read(eb);
                dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
                need_cleanup = true;
        }
@@ -2017,86 +2022,30 @@ out:
        return ret;
 }
 
-/*
- * Inform qgroup to trace subtree swap used in balance.
- *
- * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
- * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
- *
- * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
- * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
- * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
- * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
- * and skip all tree blocks whose generation is smaller than last_snapshot.
- *
- * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
- * which could be the cause of very slow balance if the file tree is large.
- *
- * @src_parent, @src_slot: pointer to src (file tree) eb.
- * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
- */
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
-                               struct btrfs_block_group_cache *bg_cache,
-                               struct extent_buffer *src_parent, int src_slot,
-                               struct extent_buffer *dst_parent, int dst_slot,
-                               u64 last_snapshot)
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+                               struct extent_buffer *src_eb,
+                               struct extent_buffer *dst_eb,
+                               u64 last_snapshot, bool trace_leaf)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_path *dst_path = NULL;
-       struct btrfs_key first_key;
-       struct extent_buffer *src_eb = NULL;
-       struct extent_buffer *dst_eb = NULL;
-       bool trace_leaf = false;
-       u64 child_gen;
-       u64 child_bytenr;
        int level;
        int ret;
 
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return 0;
 
-       /* Check parameter order */
-       if (btrfs_node_ptr_generation(src_parent, src_slot) >
-           btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+       /* Wrong parameter order */
+       if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
                btrfs_err_rl(fs_info,
                "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
-                       btrfs_node_ptr_generation(src_parent, src_slot),
-                       btrfs_node_ptr_generation(dst_parent, dst_slot));
+                            btrfs_header_generation(src_eb),
+                            btrfs_header_generation(dst_eb));
                return -EUCLEAN;
        }
 
-       /*
-        * Only trace leaf if we're relocating data block groups, this could
-        * reduce tons of data extents tracing for meta/sys bg relocation.
-        */
-       if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
-               trace_leaf = true;
-       /* Read out real @src_eb, pointed by @src_parent and @src_slot */
-       child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
-       child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
-       btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
-
-       src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
-                       btrfs_header_level(src_parent) - 1, &first_key);
-       if (IS_ERR(src_eb)) {
-               ret = PTR_ERR(src_eb);
-               goto out;
-       }
-
-       /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
-       child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
-       child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
-       btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
-
-       dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
-                       btrfs_header_level(dst_parent) - 1, &first_key);
-       if (IS_ERR(dst_eb)) {
-               ret = PTR_ERR(dst_eb);
-               goto out;
-       }
-
        if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
-               ret = -EINVAL;
+               ret = -EIO;
                goto out;
        }
 
@@ -2106,14 +2055,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
                ret = -ENOMEM;
                goto out;
        }
-
        /* For dst_path */
        extent_buffer_get(dst_eb);
        dst_path->nodes[level] = dst_eb;
        dst_path->slots[level] = 0;
        dst_path->locks[level] = 0;
 
-       /* Do the generation-aware breadth-first search */
+       /* Do the generation aware breadth-first search */
        ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
                                              level, last_snapshot, trace_leaf);
        if (ret < 0)
@@ -2121,8 +2069,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
        ret = 0;
 
 out:
-       free_extent_buffer(src_eb);
-       free_extent_buffer(dst_eb);
        btrfs_free_path(dst_path);
        if (ret < 0)
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2207,7 +2153,7 @@ walk_down:
                        path->slots[level] = 0;
 
                        btrfs_tree_read_lock(eb);
-                       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                       btrfs_set_lock_blocking_read(eb);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
 
                        ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
@@ -2576,6 +2522,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
                                        goto cleanup;
                        }
 
+                       /* Free the reserved data space */
+                       btrfs_qgroup_free_refroot(fs_info,
+                                       record->data_rsv_refroot,
+                                       record->data_rsv,
+                                       BTRFS_QGROUP_RSV_DATA);
                        /*
                         * Use SEQ_LAST as time_seq to do special search, which
                         * doesn't lock tree or delayed_refs and search current
@@ -2842,16 +2793,15 @@ out:
 /*
  * Two limits to commit transaction in advance.
  *
- * For RATIO, it will be 1/RATIO of the remaining limit
- * (excluding data and prealloc meta) as threshold.
+ * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
  * For SIZE, it will be in byte unit as threshold.
  */
-#define QGROUP_PERTRANS_RATIO          32
-#define QGROUP_PERTRANS_SIZE           SZ_32M
+#define QGROUP_FREE_RATIO              32
+#define QGROUP_FREE_SIZE               SZ_32M
 static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
                                const struct btrfs_qgroup *qg, u64 num_bytes)
 {
-       u64 limit;
+       u64 free;
        u64 threshold;
 
        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
@@ -2870,20 +2820,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
         */
        if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
                              BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
-               if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
-                       limit = qg->max_excl;
-               else
-                       limit = qg->max_rfer;
-               threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
-                           qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
-                           QGROUP_PERTRANS_RATIO;
-               threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
+               if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+                       free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
+                       threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
+                                         QGROUP_FREE_SIZE);
+               } else {
+                       free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
+                       threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
+                                         QGROUP_FREE_SIZE);
+               }
 
                /*
                 * Use transaction_kthread to commit transaction, so we no
                 * longer need to bother nested transaction nor lock context.
                 */
-               if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
+               if (free < threshold)
                        btrfs_commit_transaction_locksafe(fs_info);
        }
 
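
Before this change the early commit keyed off the per-transaction metadata reservation alone; now it triggers whenever the remaining headroom under the configured limit drops below 1/QGROUP_FREE_RATIO of that limit, capped at QGROUP_FREE_SIZE. A small stand-alone example with invented numbers (not kernel code), only to show how free and threshold come out of the new formula:

#include <stdint.h>
#include <stdio.h>

#define SZ_32M                  (32ULL << 20)
#define QGROUP_FREE_RATIO       32
#define QGROUP_FREE_SIZE        SZ_32M

int main(void)
{
        /* Invented qgroup state: 1 GiB exclusive limit. */
        uint64_t max_excl  = 1024ULL << 20;     /* max_excl limit */
        uint64_t excl      = 512ULL << 20;      /* bytes already exclusively owned */
        uint64_t rsv_total = 128ULL << 20;      /* qgroup_rsv_total(): all rsv types */

        uint64_t free_bytes = max_excl - rsv_total - excl;
        uint64_t threshold  = max_excl / QGROUP_FREE_RATIO;

        if (threshold > QGROUP_FREE_SIZE)
                threshold = QGROUP_FREE_SIZE;

        /* Prints: free=384 MiB threshold=32 MiB -> keep going */
        printf("free=%llu MiB threshold=%llu MiB -> %s\n",
               (unsigned long long)(free_bytes >> 20),
               (unsigned long long)(threshold >> 20),
               free_bytes < threshold ? "ask transaction_kthread to commit"
                                      : "keep going");
        return 0;
}
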
@@ -2959,7 +2910,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
 
                qg = unode_aux_to_qgroup(unode);
 
-               trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
                qgroup_rsv_add(fs_info, qg, num_bytes, type);
        }
 
@@ -3026,7 +2976,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
 
                qg = unode_aux_to_qgroup(unode);
 
-               trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
                qgroup_rsv_release(fs_info, qg, num_bytes, type);
 
                list_for_each_entry(glist, &qg->groups, next_group) {
@@ -3783,3 +3732,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
        }
        extent_changeset_release(&changeset);
 }
+
+void btrfs_qgroup_init_swapped_blocks(
+       struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+       int i;
+
+       spin_lock_init(&swapped_blocks->lock);
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+               swapped_blocks->blocks[i] = RB_ROOT;
+       swapped_blocks->swapped = false;
+}
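
Everything below operates on a per-root structure (root->swapped_blocks) that this helper initialises. A sketch of the assumed layout, put together purely from the fields the following hunks read and write:

struct btrfs_qgroup_swapped_block {
        struct rb_node node;    /* linked into blocks[level], keyed by subvol_bytenr */

        int level;
        bool trace_leaf;        /* also trace data extents (data block groups) */

        /* subvolume tree block, bytenr/generation as they are after the swap */
        u64 subvol_bytenr;
        u64 subvol_generation;

        /* reloc tree block, bytenr/generation after the swap */
        u64 reloc_bytenr;
        u64 reloc_generation;

        u64 last_snapshot;
        struct btrfs_key first_key;     /* first key of the reloc tree block */
};

struct btrfs_qgroup_swapped_blocks {
        spinlock_t lock;
        /* Cheap "is anything recorded at all?" flag for the COW hot path. */
        bool swapped;
        struct rb_root blocks[BTRFS_MAX_LEVEL];         /* one tree per level */
};
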
+
+/*
+ * Delete all swapped block records of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+       struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+       int i;
+
+       swapped_blocks = &root->swapped_blocks;
+
+       spin_lock(&swapped_blocks->lock);
+       if (!swapped_blocks->swapped)
+               goto out;
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+               struct rb_root *cur_root = &swapped_blocks->blocks[i];
+               struct btrfs_qgroup_swapped_block *entry;
+               struct btrfs_qgroup_swapped_block *next;
+
+               rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+                                                    node)
+                       kfree(entry);
+               swapped_blocks->blocks[i] = RB_ROOT;
+       }
+       swapped_blocks->swapped = false;
+out:
+       spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add a record of the swapped subtree roots into @subvol_root.
+ *
+ * @subvol_root:       tree root of the subvolume tree that gets swapped
+ * @bg:                        block group under balance
+ * @subvol_parent/slot:        pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
+ *                     BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot:     last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+               struct btrfs_root *subvol_root,
+               struct btrfs_block_group_cache *bg,
+               struct extent_buffer *subvol_parent, int subvol_slot,
+               struct extent_buffer *reloc_parent, int reloc_slot,
+               u64 last_snapshot)
+{
+       struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+       struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+       struct btrfs_qgroup_swapped_block *block;
+       struct rb_node **cur;
+       struct rb_node *parent = NULL;
+       int level = btrfs_header_level(subvol_parent) - 1;
+       int ret = 0;
+
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+               return 0;
+
+       if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+           btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+               btrfs_err_rl(fs_info,
+               "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+                       __func__,
+                       btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+                       btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+               return -EUCLEAN;
+       }
+
+       block = kmalloc(sizeof(*block), GFP_NOFS);
+       if (!block) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * @reloc_parent/slot still points at the pre-swap block, while
+        * @block is going to record the post-swap bytenr, so swap the
+        * two here.
+        */
+       block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+       block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+                                                            reloc_slot);
+       block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+       block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+                                                           subvol_slot);
+       block->last_snapshot = last_snapshot;
+       block->level = level;
+       if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+               block->trace_leaf = true;
+       else
+               block->trace_leaf = false;
+       btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+       /* Insert @block into @blocks */
+       spin_lock(&blocks->lock);
+       cur = &blocks->blocks[level].rb_node;
+       while (*cur) {
+               struct btrfs_qgroup_swapped_block *entry;
+
+               parent = *cur;
+               entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+                                node);
+
+               if (entry->subvol_bytenr < block->subvol_bytenr) {
+                       cur = &(*cur)->rb_left;
+               } else if (entry->subvol_bytenr > block->subvol_bytenr) {
+                       cur = &(*cur)->rb_right;
+               } else {
+                       if (entry->subvol_generation !=
+                                       block->subvol_generation ||
+                           entry->reloc_bytenr != block->reloc_bytenr ||
+                           entry->reloc_generation !=
+                                       block->reloc_generation) {
+                               /*
+                                * Duplicate but mismatched entry found.
+                                * Shouldn't happen.
+                                *
+                                * Marking qgroup inconsistent should be enough
+                                * for end users.
+                                */
+                               WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+                               ret = -EEXIST;
+                       }
+                       kfree(block);
+                       goto out_unlock;
+               }
+       }
+       rb_link_node(&block->node, parent, cur);
+       rb_insert_color(&block->node, &blocks->blocks[level]);
+       blocks->swapped = true;
+out_unlock:
+       spin_unlock(&blocks->lock);
+out:
+       if (ret < 0)
+               fs_info->qgroup_flags |=
+                       BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+       return ret;
+}
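
For reference, a hypothetical call site; the real caller in this series sits in the relocation code and the local variable names below are invented. It only shows how the arguments line up with the kernel-doc above, in particular that both node/slot pairs must still describe the pre-swap layout:

        ret = btrfs_qgroup_add_swapped_blocks(trans, subvol_root, block_group,
                                              subvol_parent, subvol_slot,
                                              reloc_parent, reloc_slot,
                                              last_snapshot);
        if (ret < 0)
                goto out;
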
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct extent_buffer *subvol_eb)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+       struct btrfs_qgroup_swapped_block *block;
+       struct extent_buffer *reloc_eb = NULL;
+       struct rb_node *node;
+       bool found = false;
+       bool swapped = false;
+       int level = btrfs_header_level(subvol_eb);
+       int ret = 0;
+       int i;
+
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+               return 0;
+       if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+               return 0;
+
+       spin_lock(&blocks->lock);
+       if (!blocks->swapped) {
+               spin_unlock(&blocks->lock);
+               return 0;
+       }
+       node = blocks->blocks[level].rb_node;
+
+       while (node) {
+               block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+               if (block->subvol_bytenr < subvol_eb->start) {
+                       node = node->rb_left;
+               } else if (block->subvol_bytenr > subvol_eb->start) {
+                       node = node->rb_right;
+               } else {
+                       found = true;
+                       break;
+               }
+       }
+       if (!found) {
+               spin_unlock(&blocks->lock);
+               goto out;
+       }
+       /* Found one, remove it from @blocks first and update blocks->swapped */
+       rb_erase(&block->node, &blocks->blocks[level]);
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+               if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+                       swapped = true;
+                       break;
+               }
+       }
+       blocks->swapped = swapped;
+       spin_unlock(&blocks->lock);
+
+       /* Read out reloc subtree root */
+       reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+                                  block->reloc_generation, block->level,
+                                  &block->first_key);
+       if (IS_ERR(reloc_eb)) {
+               ret = PTR_ERR(reloc_eb);
+               reloc_eb = NULL;
+               goto free_out;
+       }
+       if (!extent_buffer_uptodate(reloc_eb)) {
+               ret = -EIO;
+               goto free_out;
+       }
+
+       ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+                       block->last_snapshot, block->trace_leaf);
+free_out:
+       kfree(block);
+       free_extent_buffer(reloc_eb);
+out:
+       if (ret < 0) {
+               btrfs_err_rl(fs_info,
+                            "failed to account subtree at bytenr %llu: %d",
+                            subvol_eb->start, ret);
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+       }
+       return ret;
+}
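
Putting the pieces together: balance records the swapped subtree roots via btrfs_qgroup_add_swapped_blocks(), the expensive trace is deferred until the subvolume block is actually COWed, and anything never COWed is dropped at commit time by btrfs_qgroup_clean_swapped_blocks(). The hook point is only described by the comment above ("called during btrfs_cow_block()"), so the placement sketched here is an assumption, shown just to make the control flow concrete:

        /*
         * Assumed hook inside btrfs_cow_block(), after @buf has been COWed:
         * if @buf was recorded as a swapped subtree root, trace the matching
         * reloc subtree now instead of rescanning whole subtrees during
         * balance.
         */
        ret = btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
        if (ret < 0)
                return ret;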