btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
author: Qu Wenruo <wqu@suse.com>
Wed, 23 Jan 2019 07:15:12 +0000 (15:15 +0800)
committer: David Sterba <dsterba@suse.com>
Mon, 25 Feb 2019 13:13:39 +0000 (14:13 +0100)
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.

As a result, the final write succeeds when it should fail with EDQUOT,
and the fs ends up exceeding the quota limit by 16K.

The simplified reproducer will be: (needs a 2G ram VM)

  $ mkfs.btrfs -f $dev
  $ mount $dev $mnt

  $ btrfs subv create $mnt/subv
  $ btrfs quota enable $mnt
  $ btrfs quota rescan -w $mnt
  $ btrfs qgroup limit -e 1G $mnt/subv

  $ for i in $(seq -w  1 8); do
   xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
   echo "file $i written" > /dev/kmsg
    done
  $ sync
  $ btrfs qgroup show -pcre --raw $mnt

The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:

  qgroupid         rfer         excl     max_rfer     max_excl parent  child
  --------         ----         ----     --------     -------- ------  -----
  0/5             16384        16384         none         none ---     ---
  0/256      1073758208   1073758208         none   1073741824 ---     ---

And 1073758208 is larger than the 1G limit (1073741824).

[CAUSE]
It's a bug in btrfs qgroup data reserved space management.

For quota limit, we must ensure that:
  reserved (data + metadata) + rfer/excl <= limit

Since rfer/excl is only updated at transaction commit time, reserved
space needs to be handled with special care.

One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.

Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.

However, on machines with little RAM, memory pressure can cause dirty
pages to be flushed back to disk without committing a transaction.

The related events will be something like:

  file 1 written
  btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
  btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
  btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
  btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
  btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
  cleanup_ref_head: num_bytes=54947840
  cleanup_ref_head: num_bytes=5636096
  cleanup_ref_head: num_bytes=569344
  cleanup_ref_head: num_bytes=57344
  cleanup_ref_head: num_bytes=8192
  ^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
  file 2 written
  ...
  file 8 written
  cleanup_ref_head: num_bytes=8192
  ...
  btrfs_commit_transaction  <<< the only transaction committed during
                                the test

When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later writes won't trigger EDQUOT.

This allows us to write more data beyond the qgroup limit.

In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.

[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed halfway, before the transaction is committed, thus
fixing the problem.

Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/extent-tree.c
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
include/trace/events/btrfs.h

index cad36c99a483ca8f6c508c32c3ad2ce94289f114..7d2a413df90d57f232c5742652a3d22b902291de 100644 (file)
@@ -602,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
        RB_CLEAR_NODE(&head_ref->href_node);
        head_ref->processing = 0;
        head_ref->total_ref_mod = count_mod;
-       head_ref->qgroup_reserved = 0;
-       head_ref->qgroup_ref_root = 0;
        spin_lock_init(&head_ref->lock);
        mutex_init(&head_ref->mutex);
 
        if (qrecord) {
                if (ref_root && reserved) {
-                       head_ref->qgroup_ref_root = ref_root;
-                       head_ref->qgroup_reserved = reserved;
+                       qrecord->data_rsv = reserved;
+                       qrecord->data_rsv_refroot = ref_root;
                }
-
                qrecord->bytenr = bytenr;
                qrecord->num_bytes = num_bytes;
                qrecord->old_roots = NULL;
@@ -651,10 +648,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
        existing = htree_insert(&delayed_refs->href_root,
                                &head_ref->href_node);
        if (existing) {
-               WARN_ON(qrecord && head_ref->qgroup_ref_root
-                       && head_ref->qgroup_reserved
-                       && existing->qgroup_ref_root
-                       && existing->qgroup_reserved);
                update_existing_head_ref(trans, existing, head_ref,
                                         old_ref_mod);
                /*
@@ -770,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
            is_fstree(ref_root)) {
-               record = kmalloc(sizeof(*record), GFP_NOFS);
+               record = kzalloc(sizeof(*record), GFP_NOFS);
                if (!record) {
                        kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
                        kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
@@ -867,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
            is_fstree(ref_root)) {
-               record = kmalloc(sizeof(*record), GFP_NOFS);
+               record = kzalloc(sizeof(*record), GFP_NOFS);
                if (!record) {
                        kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
                        kmem_cache_free(btrfs_delayed_ref_head_cachep,
index d2af974f68a1ac2b00c157f8297c33e451fe8d14..70606da440aa7ff8d9d0065229843a98edc6453d 100644 (file)
@@ -102,17 +102,6 @@ struct btrfs_delayed_ref_head {
         */
        int ref_mod;
 
-       /*
-        * For qgroup reserved space freeing.
-        *
-        * ref_root and reserved will be recorded after
-        * BTRFS_ADD_DELAYED_EXTENT is called.
-        * And will be used to free reserved qgroup space at
-        * run_delayed_refs() time.
-        */
-       u64 qgroup_ref_root;
-       u64 qgroup_reserved;
-
        /*
         * when a new extent is allocated, it is just reserved in memory
         * The actual extent isn't inserted into the extent allocation tree
index 9f012c2facbe3a08517042098ca348a357812d1e..994f0cc41799304581207b23570d68ed44354f94 100644 (file)
@@ -2492,9 +2492,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
                }
        }
 
-       /* Also free its reserved qgroup space */
-       btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
-                                     head->qgroup_reserved);
        btrfs_delayed_refs_rsv_release(fs_info, nr_items);
 }
 
index 9a2f8c4c0fb9af69dbcc7d15e112364474da26ca..e618ea9cdf7ee16bb7ae7d9e90a2356f76b2eeba 100644 (file)
@@ -1546,12 +1546,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
                parent_node = *p;
                entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
                                 node);
-               if (bytenr < entry->bytenr)
+               if (bytenr < entry->bytenr) {
                        p = &(*p)->rb_left;
-               else if (bytenr > entry->bytenr)
+               } else if (bytenr > entry->bytenr) {
                        p = &(*p)->rb_right;
-               else
+               } else {
+                       if (record->data_rsv && !entry->data_rsv) {
+                               entry->data_rsv = record->data_rsv;
+                               entry->data_rsv_refroot =
+                                       record->data_rsv_refroot;
+                       }
                        return 1;
+               }
        }
 
        rb_link_node(&record->node, parent_node, p);
@@ -1597,7 +1603,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
            || bytenr == 0 || num_bytes == 0)
                return 0;
-       record = kmalloc(sizeof(*record), gfp_flag);
+       record = kzalloc(sizeof(*record), gfp_flag);
        if (!record)
                return -ENOMEM;
 
@@ -2517,6 +2523,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
                                        goto cleanup;
                        }
 
+                       /* Free the reserved data space */
+                       btrfs_qgroup_free_refroot(fs_info,
+                                       record->data_rsv_refroot,
+                                       record->data_rsv,
+                                       BTRFS_QGROUP_RSV_DATA);
                        /*
                         * Use SEQ_LAST as time_seq to do special search, which
                         * doesn't lock tree or delayed_refs and search current
index 5e93733b78c84fa2dbf0fbde21f85d854f15e438..46ba7bd2961cd1edcbbabb341b3fb350416e929b 100644 (file)
@@ -107,6 +107,17 @@ struct btrfs_qgroup_extent_record {
        struct rb_node node;
        u64 bytenr;
        u64 num_bytes;
+
+       /*
+        * For qgroup reserved data space freeing.
+        *
+        * @data_rsv_refroot and @data_rsv will be recorded after
+        * BTRFS_ADD_DELAYED_EXTENT is called.
+        * And will be used to free reserved qgroup space at
+        * transaction commit time.
+        */
+       u32 data_rsv;           /* reserved data space needs to be freed */
+       u64 data_rsv_refroot;   /* which root the reserved data belongs to */
        struct ulist *old_roots;
 };
 
@@ -326,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
                               u64 ref_root, u64 num_bytes,
                               enum btrfs_qgroup_rsv_type type);
-static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
-                                                u64 ref_root, u64 num_bytes)
-{
-       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
-               return;
-       trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
-       btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
-                                 BTRFS_QGROUP_RSV_DATA);
-}
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
index 3f08b652363b2232290638d6a1a8b975a4409c4e..ab1cc33adbac6ddb145ea8d698c1b2084dde0cdc 100644 (file)
@@ -1513,35 +1513,6 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
        TP_ARGS(inode, start, len, reserved, op)
 );
 
-DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
-
-       TP_PROTO(const struct btrfs_fs_info *fs_info,
-                u64 ref_root, u64 reserved),
-
-       TP_ARGS(fs_info, ref_root, reserved),
-
-       TP_STRUCT__entry_btrfs(
-               __field(        u64,            ref_root        )
-               __field(        u64,            reserved        )
-       ),
-
-       TP_fast_assign_btrfs(fs_info,
-               __entry->ref_root       = ref_root;
-               __entry->reserved       = reserved;
-       ),
-
-       TP_printk_btrfs("root=%llu reserved=%llu op=free",
-                 __entry->ref_root, __entry->reserved)
-);
-
-DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
-
-       TP_PROTO(const struct btrfs_fs_info *fs_info,
-                u64 ref_root, u64 reserved),
-
-       TP_ARGS(fs_info, ref_root, reserved)
-);
-
 DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_qgroup_extent_record *rec),