ext4: limit the length of per-inode prealloc list
authorbrookxu <brookxu.cn@gmail.com>
Mon, 17 Aug 2020 07:36:15 +0000 (15:36 +0800)
committerTheodore Ts'o <tytso@mit.edu>
Wed, 19 Aug 2020 16:04:36 +0000 (12:04 -0400)
In the scenario of writing sparse files, the per-inode prealloc list may
be very long, resulting in high overhead for ext4_mb_use_preallocated().
To circumvent this problem, we limit the maximum length of per-inode
prealloc list to 512 and allow users to modify it.

After patching, we observed that the sys ratio of cpu has dropped, and
the system throughput has increased significantly. We created a process
to write the sparse file, and the running time of the process on the
fixed kernel was significantly reduced, as follows:

Running time on unfixed kernel:
[root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
real    0m2.051s
user    0m0.008s
sys     0m2.026s

Running time on fixed kernel:
[root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
real    0m0.471s
user    0m0.004s
sys     0m0.395s

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
13 files changed:
Documentation/admin-guide/ext4.rst
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mballoc.h
fs/ext4/move_extent.c
fs/ext4/super.c
fs/ext4/sysfs.c
include/trace/events/ext4.h

index 7fc6a72920c98dae50af3d99cc60db9f876465ac..59bcc4a92602771045004ae2e1ad03fe28a4de19 100644 (file)
@@ -482,6 +482,9 @@ Files in /sys/fs/ext4/<devname>:
         multiple of this tuning parameter if the stripe size is not set in the
         ext4 superblock
 
+  mb_max_inode_prealloc
+        The maximum length of per-inode ext4_prealloc_space list.
+
   mb_max_to_scan
         The maximum number of extents the multiblock allocator will search to
         find the best extent.
index 71b4370a3f91ca0da83dfb7ee94e115ce431be27..523e00d7b39246ba8b0fb70dbcbf7a89f37e2bf1 100644 (file)
@@ -1070,6 +1070,7 @@ struct ext4_inode_info {
        struct timespec64 i_crtime;
 
        /* mballoc */
+       atomic_t i_prealloc_active;
        struct list_head i_prealloc_list;
        spinlock_t i_prealloc_lock;
 
@@ -1518,6 +1519,7 @@ struct ext4_sb_info {
        unsigned int s_mb_stats;
        unsigned int s_mb_order2_reqs;
        unsigned int s_mb_group_prealloc;
+       unsigned int s_mb_max_inode_prealloc;
        unsigned int s_max_dir_size_kb;
        /* where last allocation was done - for stream allocation */
        unsigned long s_mb_last_group;
@@ -2682,7 +2684,7 @@ extern int ext4_mb_release(struct super_block *);
 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_discard_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *, unsigned int);
 extern int __init ext4_init_mballoc(void);
 extern void ext4_exit_mballoc(void);
 extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
index 0eea09aa0f267a3858062d5b19fda728473aa06a..a0481582187a35c42997caf4aa7ad9a4380fa8f6 100644 (file)
@@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
         * i_mutex. So we can safely drop the i_data_sem here.
         */
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
        up_write(&EXT4_I(inode)->i_data_sem);
        *dropped = 1;
        return 0;
@@ -4266,7 +4266,7 @@ got_allocated_blocks:
                         * not a good idea to call discard here directly,
                         * but otherwise we'd need to call it every free().
                         */
-                       ext4_discard_preallocations(inode);
+                       ext4_discard_preallocations(inode, 0);
                        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                                fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
                        ext4_free_blocks(handle, inode, NULL, newblock,
@@ -5293,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
        }
 
        down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
 
        ret = ext4_es_remove_extent(inode, punch_start,
                                    EXT_MAX_BLOCKS - punch_start);
@@ -5307,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
        }
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
 
        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
                                     punch_stop - punch_start, SHIFT_LEFT);
@@ -5439,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
                goto out_stop;
 
        down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
 
        path = ext4_find_extent(inode, offset_lblk, NULL, 0);
        if (IS_ERR(path)) {
index 7a2720517bbb92554bf2599702eb13e1ee108f76..e608ce3fb5353a977b053e64c09a3de6e9ff60e3 100644 (file)
@@ -147,7 +147,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
                        (atomic_read(&inode->i_writecount) == 1) &&
                        !EXT4_I(inode)->i_reserved_data_blocks) {
                down_write(&EXT4_I(inode)->i_data_sem);
-               ext4_discard_preallocations(inode);
+               ext4_discard_preallocations(inode, 0);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        if (is_dx(inode) && filp->private_data)
index 433ca8415c5abc6c5eb490affa9f1d9fd19f1efd..80c9f33800bea2c3911158889edb929f3c86da4c 100644 (file)
@@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
         * i_mutex. So we can safely drop the i_data_sem here.
         */
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
        up_write(&EXT4_I(inode)->i_data_sem);
        *dropped = 1;
        return 0;
index 0b07576af3bf87d47fef28b69205a45c0b4dd75f..77543f98825807b0e33404a0f2160bcc8919eddc 100644 (file)
@@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
         */
        if ((ei->i_reserved_data_blocks == 0) &&
            !inode_is_open_for_write(inode))
-               ext4_discard_preallocations(inode);
+               ext4_discard_preallocations(inode, 0);
 }
 
 static int __check_block_validity(struct inode *inode, const char *func,
@@ -4055,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
        if (stop_block > first_block) {
 
                down_write(&EXT4_I(inode)->i_data_sem);
-               ext4_discard_preallocations(inode);
+               ext4_discard_preallocations(inode, 0);
 
                ret = ext4_es_remove_extent(inode, first_block,
                                            stop_block - first_block);
@@ -4210,7 +4210,7 @@ int ext4_truncate(struct inode *inode)
 
        down_write(&EXT4_I(inode)->i_data_sem);
 
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
 
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                err = ext4_ext_truncate(handle, inode);
index 6e70a63dcca708375ff67cc8a39d5bf0ba3d9ad4..36eca3bc036af3a94348f562d3cd671f85a32e63 100644 (file)
@@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
        reset_inode_seed(inode);
        reset_inode_seed(inode_bl);
 
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
 
        err = ext4_mark_inode_dirty(handle, inode);
        if (err < 0) {
index 45ac6088b4ac4e1dbbc0fb248af845e0783f907d..132c118d12e153cfb72ac12a3402174922ec36c8 100644 (file)
@@ -2878,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
        sbi->s_mb_stats = MB_DEFAULT_STATS;
        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+       sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
        /*
         * The default group preallocation is 512, which for 4k block
         * sizes translates to 2 megabytes.  However for bigalloc file
@@ -3816,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
        mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
 }
 
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+                                   struct ext4_prealloc_space *pa)
+{
+       struct ext4_inode_info *ei;
+
+       if (pa->pa_deleted) {
+               ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
+                            pa->pa_type, pa->pa_pstart, pa->pa_lstart,
+                            pa->pa_len);
+               return;
+       }
+
+       pa->pa_deleted = 1;
+
+       if (pa->pa_type == MB_INODE_PA) {
+               ei = EXT4_I(pa->pa_inode);
+               atomic_dec(&ei->i_prealloc_active);
+       }
+}
+
 static void ext4_mb_pa_callback(struct rcu_head *head)
 {
        struct ext4_prealloc_space *pa;
@@ -3848,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
                return;
        }
 
-       pa->pa_deleted = 1;
+       ext4_mb_mark_pa_deleted(sb, pa);
        spin_unlock(&pa->pa_lock);
 
        grp_blk = pa->pa_pstart;
@@ -3972,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        spin_lock(pa->pa_obj_lock);
        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
        spin_unlock(pa->pa_obj_lock);
+       atomic_inc(&ei->i_prealloc_active);
 }
 
 /*
@@ -4182,7 +4204,7 @@ repeat:
                }
 
                /* seems this one can be freed ... */
-               pa->pa_deleted = 1;
+               ext4_mb_mark_pa_deleted(sb, pa);
 
                /* we can trust pa_free ... */
                free += pa->pa_free;
@@ -4245,7 +4267,7 @@ out_dbg:
  *
  * FIXME!! Make sure it is valid at all the call sites
  */
-void ext4_discard_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct super_block *sb = inode->i_sb;
@@ -4263,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode)
 
        mb_debug(sb, "discard preallocation for inode %lu\n",
                 inode->i_ino);
-       trace_ext4_discard_preallocations(inode);
+       trace_ext4_discard_preallocations(inode,
+                       atomic_read(&ei->i_prealloc_active), needed);
 
        INIT_LIST_HEAD(&list);
 
+       if (needed == 0)
+               needed = UINT_MAX;
+
 repeat:
        /* first, collect all pa's in the inode */
        spin_lock(&ei->i_prealloc_lock);
-       while (!list_empty(&ei->i_prealloc_list)) {
-               pa = list_entry(ei->i_prealloc_list.next,
+       while (!list_empty(&ei->i_prealloc_list) && needed) {
+               pa = list_entry(ei->i_prealloc_list.prev,
                                struct ext4_prealloc_space, pa_inode_list);
                BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
                spin_lock(&pa->pa_lock);
@@ -4288,10 +4314,11 @@ repeat:
 
                }
                if (pa->pa_deleted == 0) {
-                       pa->pa_deleted = 1;
+                       ext4_mb_mark_pa_deleted(sb, pa);
                        spin_unlock(&pa->pa_lock);
                        list_del_rcu(&pa->pa_inode_list);
                        list_add(&pa->u.pa_tmp_list, &list);
+                       needed--;
                        continue;
                }
 
@@ -4592,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
                BUG_ON(pa->pa_type != MB_GROUP_PA);
 
                /* seems this one can be freed ... */
-               pa->pa_deleted = 1;
+               ext4_mb_mark_pa_deleted(sb, pa);
                spin_unlock(&pa->pa_lock);
 
                list_del_rcu(&pa->pa_inode_list);
@@ -4690,11 +4717,30 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
        return ;
 }
 
+/*
+ * if per-inode prealloc list is too long, trim some PA
+ */
+static void ext4_mb_trim_inode_pa(struct inode *inode)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       int count, delta;
+
+       count = atomic_read(&ei->i_prealloc_active);
+       delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
+       if (count > sbi->s_mb_max_inode_prealloc + delta) {
+               count -= sbi->s_mb_max_inode_prealloc;
+               ext4_discard_preallocations(inode, count);
+       }
+}
+
 /*
  * release all resource we used in allocation
  */
 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
+       struct inode *inode = ac->ac_inode;
+       struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_prealloc_space *pa = ac->ac_pa;
        if (pa) {
@@ -4720,6 +4766,17 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
                                ext4_mb_add_n_trim(ac);
                        }
                }
+
+               if (pa->pa_type == MB_INODE_PA) {
+                       /*
+                        * treat per-inode prealloc list as a lru list, then try
+                        * to trim the least recently used PA.
+                        */
+                       spin_lock(pa->pa_obj_lock);
+                       list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
+                       spin_unlock(pa->pa_obj_lock);
+               }
+
                ext4_mb_put_pa(ac, ac->ac_sb, pa);
        }
        if (ac->ac_bitmap_page)
@@ -4729,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
                mutex_unlock(&ac->ac_lg->lg_mutex);
        ext4_mb_collect_stats(ac);
+       ext4_mb_trim_inode_pa(inode);
        return 0;
 }
 
index 6b4d17c2935d6eca8e0cae5d69b8d283ce754175..e75b4749aa1c27cfb1f712bfdbf561801b312d20 100644 (file)
  */
 #define MB_DEFAULT_GROUP_PREALLOC      512
 
+/*
+ * maximum length of inode prealloc list
+ */
+#define MB_DEFAULT_MAX_INODE_PREALLOC  512
 
 struct ext4_free_data {
        /* this links the free block information from sb_info */
index 1ed86fb6c3026ebb2025a4434c01a47720875375..0d601b8228753e2b975d24c9098c5c77a8dbec04 100644 (file)
@@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 
 out:
        if (*moved_len) {
-               ext4_discard_preallocations(orig_inode);
-               ext4_discard_preallocations(donor_inode);
+               ext4_discard_preallocations(orig_inode, 0);
+               ext4_discard_preallocations(donor_inode, 0);
        }
 
        ext4_ext_drop_refs(path);
index daa94c7f7271fc32f6c7ada3cf67290e758145f7..13bdddc081e09580a9a34b8223ce401f215d4813 100644 (file)
@@ -1127,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        inode_set_iversion(&ei->vfs_inode, 1);
        spin_lock_init(&ei->i_raw_lock);
        INIT_LIST_HEAD(&ei->i_prealloc_list);
+       atomic_set(&ei->i_prealloc_active, 0);
        spin_lock_init(&ei->i_prealloc_lock);
        ext4_es_init_tree(&ei->i_es_tree);
        rwlock_init(&ei->i_es_lock);
@@ -1220,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode)
 {
        invalidate_inode_buffers(inode);
        clear_inode(inode);
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
        ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
        dquot_drop(inode);
        if (EXT4_I(inode)->jinode) {
index 7fee11cc30e791634a681164cdd0855c2797cc85..bfabb799fa451a429f4a52d8d62311c690b567ce 100644 (file)
@@ -218,6 +218,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
 EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
@@ -264,6 +265,7 @@ static struct attribute *ext4_attrs[] = {
        ATTR_LIST(mb_order2_req),
        ATTR_LIST(mb_stream_req),
        ATTR_LIST(mb_group_prealloc),
+       ATTR_LIST(mb_max_inode_prealloc),
        ATTR_LIST(max_writeback_mb_bump),
        ATTR_LIST(extent_max_zeroout_kb),
        ATTR_LIST(trigger_fs_error),
index 8008d2e116b903c9880569969308bb05c8f233b8..4c8b99ec8606d078d16d66fb6b883df81a522e03 100644 (file)
@@ -746,24 +746,29 @@ TRACE_EVENT(ext4_mb_release_group_pa,
 );
 
 TRACE_EVENT(ext4_discard_preallocations,
-       TP_PROTO(struct inode *inode),
+       TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed),
 
-       TP_ARGS(inode),
+       TP_ARGS(inode, len, needed),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
+               __field(        dev_t,          dev             )
+               __field(        ino_t,          ino             )
+               __field(        unsigned int,   len             )
+               __field(        unsigned int,   needed          )
 
        ),
 
        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
+               __entry->len    = len;
+               __entry->needed = needed;
        ),
 
-       TP_printk("dev %d,%d ino %lu",
+       TP_printk("dev %d,%d ino %lu len: %u needed %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long) __entry->ino)
+                 (unsigned long) __entry->ino, __entry->len,
+                 __entry->needed)
 );
 
 TRACE_EVENT(ext4_mb_discard_preallocations,