Merge tag 'ext4_for_linus-6.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 2 Nov 2023 17:45:14 +0000 (07:45 -1000)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 2 Nov 2023 17:45:14 +0000 (07:45 -1000)
Pull ext4 updates from Ted Ts'o:
 "Cleanup ext4's multi-block allocator, including adding some unit
  tests, as well as cleaning how we update the backup superblock after
  online resizes or updating the label or uuid.

  Optimize handling of released data blocks in ext4's commit machinery
  to avoid potential lock contention on the s_md_lock spinlock (a
  sketch of the idea follows this summary).

  Fix a number of ext4 bugs:

   - fix a race between writepages and remount

   - fix a racy may-inline-data check in dio write

   - add a missing brelse() in an error path in update_backups

   - fix umask handling when ACL support is disabled

   - fix a lost EIO error when a journal commit races with an fsync of
     the blockdev

   - fix a potentially incorrect i_size when there is a crash right
     after an O_SYNC direct write

   - check an extent node for validity before using a potentially
     invalid pointer

   - fix potential stale data exposure when writing to an unwritten
     extent and the file system is nearly out of space

   - fix a potential accounting error in block reservations when
     partial delayed allocation writes land in a bigalloc cluster

   - avoid a memory allocation failure when tracking partial delayed
     allocation writes to a bigalloc cluster

   - fix various debugging print messages"
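
The s_md_lock change replaces a single freed-data list, which the
commit path had to walk under the spinlock to cut out the entries
belonging to the committing transaction, with two lists indexed by the
parity of the commit's transaction ID. Since jbd2 commits transactions
in order, nothing is added any more to the bucket of a committing
transaction, so its whole list can be detached in O(1) without the
lock. A standalone sketch of the idea (plain userspace C with
illustrative names, not the kernel code from the mballoc.c hunks
below):

    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
            unsigned long tid;              /* freeing transaction */
            struct entry *next;
    };

    /* two buckets, indexed by transaction ID parity */
    static struct entry *freed_list[2];

    /* freeing path: many tasks may free blocks under the same running
     * transaction, so the kernel still takes s_md_lock around this */
    static void record_freed(unsigned long tid)
    {
            struct entry *e = malloc(sizeof(*e));

            if (!e)
                    return;
            e->tid = tid;
            e->next = freed_list[tid & 1];
            freed_list[tid & 1] = e;
    }

    /* commit path: transactions commit in order, so by the time `tid`
     * commits, new entries go to the other bucket and the whole list
     * can be detached in O(1) with no lock and no walk */
    static void process_freed(unsigned long tid)
    {
            struct entry *list = freed_list[tid & 1];

            freed_list[tid & 1] = NULL;
            while (list) {
                    struct entry *next = list->next;

                    printf("releasing blocks freed under tid %lu\n",
                           list->tid);
                    free(list);
                    list = next;
            }
    }

    int main(void)
    {
            record_freed(10);
            record_freed(10);
            record_freed(11);       /* lands in the other bucket */
            process_freed(10);      /* touches only tid-10 entries */
            process_freed(11);
            return 0;
    }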

* tag 'ext4_for_linus-6.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (41 commits)
  ext4: properly sync file size update after O_SYNC direct IO
  ext4: fix racy may inline data check in dio write
  ext4: run mballoc test with different layout settings
  ext4: add first unit test for ext4_mb_new_blocks_simple in mballoc
  ext4: add some kunit stub for mballoc kunit test
  ext4: call ext4_mb_mark_context in ext4_group_add_blocks()
  ext4: Separate block bitmap and buddy bitmap freeing in ext4_group_add_blocks()
  ext4: call ext4_mb_mark_context in ext4_mb_clear_bb
  ext4: Separate block bitmap and buddy bitmap freeing in ext4_mb_clear_bb()
  ext4: call ext4_mb_mark_context in ext4_mb_mark_diskspace_used
  ext4: extend ext4_mb_mark_context to support allocation under journal
  ext4: call ext4_mb_mark_context in ext4_free_blocks_simple
  ext4: factor out code to update block bitmap and group descriptor on disk from ext4_mb_mark_bb
  ext4: make state in ext4_mb_mark_bb to be bool
  jbd2: fix potential data loss when journal recovery races with synchronizing the fs bdev
  ext4: apply umask if ACL support is disabled
  ext4: mark buffer new if it is unwritten to avoid stale data exposure
  ext4: move 'ix' sanity check to correct position
  jbd2: fix printk format type for 'io_block' in do_one_pass()
  jbd2: print io_block if the data block checksum check fails during recovery
  ...

14 files changed:
fs/ext4/acl.h
fs/ext4/balloc.c
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/extents_status.c
fs/ext4/fast_commit.c
fs/ext4/file.c
fs/ext4/inode.c
fs/ext4/mballoc-test.c [new file with mode: 0644]
fs/ext4/mballoc.c
fs/ext4/namei.c
fs/ext4/resize.c
fs/ext4/super.c
fs/jbd2/recovery.c

index 0c5a79c3b5d4806f9a76241c7fd1ee5a971b9dba..ef4c19e5f57060b13ab4a2b45b8ffb967eff4fbf 100644 (file)
@@ -68,6 +68,11 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 static inline int
 ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
+       /* Usually the umask is applied by posix_acl_create(), but if
+        * ext4 ACL support is disabled at compile time we need to do
+        * it here, because posix_acl_create() will never be called. */
+       inode->i_mode &= ~current_umask();
+
        return 0;
 }
 #endif  /* CONFIG_EXT4_FS_POSIX_ACL */
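
For reference, the stub applies the same masking that posix_acl_create()
performs when ACL support is built in: the bits set in the process
umask are stripped from the requested mode. A standalone userspace
illustration of the operation (not kernel code; the values are
arbitrary):

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/types.h>

    int main(void)
    {
            mode_t requested = 0666;

            /* umask(2) sets the mask and returns the previous one, so
             * set-and-restore is the portable way to read it */
            mode_t mask = umask(0);

            umask(mask);

            /* what the ext4_init_acl() stub above does to i_mode */
            mode_t effective = requested & ~mask;

            printf("mode %04o & ~umask %04o -> %04o\n",
                   (unsigned)requested, (unsigned)mask,
                   (unsigned)effective);
            return 0;
    }
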
index 79b20d6ae39ec4205949ae9d1f9332e0835a5829..591fb3f710be72cbed6e0a630ae796860870642a 100644 (file)
@@ -22,6 +22,7 @@
 #include "mballoc.h"
 
 #include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
 
 static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
                                            ext4_group_t block_group);
@@ -111,10 +112,8 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb,
        itbl_blk_start = ext4_inode_table(sb, gdp);
        itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1;
        if (itbl_blk_start <= end && itbl_blk_end >= start) {
-               itbl_blk_start = itbl_blk_start >= start ?
-                       itbl_blk_start : start;
-               itbl_blk_end = itbl_blk_end <= end ?
-                       itbl_blk_end : end;
+               itbl_blk_start = max(itbl_blk_start, start);
+               itbl_blk_end = min(itbl_blk_end, end);
 
                itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start);
                itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start);
@@ -274,6 +273,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh_p;
 
+       KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc,
+                                  sb, block_group, bh);
+
        if (block_group >= ngroups) {
                ext4_error(sb, "block_group >= groups_count - block_group = %u,"
                           " groups_count = %u", block_group, ngroups);
@@ -468,6 +470,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
        ext4_fsblk_t bitmap_blk;
        int err;
 
+       KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait,
+                                  sb, block_group, ignore_locked);
+
        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return ERR_PTR(-EFSCORRUPTED);
@@ -563,6 +568,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
 {
        struct ext4_group_desc *desc;
 
+       KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap,
+                                  sb, block_group, bh);
+
        if (!buffer_new(bh))
                return 0;
        desc = ext4_get_group_desc(sb, block_group, NULL);
index 8da5fb680210417da2c42d0368d57610104833e3..f16aa375c02ba8d4d99e450af5378d3c5605f966 100644 (file)
@@ -1504,6 +1504,7 @@ struct ext4_sb_info {
        loff_t s_bitmap_maxbytes;       /* max bytes for bitmap files */
        struct buffer_head * s_sbh;     /* Buffer containing the super block */
        struct ext4_super_block *s_es;  /* Pointer to the super block in the buffer */
+       /* Array of bh's for the block group descriptors */
        struct buffer_head * __rcu *s_group_desc;
        unsigned int s_mount_opt;
        unsigned int s_mount_opt2;
@@ -1574,7 +1575,7 @@ struct ext4_sb_info {
        unsigned int *s_mb_maxs;
        unsigned int s_group_info_size;
        unsigned int s_mb_free_pending;
-       struct list_head s_freed_data_list;     /* List of blocks to be freed
+       struct list_head s_freed_data_list[2];  /* List of blocks to be freed
                                                   after commit completed */
        struct list_head s_discard_list;
        struct work_struct s_discard_work;
@@ -1686,7 +1687,8 @@ struct ext4_sb_info {
 
        /*
         * Barrier between writepages ops and changing any inode's JOURNAL_DATA
-        * or EXTENTS flag.
+        * or EXTENTS flag or between writepages ops and changing DELALLOC or
+        * DIOREAD_NOLOCK mount options on remount.
         */
        struct percpu_rw_semaphore s_writepages_rwsem;
        struct dax_device *s_daxdev;
@@ -2934,7 +2936,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
 extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
-                      int len, int state);
+                           int len, bool state);
 static inline bool ext4_mb_cr_expensive(enum criteria cr)
 {
        return cr >= CR_GOAL_LEN_SLOW;
index 4c4176ee174930712d488164800e79fa590515fa..d5efe076d3d3fbb36a6d4ef5c3ea2e1f904c8366 100644 (file)
@@ -1010,6 +1010,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
                ix = curp->p_idx;
        }
 
+       if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+               EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+               return -EFSCORRUPTED;
+       }
+
        len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
        BUG_ON(len < 0);
        if (len > 0) {
@@ -1019,11 +1024,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
                memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
        }
 
-       if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
-               EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
-               return -EFSCORRUPTED;
-       }
-
        ix->ei_block = cpu_to_le32(logical);
        ext4_idx_store_pblock(ix, ptr);
        le16_add_cpu(&curp->p_hdr->eh_entries, 1);
@@ -6081,13 +6081,13 @@ int ext4_ext_clear_bb(struct inode *inode)
                                for (j = 0; j < path->p_depth; j++) {
 
                                        ext4_mb_mark_bb(inode->i_sb,
-                                                       path[j].p_block, 1, 0);
+                                                       path[j].p_block, 1, false);
                                        ext4_fc_record_regions(inode->i_sb, inode->i_ino,
                                                        0, path[j].p_block, 1, 1);
                                }
                                ext4_free_ext_path(path);
                        }
-                       ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+                       ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
                        ext4_fc_record_regions(inode->i_sb, inode->i_ino,
                                        map.m_lblk, map.m_pblk, map.m_len, 1);
                }
index 6f7de14c0fa86f00906f2ce01fbc4905d22fc2c6..f4b50652f0ccea9fec831a46677958ee855188fc 100644 (file)
@@ -152,8 +152,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
                       struct ext4_inode_info *locked_ei);
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
-                            ext4_lblk_t len);
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+                           ext4_lblk_t len,
+                           struct pending_reservation **prealloc);
 
 int __init ext4_init_es(void)
 {
@@ -448,6 +449,19 @@ static void ext4_es_list_del(struct inode *inode)
        spin_unlock(&sbi->s_es_lock);
 }
 
+static inline struct pending_reservation *__alloc_pending(bool nofail)
+{
+       if (!nofail)
+               return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
+
+       return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static inline void __free_pending(struct pending_reservation *pr)
+{
+       kmem_cache_free(ext4_pending_cachep, pr);
+}
+
 /*
  * Returns true if we cannot fail to allocate memory for this extent_status
  * entry and cannot reclaim it until its status changes.
@@ -836,11 +850,12 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 {
        struct extent_status newes;
        ext4_lblk_t end = lblk + len - 1;
-       int err1 = 0;
-       int err2 = 0;
+       int err1 = 0, err2 = 0, err3 = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct extent_status *es1 = NULL;
        struct extent_status *es2 = NULL;
+       struct pending_reservation *pr = NULL;
+       bool revise_pending = false;
 
        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;
@@ -868,11 +883,17 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 
        ext4_es_insert_extent_check(inode, &newes);
 
+       revise_pending = sbi->s_cluster_ratio > 1 &&
+                        test_opt(inode->i_sb, DELALLOC) &&
+                        (status & (EXTENT_STATUS_WRITTEN |
+                                   EXTENT_STATUS_UNWRITTEN));
 retry:
        if (err1 && !es1)
                es1 = __es_alloc_extent(true);
        if ((err1 || err2) && !es2)
                es2 = __es_alloc_extent(true);
+       if ((err1 || err2 || err3) && revise_pending && !pr)
+               pr = __alloc_pending(true);
        write_lock(&EXT4_I(inode)->i_es_lock);
 
        err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
@@ -897,13 +918,18 @@ retry:
                es2 = NULL;
        }
 
-       if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
-           (status & EXTENT_STATUS_WRITTEN ||
-            status & EXTENT_STATUS_UNWRITTEN))
-               __revise_pending(inode, lblk, len);
+       if (revise_pending) {
+               err3 = __revise_pending(inode, lblk, len, &pr);
+               if (err3 != 0)
+                       goto error;
+               if (pr) {
+                       __free_pending(pr);
+                       pr = NULL;
+               }
+       }
 error:
        write_unlock(&EXT4_I(inode)->i_es_lock);
-       if (err1 || err2)
+       if (err1 || err2 || err3)
                goto retry;
 
        ext4_es_print_tree(inode);
@@ -1311,7 +1337,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
                                rc->ndelonly--;
                                node = rb_next(&pr->rb_node);
                                rb_erase(&pr->rb_node, &tree->root);
-                               kmem_cache_free(ext4_pending_cachep, pr);
+                               __free_pending(pr);
                                if (!node)
                                        break;
                                pr = rb_entry(node, struct pending_reservation,
@@ -1405,8 +1431,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                        }
                }
                if (count_reserved)
-                       count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
-                                  &orig_es, &rc);
+                       count_rsvd(inode, orig_es.es_lblk + len1,
+                                  orig_es.es_len - len1 - len2, &orig_es, &rc);
                goto out_get_reserved;
        }
 
@@ -1907,11 +1933,13 @@ static struct pending_reservation *__get_pending(struct inode *inode,
  *
  * @inode - file containing the cluster
  * @lblk - logical block in the cluster to be added
+ * @prealloc - preallocated pending entry
  *
  * Returns 0 on successful insertion and -ENOMEM on failure.  If the
  * pending reservation is already in the set, returns successfully.
  */
-static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
+static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
+                           struct pending_reservation **prealloc)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
@@ -1937,10 +1965,15 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
                }
        }
 
-       pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
-       if (pr == NULL) {
-               ret = -ENOMEM;
-               goto out;
+       if (likely(*prealloc == NULL)) {
+               pr = __alloc_pending(false);
+               if (!pr) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+       } else {
+               pr = *prealloc;
+               *prealloc = NULL;
        }
        pr->lclu = lclu;
 
@@ -1970,7 +2003,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
        if (pr != NULL) {
                tree = &EXT4_I(inode)->i_pending_tree;
                rb_erase(&pr->rb_node, &tree->root);
-               kmem_cache_free(ext4_pending_cachep, pr);
+               __free_pending(pr);
        }
 }
 
@@ -2029,10 +2062,10 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
                                  bool allocated)
 {
        struct extent_status newes;
-       int err1 = 0;
-       int err2 = 0;
+       int err1 = 0, err2 = 0, err3 = 0;
        struct extent_status *es1 = NULL;
        struct extent_status *es2 = NULL;
+       struct pending_reservation *pr = NULL;
 
        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;
@@ -2052,6 +2085,8 @@ retry:
                es1 = __es_alloc_extent(true);
        if ((err1 || err2) && !es2)
                es2 = __es_alloc_extent(true);
+       if ((err1 || err2 || err3) && allocated && !pr)
+               pr = __alloc_pending(true);
        write_lock(&EXT4_I(inode)->i_es_lock);
 
        err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
@@ -2074,11 +2109,18 @@ retry:
                es2 = NULL;
        }
 
-       if (allocated)
-               __insert_pending(inode, lblk);
+       if (allocated) {
+               err3 = __insert_pending(inode, lblk, &pr);
+               if (err3 != 0)
+                       goto error;
+               if (pr) {
+                       __free_pending(pr);
+                       pr = NULL;
+               }
+       }
 error:
        write_unlock(&EXT4_I(inode)->i_es_lock);
-       if (err1 || err2)
+       if (err1 || err2 || err3)
                goto retry;
 
        ext4_es_print_tree(inode);
@@ -2184,21 +2226,24 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
  * @inode - file containing the range
  * @lblk - logical block defining the start of range
  * @len  - length of range in blocks
+ * @prealloc - preallocated pending entry
  *
  * Used after a newly allocated extent is added to the extents status tree.
  * Requires that the extents in the range have either written or unwritten
  * status.  Must be called while holding i_es_lock.
  */
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
-                            ext4_lblk_t len)
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+                           ext4_lblk_t len,
+                           struct pending_reservation **prealloc)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t end = lblk + len - 1;
        ext4_lblk_t first, last;
        bool f_del = false, l_del = false;
+       int ret = 0;
 
        if (len == 0)
-               return;
+               return 0;
 
        /*
         * Two cases - block range within single cluster and block range
@@ -2219,7 +2264,9 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
                        f_del = __es_scan_range(inode, &ext4_es_is_delonly,
                                                first, lblk - 1);
                if (f_del) {
-                       __insert_pending(inode, first);
+                       ret = __insert_pending(inode, first, prealloc);
+                       if (ret < 0)
+                               goto out;
                } else {
                        last = EXT4_LBLK_CMASK(sbi, end) +
                               sbi->s_cluster_ratio - 1;
@@ -2227,9 +2274,11 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
                                l_del = __es_scan_range(inode,
                                                        &ext4_es_is_delonly,
                                                        end + 1, last);
-                       if (l_del)
-                               __insert_pending(inode, last);
-                       else
+                       if (l_del) {
+                               ret = __insert_pending(inode, last, prealloc);
+                               if (ret < 0)
+                                       goto out;
+                       } else
                                __remove_pending(inode, last);
                }
        } else {
@@ -2237,18 +2286,24 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
                if (first != lblk)
                        f_del = __es_scan_range(inode, &ext4_es_is_delonly,
                                                first, lblk - 1);
-               if (f_del)
-                       __insert_pending(inode, first);
-               else
+               if (f_del) {
+                       ret = __insert_pending(inode, first, prealloc);
+                       if (ret < 0)
+                               goto out;
+               } else
                        __remove_pending(inode, first);
 
                last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
                if (last != end)
                        l_del = __es_scan_range(inode, &ext4_es_is_delonly,
                                                end + 1, last);
-               if (l_del)
-                       __insert_pending(inode, last);
-               else
+               if (l_del) {
+                       ret = __insert_pending(inode, last, prealloc);
+                       if (ret < 0)
+                               goto out;
+               } else
                        __remove_pending(inode, last);
        }
+out:
+       return ret;
 }
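
A pattern recurs through the extents_status.c changes above: an object
may be needed while i_es_lock is held, where only a non-blocking
(GFP_ATOMIC) allocation is possible. If that allocation fails, the
lock is dropped, the object is preallocated with GFP_KERNEL |
__GFP_NOFAIL, and the whole operation is retried with the preallocated
object so the locked section can no longer fail on memory. A
standalone sketch of that retry-with-preallocation loop (plain
userspace C, with a pthread mutex standing in for i_es_lock and malloc
standing in for the slab cache; illustrative names, not the kernel
code):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    struct pending { long lclu; };

    /* stand-in for kmem_cache_alloc(..., GFP_ATOMIC); fails on the
     * first call here to force the retry path */
    static struct pending *alloc_atomic(void)
    {
            static int calls;

            if (calls++ == 0)
                    return NULL;
            return malloc(sizeof(struct pending));
    }

    /* stand-in for GFP_KERNEL | __GFP_NOFAIL: does not fail */
    static struct pending *alloc_nofail(void)
    {
            struct pending *p;

            while ((p = malloc(sizeof(struct pending))) == NULL)
                    ;
            return p;
    }

    /* called with `lock` held; consumes *prealloc if one is passed */
    static int insert_pending(long lclu, struct pending **prealloc)
    {
            struct pending *p;

            if (*prealloc != NULL) {
                    p = *prealloc;
                    *prealloc = NULL;
            } else {
                    p = alloc_atomic();
                    if (p == NULL)
                            return -1;  /* caller preallocates, retries */
            }
            p->lclu = lclu;
            /* a real version would insert p into the pending tree */
            printf("inserted pending cluster %ld\n", p->lclu);
            free(p);
            return 0;
    }

    int main(void)
    {
            struct pending *pr = NULL;
            int err = 0;

    retry:
            if (err && pr == NULL)
                    pr = alloc_nofail();  /* allocate outside the lock */

            pthread_mutex_lock(&lock);
            err = insert_pending(42, &pr);
            pthread_mutex_unlock(&lock);

            if (err)
                    goto retry;
            free(pr);                     /* NULL if it was consumed */
            return 0;
    }
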
index b06de728b3b6c9e9e56667fd58109b601034f610..87c009e0c59a5d7f352a1f6a437b59c496145fb2 100644 (file)
@@ -1806,7 +1806,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
                         * at the end of the FC replay using our array of
                         * modified inodes.
                         */
-                       ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+                       ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
                        goto next;
                }
 
@@ -1875,7 +1875,7 @@ ext4_fc_replay_del_range(struct super_block *sb,
                if (ret > 0) {
                        remaining -= ret;
                        cur += ret;
-                       ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+                       ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
                } else {
                        remaining -= map.m_len;
                        cur += map.m_len;
@@ -1934,12 +1934,12 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
                                if (!IS_ERR(path)) {
                                        for (j = 0; j < path->p_depth; j++)
                                                ext4_mb_mark_bb(inode->i_sb,
-                                                       path[j].p_block, 1, 1);
+                                                       path[j].p_block, 1, true);
                                        ext4_free_ext_path(path);
                                }
                                cur += ret;
                                ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
-                                                       map.m_len, 1);
+                                                       map.m_len, true);
                        } else {
                                cur = cur + (map.m_len ? map.m_len : 1);
                        }
index 6830ea3a6c59c6b116f83ee4b3345811f92841ed..0166bb9ca160bdb5196aa9731b2c2daebd3c7116 100644 (file)
@@ -306,80 +306,38 @@ out:
 }
 
 static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
-                                          ssize_t written, size_t count)
+                                          ssize_t count)
 {
        handle_t *handle;
-       bool truncate = false;
-       u8 blkbits = inode->i_blkbits;
-       ext4_lblk_t written_blk, end_blk;
-       int ret;
-
-       /*
-        * Note that EXT4_I(inode)->i_disksize can get extended up to
-        * inode->i_size while the I/O was running due to writeback of delalloc
-        * blocks. But, the code in ext4_iomap_alloc() is careful to use
-        * zeroed/unwritten extents if this is possible; thus we won't leave
-        * uninitialized blocks in a file even if we didn't succeed in writing
-        * as much as we intended.
-        */
-       WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
-       if (offset + count <= EXT4_I(inode)->i_disksize) {
-               /*
-                * We need to ensure that the inode is removed from the orphan
-                * list if it has been added prematurely, due to writeback of
-                * delalloc blocks.
-                */
-               if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
-                       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-
-                       if (IS_ERR(handle)) {
-                               ext4_orphan_del(NULL, inode);
-                               return PTR_ERR(handle);
-                       }
-
-                       ext4_orphan_del(handle, inode);
-                       ext4_journal_stop(handle);
-               }
-
-               return written;
-       }
-
-       if (written < 0)
-               goto truncate;
 
+       lockdep_assert_held_write(&inode->i_rwsem);
        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-       if (IS_ERR(handle)) {
-               written = PTR_ERR(handle);
-               goto truncate;
-       }
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
 
-       if (ext4_update_inode_size(inode, offset + written)) {
-               ret = ext4_mark_inode_dirty(handle, inode);
+       if (ext4_update_inode_size(inode, offset + count)) {
+               int ret = ext4_mark_inode_dirty(handle, inode);
                if (unlikely(ret)) {
-                       written = ret;
                        ext4_journal_stop(handle);
-                       goto truncate;
+                       return ret;
                }
        }
 
-       /*
-        * We may need to truncate allocated but not written blocks beyond EOF.
-        */
-       written_blk = ALIGN(offset + written, 1 << blkbits);
-       end_blk = ALIGN(offset + count, 1 << blkbits);
-       if (written_blk < end_blk && ext4_can_truncate(inode))
-               truncate = true;
-
-       /*
-        * Remove the inode from the orphan list if it has been extended and
-        * everything went OK.
-        */
-       if (!truncate && inode->i_nlink)
+       if (inode->i_nlink)
                ext4_orphan_del(handle, inode);
        ext4_journal_stop(handle);
 
-       if (truncate) {
-truncate:
+       return count;
+}
+
+/*
+ * Clean up the inode after a DIO or DAX extending write has completed and the
+ * inode size has been updated using ext4_handle_inode_extension().
+ */
+static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count)
+{
+       lockdep_assert_held_write(&inode->i_rwsem);
+       if (count < 0) {
                ext4_truncate_failed_write(inode);
                /*
                 * If the truncate operation failed early, then the inode may
@@ -388,9 +346,28 @@ truncate:
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
+               return;
        }
+       /*
+        * If i_disksize got extended due to writeback of delalloc blocks while
+        * the DIO was running, we could fail to clean up the orphan list in
+        * ext4_handle_inode_extension(). Do it now.
+        */
+       if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+               handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
 
-       return written;
+               if (IS_ERR(handle)) {
+                       /*
+                        * The write has successfully completed. Not much to
+                        * do with the error here, so just clean up the orphan
+                        * list and hope for the best.
+                        */
+                       ext4_orphan_del(NULL, inode);
+                       return;
+               }
+               ext4_orphan_del(handle, inode);
+               ext4_journal_stop(handle);
+       }
 }
 
 static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
@@ -399,31 +376,22 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
        loff_t pos = iocb->ki_pos;
        struct inode *inode = file_inode(iocb->ki_filp);
 
+       if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+               error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
        if (error)
                return error;
-
-       if (size && flags & IOMAP_DIO_UNWRITTEN) {
-               error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
-               if (error < 0)
-                       return error;
-       }
        /*
-        * If we are extending the file, we have to update i_size here before
-        * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
-        * buffered reads could zero out too much from page cache pages. Update
-        * of on-disk size will happen later in ext4_dio_write_iter() where
-        * we have enough information to also perform orphan list handling etc.
-        * Note that we perform all extending writes synchronously under
-        * i_rwsem held exclusively so i_size update is safe here in that case.
-        * If the write was not extending, we cannot see pos > i_size here
-        * because operations reducing i_size like truncate wait for all
-        * outstanding DIO before updating i_size.
+        * Note that EXT4_I(inode)->i_disksize can get extended up to
+        * inode->i_size while the I/O was running due to writeback of delalloc
+        * blocks. But the code in ext4_iomap_alloc() is careful to use
+        * zeroed/unwritten extents if this is possible; thus we won't leave
+        * uninitialized blocks in a file even if we didn't succeed in writing
+        * as much as we intended.
         */
-       pos += size;
-       if (pos > i_size_read(inode))
-               i_size_write(inode, pos);
-
-       return 0;
+       WARN_ON_ONCE(i_size_read(inode) < READ_ONCE(EXT4_I(inode)->i_disksize));
+       if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize))
+               return size;
+       return ext4_handle_inode_extension(inode, pos, size);
 }
 
 static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -569,18 +537,20 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
                return ext4_buffered_write_iter(iocb, from);
        }
 
+       /*
+        * Prevent inline data from being created since we are going to allocate
+        * blocks for DIO. We know the inode does not currently have inline data
+        * because ext4_should_use_dio() checked for it, but we have to clear
+        * the state flag before the write checks because a lock cycle could
+        * introduce races with other writers.
+        */
+       ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
        ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
                                    &unwritten, &dio_flags);
        if (ret <= 0)
                return ret;
 
-       /*
-        * Make sure inline data cannot be created anymore since we are going
-        * to allocate blocks for DIO. We know the inode does not have any
-        * inline data now because ext4_dio_supported() checked for that.
-        */
-       ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-
        offset = iocb->ki_pos;
        count = ret;
 
@@ -606,9 +576,16 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
                           dio_flags, NULL, 0);
        if (ret == -ENOTBLK)
                ret = 0;
-
-       if (extend)
-               ret = ext4_handle_inode_extension(inode, offset, ret, count);
+       if (extend) {
+               /*
+                * We always perform extending DIO write synchronously so by
+                * now the IO is completed and ext4_handle_inode_extension()
+                * was called. Cleanup the inode in case of error or race with
+                * writeback of delalloc blocks.
+                */
+               WARN_ON_ONCE(ret == -EIOCBQUEUED);
+               ext4_inode_extension_cleanup(inode, ret);
+       }
 
 out:
        if (ilock_shared)
@@ -689,8 +666,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
        ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
 
-       if (extend)
-               ret = ext4_handle_inode_extension(inode, offset, ret, count);
+       if (extend) {
+               ret = ext4_handle_inode_extension(inode, offset, ret);
+               ext4_inode_extension_cleanup(inode, ret);
+       }
 out:
        inode_unlock(inode);
        if (ret > 0)
index 08cb5c0e0d516b74b0bd448b4d2e00e3fa01cd58..a6838f54ae91698b7fce3e6ecf948a299097f66b 100644 (file)
@@ -789,10 +789,22 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create)
 {
+       int ret = 0;
+
        ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
                   inode->i_ino, create);
-       return _ext4_get_block(inode, iblock, bh_result,
+       ret = _ext4_get_block(inode, iblock, bh_result,
                               EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+
+       /*
+        * If the buffer is marked unwritten, mark it as new to make sure it is
+        * zeroed out correctly in case of partial writes. Otherwise, there is
+        * a chance of stale data getting exposed.
+        */
+       if (ret == 0 && buffer_unwritten(bh_result))
+               set_buffer_new(bh_result);
+
+       return ret;
 }
 
 /* Maximum number of blocks we map for direct IO at once. */
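
The hazard that the set_buffer_new() change closes is easiest to see
with a toy model: if a partial write lands in a block backed by an
unwritten extent and the rest of the block is not zeroed (which is
what marking the buffer "new" guarantees), a later read can observe
whatever bytes were previously on disk. A standalone illustration
(not kernel code; the block size is shrunk to 16 bytes for
readability):

    #include <stdio.h>
    #include <string.h>

    #define BLOCK_SIZE 16

    static void dump(const char *tag, const unsigned char *b)
    {
            printf("%-12s", tag);
            for (int i = 0; i < BLOCK_SIZE; i++)
                    printf(" %02x", b[i]);
            printf("\n");
    }

    int main(void)
    {
            unsigned char block[BLOCK_SIZE];

            /* pretend the block still carries another file's old data */
            memset(block, 0xee, BLOCK_SIZE);

            /* partial write of 4 bytes at the start of the block */
            memcpy(block, "data", 4);
            dump("no zeroing:", block);   /* bytes 4..15 leak 0xee */

            /* a "new" buffer has the uncovered part zeroed first */
            memset(block + 4, 0, BLOCK_SIZE - 4);
            dump("zeroed:", block);
            return 0;
    }
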
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
new file mode 100644 (file)
index 0000000..f94901f
--- /dev/null
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of ext4 multiblock allocation.
+ */
+
+#include <kunit/test.h>
+#include <kunit/static_stub.h>
+
+#include "ext4.h"
+
+struct mbt_grp_ctx {
+       struct buffer_head bitmap_bh;
+       /* desc and gd_bh are just placeholders for now */
+       struct ext4_group_desc desc;
+       struct buffer_head gd_bh;
+};
+
+struct mbt_ctx {
+       struct mbt_grp_ctx *grp_ctx;
+};
+
+struct mbt_ext4_super_block {
+       struct super_block sb;
+       struct mbt_ctx mbt_ctx;
+};
+
+#define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx))
+#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group])
+
+static struct super_block *mbt_ext4_alloc_super_block(void)
+{
+       struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL);
+       struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+       struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+
+       if (fsb == NULL || sbi == NULL || es == NULL)
+               goto out;
+
+       sbi->s_es = es;
+       fsb->sb.s_fs_info = sbi;
+       return &fsb->sb;
+
+out:
+       kfree(fsb);
+       kfree(sbi);
+       kfree(es);
+       return NULL;
+}
+
+static void mbt_ext4_free_super_block(struct super_block *sb)
+{
+       struct mbt_ext4_super_block *fsb =
+               container_of(sb, struct mbt_ext4_super_block, sb);
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       kfree(sbi->s_es);
+       kfree(sbi);
+       kfree(fsb);
+}
+
+struct mbt_ext4_block_layout {
+       unsigned char blocksize_bits;
+       unsigned int cluster_bits;
+       uint32_t blocks_per_group;
+       ext4_group_t group_count;
+       uint16_t desc_size;
+};
+
+static void mbt_init_sb_layout(struct super_block *sb,
+                              struct mbt_ext4_block_layout *layout)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_super_block *es = sbi->s_es;
+
+       sb->s_blocksize = 1UL << layout->blocksize_bits;
+       sb->s_blocksize_bits = layout->blocksize_bits;
+
+       sbi->s_groups_count = layout->group_count;
+       sbi->s_blocks_per_group = layout->blocks_per_group;
+       sbi->s_cluster_bits = layout->cluster_bits;
+       sbi->s_cluster_ratio = 1U << layout->cluster_bits;
+       sbi->s_clusters_per_group = layout->blocks_per_group >>
+                                   layout->cluster_bits;
+       sbi->s_desc_size = layout->desc_size;
+
+       es->s_first_data_block = cpu_to_le32(0);
+       es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group *
+                                           layout->group_count);
+}
+
+static int mbt_grp_ctx_init(struct super_block *sb,
+                           struct mbt_grp_ctx *grp_ctx)
+{
+       grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL);
+       if (grp_ctx->bitmap_bh.b_data == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+
+static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx)
+{
+       kfree(grp_ctx->bitmap_bh.b_data);
+       grp_ctx->bitmap_bh.b_data = NULL;
+}
+
+static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group,
+                             unsigned int start, unsigned int len)
+{
+       struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+       mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len);
+}
+
+/* called after mbt_init_sb_layout */
+static int mbt_ctx_init(struct super_block *sb)
+{
+       struct mbt_ctx *ctx = MBT_CTX(sb);
+       ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+       ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx),
+                              GFP_KERNEL);
+       if (ctx->grp_ctx == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < ngroups; i++)
+               if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i]))
+                       goto out;
+
+       /*
+        * The first data block (the first cluster in the first group) is
+        * used by metadata; mark it used so we never allocate a data block
+        * there, which would fail the ext4_sb_block_valid check.
+        */
+       mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1);
+
+       return 0;
+out:
+       while (i-- > 0)
+               mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+       kfree(ctx->grp_ctx);
+       return -ENOMEM;
+}
+
+static void mbt_ctx_release(struct super_block *sb)
+{
+       struct mbt_ctx *ctx = MBT_CTX(sb);
+       ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+       for (i = 0; i < ngroups; i++)
+               mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+       kfree(ctx->grp_ctx);
+}
+
+static struct buffer_head *
+ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group,
+                                  bool ignore_locked)
+{
+       struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+       /* paired with brelse from caller of ext4_read_block_bitmap_nowait */
+       get_bh(&grp_ctx->bitmap_bh);
+       return &grp_ctx->bitmap_bh;
+}
+
+static int ext4_wait_block_bitmap_stub(struct super_block *sb,
+                                      ext4_group_t block_group,
+                                      struct buffer_head *bh)
+{
+       return 0;
+}
+
+static struct ext4_group_desc *
+ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group,
+                        struct buffer_head **bh)
+{
+       struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+       if (bh != NULL)
+               *bh = &grp_ctx->gd_bh;
+
+       return &grp_ctx->desc;
+}
+
+static int
+ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state,
+                         ext4_group_t group, ext4_grpblk_t blkoff,
+                         ext4_grpblk_t len, int flags,
+                         ext4_grpblk_t *ret_changed)
+{
+       struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+       struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh;
+
+       if (state)
+               mb_set_bits(bitmap_bh->b_data, blkoff, len);
+       else
+               mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+
+       return 0;
+}
+
+#define TEST_GOAL_GROUP 1
+static int mbt_kunit_init(struct kunit *test)
+{
+       struct mbt_ext4_block_layout *layout =
+               (struct mbt_ext4_block_layout *)(test->param_value);
+       struct super_block *sb;
+       int ret;
+
+       sb = mbt_ext4_alloc_super_block();
+       if (sb == NULL)
+               return -ENOMEM;
+
+       mbt_init_sb_layout(sb, layout);
+
+       ret = mbt_ctx_init(sb);
+       if (ret != 0) {
+               mbt_ext4_free_super_block(sb);
+               return ret;
+       }
+
+       test->priv = sb;
+       kunit_activate_static_stub(test,
+                                  ext4_read_block_bitmap_nowait,
+                                  ext4_read_block_bitmap_nowait_stub);
+       kunit_activate_static_stub(test,
+                                  ext4_wait_block_bitmap,
+                                  ext4_wait_block_bitmap_stub);
+       kunit_activate_static_stub(test,
+                                  ext4_get_group_desc,
+                                  ext4_get_group_desc_stub);
+       kunit_activate_static_stub(test,
+                                  ext4_mb_mark_context,
+                                  ext4_mb_mark_context_stub);
+       return 0;
+}
+
+static void mbt_kunit_exit(struct kunit *test)
+{
+       struct super_block *sb = (struct super_block *)test->priv;
+
+       mbt_ctx_release(sb);
+       mbt_ext4_free_super_block(sb);
+}
+
+static void test_new_blocks_simple(struct kunit *test)
+{
+       struct super_block *sb = (struct super_block *)test->priv;
+       struct inode inode = { .i_sb = sb, };
+       struct ext4_allocation_request ar;
+       ext4_group_t i, goal_group = TEST_GOAL_GROUP;
+       int err = 0;
+       ext4_fsblk_t found;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       ar.inode = &inode;
+
+       /* get block at goal */
+       ar.goal = ext4_group_first_block_no(sb, goal_group);
+       found = ext4_mb_new_blocks_simple(&ar, &err);
+       KUNIT_ASSERT_EQ_MSG(test, ar.goal, found,
+               "failed to alloc block at goal, expected %llu found %llu",
+               ar.goal, found);
+
+       /* get block after goal in goal group */
+       ar.goal = ext4_group_first_block_no(sb, goal_group);
+       found = ext4_mb_new_blocks_simple(&ar, &err);
+       KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found,
+               "failed to alloc block after goal in goal group, expected %llu found %llu",
+               ar.goal + EXT4_C2B(sbi, 1), found);
+
+       /* get block after goal group */
+       mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+       ar.goal = ext4_group_first_block_no(sb, goal_group);
+       found = ext4_mb_new_blocks_simple(&ar, &err);
+       KUNIT_ASSERT_EQ_MSG(test,
+               ext4_group_first_block_no(sb, goal_group + 1), found,
+               "failed to alloc block after goal group, expected %llu found %llu",
+               ext4_group_first_block_no(sb, goal_group + 1), found);
+
+       /* get block before goal group */
+       for (i = goal_group; i < ext4_get_groups_count(sb); i++)
+               mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+       ar.goal = ext4_group_first_block_no(sb, goal_group);
+       found = ext4_mb_new_blocks_simple(&ar, &err);
+       KUNIT_ASSERT_EQ_MSG(test,
+               ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found,
+               "failed to alloc block before goal group, expected %llu found %llu",
+               ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found);
+
+       /* no block available, fail to allocate block */
+       for (i = 0; i < ext4_get_groups_count(sb); i++)
+               mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+       ar.goal = ext4_group_first_block_no(sb, goal_group);
+       found = ext4_mb_new_blocks_simple(&ar, &err);
+       KUNIT_ASSERT_NE_MSG(test, err, 0,
+               "unexpectedly get block when no block is available");
+}
+
+static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
+       {
+               .blocksize_bits = 10,
+               .cluster_bits = 3,
+               .blocks_per_group = 8192,
+               .group_count = 4,
+               .desc_size = 64,
+       },
+       {
+               .blocksize_bits = 12,
+               .cluster_bits = 3,
+               .blocks_per_group = 8192,
+               .group_count = 4,
+               .desc_size = 64,
+       },
+       {
+               .blocksize_bits = 16,
+               .cluster_bits = 3,
+               .blocks_per_group = 8192,
+               .group_count = 4,
+               .desc_size = 64,
+       },
+};
+
+static void mbt_show_layout(const struct mbt_ext4_block_layout *layout,
+                           char *desc)
+{
+       snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d "
+                "blocks_per_group=%d group_count=%d desc_size=%d\n",
+                layout->blocksize_bits, layout->cluster_bits,
+                layout->blocks_per_group, layout->group_count,
+                layout->desc_size);
+}
+KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout);
+
+static struct kunit_case mbt_test_cases[] = {
+       KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params),
+       {}
+};
+
+static struct kunit_suite mbt_test_suite = {
+       .name = "ext4_mballoc_test",
+       .init = mbt_kunit_init,
+       .exit = mbt_kunit_exit,
+       .test_cases = mbt_test_cases,
+};
+
+kunit_test_suites(&mbt_test_suite);
+
+MODULE_LICENSE("GPL");
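
With CONFIG_EXT4_KUNIT_TESTS enabled, this suite runs under the usual
KUnit wrapper, e.g. "./tools/testing/kunit/kunit.py run
ext4_mballoc_test" (shown as a typical invocation, not taken from this
merge).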
index 1e599305d85fa2b0a4356c2ac9cd4eef905ab456..454d5612641ee3c32e71114e4d6148c52b484459 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
 #include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
 
 /*
  * MUSTDO:
@@ -417,8 +418,6 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
 
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
-                                               ext4_group_t group);
 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
 
 static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
@@ -1361,17 +1360,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
                 * We place the buddy block and bitmap block
                 * close together
                 */
+               grinfo = ext4_get_group_info(sb, group);
+               if (!grinfo) {
+                       err = -EFSCORRUPTED;
+                       goto out;
+               }
                if ((first_block + i) & 1) {
                        /* this is block of buddy */
                        BUG_ON(incore == NULL);
                        mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
                                group, page->index, i * blocksize);
                        trace_ext4_mb_buddy_bitmap_load(sb, group);
-                       grinfo = ext4_get_group_info(sb, group);
-                       if (!grinfo) {
-                               err = -EFSCORRUPTED;
-                               goto out;
-                       }
                        grinfo->bb_fragments = 0;
                        memset(grinfo->bb_counters, 0,
                               sizeof(*grinfo->bb_counters) *
@@ -1398,7 +1397,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 
                        /* mark all preallocated blks used in in-core bitmap */
                        ext4_mb_generate_from_pa(sb, data, group);
-                       ext4_mb_generate_from_freelist(sb, data, group);
+                       WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root));
                        ext4_unlock_group(sb, group);
 
                        /* set incore so that the buddy information can be
@@ -3631,7 +3630,8 @@ int ext4_mb_init(struct super_block *sb)
 
        spin_lock_init(&sbi->s_md_lock);
        sbi->s_mb_free_pending = 0;
-       INIT_LIST_HEAD(&sbi->s_freed_data_list);
+       INIT_LIST_HEAD(&sbi->s_freed_data_list[0]);
+       INIT_LIST_HEAD(&sbi->s_freed_data_list[1]);
        INIT_LIST_HEAD(&sbi->s_discard_list);
        INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
        atomic_set(&sbi->s_retry_alloc_pending, 0);
@@ -3883,19 +3883,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_free_data *entry, *tmp;
        LIST_HEAD(freed_data_list);
-       struct list_head *cut_pos = NULL;
+       struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1];
        bool wake;
 
-       spin_lock(&sbi->s_md_lock);
-       list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
-               if (entry->efd_tid != commit_tid)
-                       break;
-               cut_pos = &entry->efd_list;
-       }
-       if (cut_pos)
-               list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
-                                 cut_pos);
-       spin_unlock(&sbi->s_md_lock);
+       list_replace_init(s_freed_head, &freed_data_list);
 
        list_for_each_entry(entry, &freed_data_list, efd_list)
                ext4_free_data_in_buddy(sb, entry);
@@ -3953,6 +3944,111 @@ void ext4_exit_mballoc(void)
        ext4_groupinfo_destroy_slabs();
 }
 
+#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001
+#define EXT4_MB_SYNC_UPDATE 0x0002
+static int
+ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
+                    ext4_group_t group, ext4_grpblk_t blkoff,
+                    ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct buffer_head *bitmap_bh = NULL;
+       struct ext4_group_desc *gdp;
+       struct buffer_head *gdp_bh;
+       int err;
+       unsigned int i, already, changed = len;
+
+       KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context,
+                                  handle, sb, state, group, blkoff, len,
+                                  flags, ret_changed);
+
+       if (ret_changed)
+               *ret_changed = 0;
+       bitmap_bh = ext4_read_block_bitmap(sb, group);
+       if (IS_ERR(bitmap_bh))
+               return PTR_ERR(bitmap_bh);
+
+       if (handle) {
+               BUFFER_TRACE(bitmap_bh, "getting write access");
+               err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+                                                   EXT4_JTR_NONE);
+               if (err)
+                       goto out_err;
+       }
+
+       err = -EIO;
+       gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+       if (!gdp)
+               goto out_err;
+
+       if (handle) {
+               BUFFER_TRACE(gdp_bh, "get_write_access");
+               err = ext4_journal_get_write_access(handle, sb, gdp_bh,
+                                                   EXT4_JTR_NONE);
+               if (err)
+                       goto out_err;
+       }
+
+       ext4_lock_group(sb, group);
+       if (ext4_has_group_desc_csum(sb) &&
+           (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+               gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+               ext4_free_group_clusters_set(sb, gdp,
+                       ext4_free_clusters_after_init(sb, group, gdp));
+       }
+
+       if (flags & EXT4_MB_BITMAP_MARKED_CHECK) {
+               already = 0;
+               for (i = 0; i < len; i++)
+                       if (mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+                                       state)
+                               already++;
+               changed = len - already;
+       }
+
+       if (state) {
+               mb_set_bits(bitmap_bh->b_data, blkoff, len);
+               ext4_free_group_clusters_set(sb, gdp,
+                       ext4_free_group_clusters(sb, gdp) - changed);
+       } else {
+               mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+               ext4_free_group_clusters_set(sb, gdp,
+                       ext4_free_group_clusters(sb, gdp) + changed);
+       }
+
+       ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
+       ext4_group_desc_csum_set(sb, group, gdp);
+       ext4_unlock_group(sb, group);
+       if (ret_changed)
+               *ret_changed = changed;
+
+       if (sbi->s_log_groups_per_flex) {
+               ext4_group_t flex_group = ext4_flex_group(sbi, group);
+               struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+                                          s_flex_groups, flex_group);
+
+               if (state)
+                       atomic64_sub(changed, &fg->free_clusters);
+               else
+                       atomic64_add(changed, &fg->free_clusters);
+       }
+
+       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+       if (err)
+               goto out_err;
+       err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
+       if (err)
+               goto out_err;
+
+       if (flags & EXT4_MB_SYNC_UPDATE) {
+               sync_dirty_buffer(bitmap_bh);
+               sync_dirty_buffer(gdp_bh);
+       }
+
+out_err:
+       brelse(bitmap_bh);
+       return err;
+}
 
 /*
  * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
@@ -3962,13 +4058,13 @@ static noinline_for_stack int
 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                                handle_t *handle, unsigned int reserv_clstrs)
 {
-       struct buffer_head *bitmap_bh = NULL;
        struct ext4_group_desc *gdp;
-       struct buffer_head *gdp_bh;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        ext4_fsblk_t block;
        int err, len;
+       int flags = 0;
+       ext4_grpblk_t changed;
 
        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(ac->ac_b_ex.fe_len <= 0);
@@ -3976,32 +4072,13 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        sb = ac->ac_sb;
        sbi = EXT4_SB(sb);
 
-       bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
-       if (IS_ERR(bitmap_bh)) {
-               return PTR_ERR(bitmap_bh);
-       }
-
-       BUFFER_TRACE(bitmap_bh, "getting write access");
-       err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
-                                           EXT4_JTR_NONE);
-       if (err)
-               goto out_err;
-
-       err = -EIO;
-       gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
+       gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL);
        if (!gdp)
-               goto out_err;
-
+               return -EIO;
        ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
                        ext4_free_group_clusters(sb, gdp));
 
-       BUFFER_TRACE(gdp_bh, "get_write_access");
-       err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
-       if (err)
-               goto out_err;
-
        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
-
        len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
        if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
@@ -4010,41 +4087,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                 * Fix the bitmap and return EFSCORRUPTED
                 * We leak some of the blocks here.
                 */
-               ext4_lock_group(sb, ac->ac_b_ex.fe_group);
-               mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
-                             ac->ac_b_ex.fe_len);
-               ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
-               err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+               err = ext4_mb_mark_context(handle, sb, true,
+                                          ac->ac_b_ex.fe_group,
+                                          ac->ac_b_ex.fe_start,
+                                          ac->ac_b_ex.fe_len,
+                                          0, NULL);
                if (!err)
                        err = -EFSCORRUPTED;
-               goto out_err;
+               return err;
        }
 
-       ext4_lock_group(sb, ac->ac_b_ex.fe_group);
 #ifdef AGGRESSIVE_CHECK
-       {
-               int i;
-               for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
-                       BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
-                                               bitmap_bh->b_data));
-               }
-       }
+       flags |= EXT4_MB_BITMAP_MARKED_CHECK;
 #endif
-       mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
-                     ac->ac_b_ex.fe_len);
-       if (ext4_has_group_desc_csum(sb) &&
-           (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
-               gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-               ext4_free_group_clusters_set(sb, gdp,
-                                            ext4_free_clusters_after_init(sb,
-                                               ac->ac_b_ex.fe_group, gdp));
-       }
-       len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
-       ext4_free_group_clusters_set(sb, gdp, len);
-       ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
-       ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
+       err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group,
+                                  ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len,
+                                  flags, &changed);
+
+       if (err && changed == 0)
+               return err;
 
-       ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+#ifdef AGGRESSIVE_CHECK
+       BUG_ON(changed != ac->ac_b_ex.fe_len);
+#endif
        percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
        /*
         * Now reduce the dirty block count also. Should not go negative
@@ -4054,21 +4119,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                percpu_counter_sub(&sbi->s_dirtyclusters_counter,
                                   reserv_clstrs);
 
-       if (sbi->s_log_groups_per_flex) {
-               ext4_group_t flex_group = ext4_flex_group(sbi,
-                                                         ac->ac_b_ex.fe_group);
-               atomic64_sub(ac->ac_b_ex.fe_len,
-                            &sbi_array_rcu_deref(sbi, s_flex_groups,
-                                                 flex_group)->free_clusters);
-       }
-
-       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-       if (err)
-               goto out_err;
-       err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
-
-out_err:
-       brelse(bitmap_bh);
        return err;
 }
 
@@ -4077,17 +4127,13 @@ out_err:
  * blocks in bitmaps and update counters.
  */
 void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
-                       int len, int state)
+                    int len, bool state)
 {
-       struct buffer_head *bitmap_bh = NULL;
-       struct ext4_group_desc *gdp;
-       struct buffer_head *gdp_bh;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t group;
        ext4_grpblk_t blkoff;
-       int i, err = 0;
-       int already;
-       unsigned int clen, clen_changed, thisgrp_len;
+       int err = 0;
+       unsigned int clen, thisgrp_len;
 
        while (len > 0) {
                ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
@@ -4108,80 +4154,21 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
                        ext4_error(sb, "Marking blocks in system zone - "
                                   "Block = %llu, len = %u",
                                   block, thisgrp_len);
-                       bitmap_bh = NULL;
-                       break;
-               }
-
-               bitmap_bh = ext4_read_block_bitmap(sb, group);
-               if (IS_ERR(bitmap_bh)) {
-                       err = PTR_ERR(bitmap_bh);
-                       bitmap_bh = NULL;
                        break;
                }
 
-               err = -EIO;
-               gdp = ext4_get_group_desc(sb, group, &gdp_bh);
-               if (!gdp)
-                       break;
-
-               ext4_lock_group(sb, group);
-               already = 0;
-               for (i = 0; i < clen; i++)
-                       if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
-                                        !state)
-                               already++;
-
-               clen_changed = clen - already;
-               if (state)
-                       mb_set_bits(bitmap_bh->b_data, blkoff, clen);
-               else
-                       mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
-               if (ext4_has_group_desc_csum(sb) &&
-                   (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
-                       gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-                       ext4_free_group_clusters_set(sb, gdp,
-                            ext4_free_clusters_after_init(sb, group, gdp));
-               }
-               if (state)
-                       clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
-               else
-                       clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
-
-               ext4_free_group_clusters_set(sb, gdp, clen);
-               ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
-               ext4_group_desc_csum_set(sb, group, gdp);
-
-               ext4_unlock_group(sb, group);
-
-               if (sbi->s_log_groups_per_flex) {
-                       ext4_group_t flex_group = ext4_flex_group(sbi, group);
-                       struct flex_groups *fg = sbi_array_rcu_deref(sbi,
-                                                  s_flex_groups, flex_group);
-
-                       if (state)
-                               atomic64_sub(clen_changed, &fg->free_clusters);
-                       else
-                               atomic64_add(clen_changed, &fg->free_clusters);
-
-               }
-
-               err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
-               if (err)
-                       break;
-               sync_dirty_buffer(bitmap_bh);
-               err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
-               sync_dirty_buffer(gdp_bh);
+               err = ext4_mb_mark_context(NULL, sb, state,
+                                          group, blkoff, clen,
+                                          EXT4_MB_BITMAP_MARKED_CHECK |
+                                          EXT4_MB_SYNC_UPDATE,
+                                          NULL);
                if (err)
                        break;
 
                block += thisgrp_len;
                len -= thisgrp_len;
-               brelse(bitmap_bh);
                BUG_ON(len < 0);
        }
-
-       if (err)
-               brelse(bitmap_bh);
 }
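
The loop above clamps each pass to the current block group, since a caller's range may straddle group boundaries. A condensed sketch of the per-iteration bookkeeping (illustrative; validation and the marking call are elided):

    while (len > 0) {
            ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
            /* do not run past the end of this group */
            thisgrp_len = min_t(unsigned int, len,
                    EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
            clen = EXT4_NUM_B2C(sbi, thisgrp_len);
            /* ... ext4_mb_mark_context() on [blkoff, blkoff + clen) ... */
            block += thisgrp_len;
            len -= thisgrp_len;
    }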
 
 /*
@@ -4958,31 +4945,6 @@ try_group_pa:
        return false;
 }
 
-/*
- * the function goes through all block freed in the group
- * but not yet committed and marks them used in in-core bitmap.
- * buddy must be generated from this bitmap
- * Need to be called with the ext4 group lock held
- */
-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
-                                               ext4_group_t group)
-{
-       struct rb_node *n;
-       struct ext4_group_info *grp;
-       struct ext4_free_data *entry;
-
-       grp = ext4_get_group_info(sb, group);
-       if (!grp)
-               return;
-       n = rb_first(&(grp->bb_free_root));
-
-       while (n) {
-               entry = rb_entry(n, struct ext4_free_data, efd_node);
-               mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
-               n = rb_next(n);
-       }
-}
-
 /*
  * the function goes through all preallocation in this group and marks them
  * used in in-core bitmap. buddy must be generated from this bitmap
@@ -6130,7 +6092,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
        }
 
        block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
-       ext4_mb_mark_bb(sb, block, 1, 1);
+       ext4_mb_mark_bb(sb, block, 1, true);
        ar->len = 1;
 
        return block;
@@ -6378,7 +6340,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        }
 
        spin_lock(&sbi->s_md_lock);
-       list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
+       list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
        sbi->s_mb_free_pending += clusters;
        spin_unlock(&sbi->s_md_lock);
 }
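
One note on the list indexing above (our reading of this series, stated tentatively): s_freed_data_list becomes a two-element array binned by transaction ID parity, so the commit path can drain the committing transaction's bin while concurrent frees append to the other, trimming hold times on the s_md_lock spinlock. The commit-side lookup would be the mirror image:

    /* sketch: the committing transaction drains only its own bin */
    struct list_head *bin = &sbi->s_freed_data_list[commit_tid & 1];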
@@ -6386,43 +6348,15 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
                                        unsigned long count)
 {
-       struct buffer_head *bitmap_bh;
        struct super_block *sb = inode->i_sb;
-       struct ext4_group_desc *gdp;
-       struct buffer_head *gdp_bh;
        ext4_group_t group;
        ext4_grpblk_t blkoff;
-       int already_freed = 0, err, i;
 
        ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
-       bitmap_bh = ext4_read_block_bitmap(sb, group);
-       if (IS_ERR(bitmap_bh)) {
-               pr_warn("Failed to read block bitmap\n");
-               return;
-       }
-       gdp = ext4_get_group_desc(sb, group, &gdp_bh);
-       if (!gdp)
-               goto err_out;
-
-       for (i = 0; i < count; i++) {
-               if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
-                       already_freed++;
-       }
-       mb_clear_bits(bitmap_bh->b_data, blkoff, count);
-       err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
-       if (err)
-               goto err_out;
-       ext4_free_group_clusters_set(
-               sb, gdp, ext4_free_group_clusters(sb, gdp) +
-               count - already_freed);
-       ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
-       ext4_group_desc_csum_set(sb, group, gdp);
-       ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
-       sync_dirty_buffer(bitmap_bh);
-       sync_dirty_buffer(gdp_bh);
-
-err_out:
-       brelse(bitmap_bh);
+       ext4_mb_mark_context(NULL, sb, false, group, blkoff, count,
+                            EXT4_MB_BITMAP_MARKED_CHECK |
+                            EXT4_MB_SYNC_UPDATE,
+                            NULL);
 }
 
 /**
@@ -6438,19 +6372,17 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
                               ext4_fsblk_t block, unsigned long count,
                               int flags)
 {
-       struct buffer_head *bitmap_bh = NULL;
        struct super_block *sb = inode->i_sb;
-       struct ext4_group_desc *gdp;
        struct ext4_group_info *grp;
        unsigned int overflow;
        ext4_grpblk_t bit;
-       struct buffer_head *gd_bh;
        ext4_group_t block_group;
        struct ext4_sb_info *sbi;
        struct ext4_buddy e4b;
        unsigned int count_clusters;
        int err = 0;
-       int ret;
+       int mark_flags = 0;
+       ext4_grpblk_t changed;
 
        sbi = EXT4_SB(sb);
 
@@ -6459,7 +6391,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
                ext4_error(sb, "Freeing blocks in system zone - "
                           "Block = %llu, count = %lu", block, count);
                /* err = 0. ext4_std_error should be a no op */
-               goto error_return;
+               goto error_out;
        }
        flags |= EXT4_FREE_BLOCKS_VALIDATED;
 
@@ -6483,55 +6415,35 @@ do_more:
                flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
        }
        count_clusters = EXT4_NUM_B2C(sbi, count);
-       bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-       if (IS_ERR(bitmap_bh)) {
-               err = PTR_ERR(bitmap_bh);
-               bitmap_bh = NULL;
-               goto error_return;
-       }
-       gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
-       if (!gdp) {
-               err = -EIO;
-               goto error_return;
-       }
+       trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+
+       /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
+       err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
+                                    GFP_NOFS|__GFP_NOFAIL);
+       if (err)
+               goto error_out;
 
        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
            !ext4_inode_block_valid(inode, block, count)) {
                ext4_error(sb, "Freeing blocks in system zone - "
                           "Block = %llu, count = %lu", block, count);
                /* err = 0. ext4_std_error should be a no op */
-               goto error_return;
+               goto error_clean;
        }
 
-       BUFFER_TRACE(bitmap_bh, "getting write access");
-       err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
-                                           EXT4_JTR_NONE);
-       if (err)
-               goto error_return;
-
-       /*
-        * We are about to modify some metadata.  Call the journal APIs
-        * to unshare ->b_data if a currently-committing transaction is
-        * using it
-        */
-       BUFFER_TRACE(gd_bh, "get_write_access");
-       err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
-       if (err)
-               goto error_return;
 #ifdef AGGRESSIVE_CHECK
-       {
-               int i;
-               for (i = 0; i < count_clusters; i++)
-                       BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
-       }
+       mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK;
 #endif
-       trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+       err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+                                  count_clusters, mark_flags, &changed);
 
-       /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
-       err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
-                                    GFP_NOFS|__GFP_NOFAIL);
-       if (err)
-               goto error_return;
+       if (err && changed == 0)
+               goto error_clean;
+
+#ifdef AGGRESSIVE_CHECK
+       BUG_ON(changed != count_clusters);
+#endif
 
        /*
         * We need to make sure we don't reuse the freed block until after the
@@ -6555,13 +6467,8 @@ do_more:
                new_entry->efd_tid = handle->h_transaction->t_tid;
 
                ext4_lock_group(sb, block_group);
-               mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
                ext4_mb_free_metadata(handle, &e4b, new_entry);
        } else {
-               /* need to update group_info->bb_free and bitmap
-                * with group lock held. generate_buddy look at
-                * them with group lock_held
-                */
                if (test_opt(sb, DISCARD)) {
                        err = ext4_issue_discard(sb, block_group, bit,
                                                 count_clusters, NULL);
@@ -6574,23 +6481,11 @@ do_more:
                        EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
 
                ext4_lock_group(sb, block_group);
-               mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
                mb_free_blocks(inode, &e4b, bit, count_clusters);
        }
 
-       ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
-       ext4_free_group_clusters_set(sb, gdp, ret);
-       ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
-       ext4_group_desc_csum_set(sb, block_group, gdp);
        ext4_unlock_group(sb, block_group);
 
-       if (sbi->s_log_groups_per_flex) {
-               ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic64_add(count_clusters,
-                            &sbi_array_rcu_deref(sbi, s_flex_groups,
-                                                 flex_group)->free_clusters);
-       }
-
        /*
         * on a bigalloc file system, defer the s_freeclusters_counter
         * update to the caller (ext4_remove_space and friends) so they
@@ -6603,28 +6498,18 @@ do_more:
                                   count_clusters);
        }
 
-       ext4_mb_unload_buddy(&e4b);
-
-       /* We dirtied the bitmap block */
-       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
-       /* And the group descriptor block */
-       BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-       ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-       if (!err)
-               err = ret;
-
        if (overflow && !err) {
                block += count;
                count = overflow;
-               put_bh(bitmap_bh);
+               ext4_mb_unload_buddy(&e4b);
                /* The range changed so it's no longer validated */
                flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
                goto do_more;
        }
-error_return:
-       brelse(bitmap_bh);
+
+error_clean:
+       ext4_mb_unload_buddy(&e4b);
+error_out:
        ext4_std_error(sb, err);
 }
 
@@ -6742,23 +6627,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
                         ext4_fsblk_t block, unsigned long count)
 {
-       struct buffer_head *bitmap_bh = NULL;
-       struct buffer_head *gd_bh;
        ext4_group_t block_group;
        ext4_grpblk_t bit;
-       unsigned int i;
-       struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_buddy e4b;
-       int err = 0, ret, free_clusters_count;
-       ext4_grpblk_t clusters_freed;
+       int err = 0;
        ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
        ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
        unsigned long cluster_count = last_cluster - first_cluster + 1;
+       ext4_grpblk_t changed;
 
        ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
 
-       if (count == 0)
+       if (cluster_count == 0)
                return 0;
 
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -6770,99 +6651,39 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
                ext4_warning(sb, "too many blocks added to group %u",
                             block_group);
                err = -EINVAL;
-               goto error_return;
-       }
-
-       bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-       if (IS_ERR(bitmap_bh)) {
-               err = PTR_ERR(bitmap_bh);
-               bitmap_bh = NULL;
-               goto error_return;
+               goto error_out;
        }
 
-       desc = ext4_get_group_desc(sb, block_group, &gd_bh);
-       if (!desc) {
-               err = -EIO;
-               goto error_return;
-       }
+       err = ext4_mb_load_buddy(sb, block_group, &e4b);
+       if (err)
+               goto error_out;
 
        if (!ext4_sb_block_valid(sb, NULL, block, count)) {
                ext4_error(sb, "Adding blocks in system zones - "
                           "Block = %llu, count = %lu",
                           block, count);
                err = -EINVAL;
-               goto error_return;
+               goto error_clean;
        }
 
-       BUFFER_TRACE(bitmap_bh, "getting write access");
-       err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
-                                           EXT4_JTR_NONE);
-       if (err)
-               goto error_return;
-
-       /*
-        * We are about to modify some metadata.  Call the journal APIs
-        * to unshare ->b_data if a currently-committing transaction is
-        * using it
-        */
-       BUFFER_TRACE(gd_bh, "get_write_access");
-       err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
-       if (err)
-               goto error_return;
-
-       for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
-               BUFFER_TRACE(bitmap_bh, "clear bit");
-               if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
-                       ext4_error(sb, "bit already cleared for block %llu",
-                                  (ext4_fsblk_t)(block + i));
-                       BUFFER_TRACE(bitmap_bh, "bit already cleared");
-               } else {
-                       clusters_freed++;
-               }
-       }
+       err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+                                  cluster_count, EXT4_MB_BITMAP_MARKED_CHECK,
+                                  &changed);
+       if (err && changed == 0)
+               goto error_clean;
 
-       err = ext4_mb_load_buddy(sb, block_group, &e4b);
-       if (err)
-               goto error_return;
+       if (changed != cluster_count)
+               ext4_error(sb, "bit already cleared in group %u", block_group);
 
-       /*
-        * need to update group_info->bb_free and bitmap
-        * with group lock held. generate_buddy look at
-        * them with group lock_held
-        */
        ext4_lock_group(sb, block_group);
-       mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
        mb_free_blocks(NULL, &e4b, bit, cluster_count);
-       free_clusters_count = clusters_freed +
-               ext4_free_group_clusters(sb, desc);
-       ext4_free_group_clusters_set(sb, desc, free_clusters_count);
-       ext4_block_bitmap_csum_set(sb, desc, bitmap_bh);
-       ext4_group_desc_csum_set(sb, block_group, desc);
        ext4_unlock_group(sb, block_group);
        percpu_counter_add(&sbi->s_freeclusters_counter,
-                          clusters_freed);
-
-       if (sbi->s_log_groups_per_flex) {
-               ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic64_add(clusters_freed,
-                            &sbi_array_rcu_deref(sbi, s_flex_groups,
-                                                 flex_group)->free_clusters);
-       }
+                          changed);
 
+error_clean:
        ext4_mb_unload_buddy(&e4b);
-
-       /* We dirtied the bitmap block */
-       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
-       /* And the group descriptor block */
-       BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-       ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-       if (!err)
-               err = ret;
-
-error_return:
-       brelse(bitmap_bh);
+error_out:
        ext4_std_error(sb, err);
        return err;
 }
@@ -7170,3 +6991,7 @@ out_unload:
 
        return error;
 }
+
+#ifdef CONFIG_EXT4_KUNIT_TESTS
+#include "mballoc-test.c"
+#endif
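Including the test file at the bottom of mballoc.c compiles the KUnit cases into the same translation unit, giving them access to static mballoc helpers. A skeletal shape for such a test file (illustrative; the names are placeholders, the real mballoc-test.c is added by this series):

    #include <kunit/test.h>

    static void test_new_blocks_simple(struct kunit *test)
    {
            /* static mballoc symbols are directly visible here */
            KUNIT_EXPECT_EQ(test, 0, 0);
    }

    static struct kunit_case mballoc_test_cases[] = {
            KUNIT_CASE(test_new_blocks_simple),
            {}
    };

    static struct kunit_suite mballoc_test_suite = {
            .name = "ext4_mballoc_test",
            .test_cases = mballoc_test_cases,
    };

    kunit_test_suites(&mballoc_test_suite);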
index 057d744672935f3eed3a1115bbf81a8e020050a0..d252935f9c8abbf72217451398287919b69d4c09 100644 (file)
@@ -2280,8 +2280,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
        top = data2 + len;
        while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
                if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
-                                        (data2 + (blocksize - csum_size) -
-                                         (char *) de))) {
+                                       (char *)de - data2)) {
                        brelse(bh2);
                        brelse(bh);
                        return -EFSCORRUPTED;
index 0361c20910def732bce78888e7c71ab97da63a76..4fe061edefddecc0fa1ac522bf8a2ff216315380 100644 (file)
@@ -10,8 +10,6 @@
  */
 
 
-#define EXT4FS_DEBUG
-
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/jiffies.h>
@@ -57,7 +55,7 @@ int ext4_resize_begin(struct super_block *sb)
         * If the reserved GDT blocks is non-zero, the resize_inode feature
         * should always be set.
         */
-       if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks &&
+       if (sbi->s_es->s_reserved_gdt_blocks &&
            !ext4_has_feature_resize_inode(sb)) {
                ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
                return -EFSCORRUPTED;
@@ -69,9 +67,9 @@ int ext4_resize_begin(struct super_block *sb)
          * bad time to do it anyways.
          */
        if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) !=
-           le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+           le32_to_cpu(sbi->s_es->s_first_data_block)) {
                ext4_warning(sb, "won't resize using backup superblock at %llu",
-                       (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+                       (unsigned long long)sbi->s_sbh->b_blocknr);
                return -EPERM;
        }
 
@@ -79,7 +77,7 @@ int ext4_resize_begin(struct super_block *sb)
         * We are not allowed to do online-resizing on a filesystem mounted
         * with error, because it can destroy the filesystem easily.
         */
-       if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+       if (sbi->s_mount_state & EXT4_ERROR_FS) {
                ext4_warning(sb, "There are errors in the filesystem, "
                             "so online resizing is not allowed");
                return -EPERM;
@@ -91,7 +89,7 @@ int ext4_resize_begin(struct super_block *sb)
        }
 
        if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
-                                 &EXT4_SB(sb)->s_ext4_flags))
+                                 &sbi->s_ext4_flags))
                ret = -EBUSY;
 
        return ret;
@@ -106,18 +104,6 @@ int ext4_resize_end(struct super_block *sb, bool update_backups)
        return 0;
 }
 
-static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
-                                            ext4_group_t group) {
-       return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
-              EXT4_DESC_PER_BLOCK_BITS(sb);
-}
-
-static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
-                                            ext4_group_t group) {
-       group = ext4_meta_bg_first_group(sb, group);
-       return ext4_group_first_block_no(sb, group);
-}
-
 static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
                                                ext4_group_t group) {
        ext4_grpblk_t overhead;
@@ -154,8 +140,9 @@ static int verify_group_input(struct super_block *sb,
 
        overhead = ext4_group_overhead_blocks(sb, group);
        metaend = start + overhead;
-       input->free_clusters_count = free_blocks_count =
-               input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+       free_blocks_count = input->blocks_count - 2 - overhead -
+                           sbi->s_itb_per_group;
+       input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count);
 
        if (test_opt(sb, DEBUG))
                printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
@@ -460,8 +447,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
 
        ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster,
                   last_cluster);
-       for (count2 = count; count > 0;
-            count -= count2, first_cluster += count2) {
+       for (; count > 0; count -= count2, first_cluster += count2) {
                ext4_fsblk_t start;
                struct buffer_head *bh;
                ext4_group_t group;
@@ -560,13 +546,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
                if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
                        goto handle_itb;
 
-               if (meta_bg == 1) {
-                       ext4_group_t first_group;
-                       first_group = ext4_meta_bg_first_group(sb, group);
-                       if (first_group != group + 1 &&
-                           first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
-                               goto handle_itb;
-               }
+               if (meta_bg == 1)
+                       goto handle_itb;
 
                block = start + ext4_bg_has_super(sb, group);
                /* Copy all of the GDT blocks into the backup in this group */
@@ -614,7 +595,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
                }
 
 handle_itb:
-               /* Initialize group tables of the grop @group */
+               /* Initialize group tables of the group @group */
                if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
                        goto handle_bb;
 
@@ -704,16 +685,14 @@ handle_ib:
                        block = start;
                }
 
-               if (count) {
-                       err = set_flexbg_block_bitmap(sb, handle,
-                                                     flex_gd,
-                                                     EXT4_B2C(sbi, start),
-                                                     EXT4_B2C(sbi,
-                                                              start + count
-                                                              - 1));
-                       if (err)
-                               goto out;
-               }
+               err = set_flexbg_block_bitmap(sb, handle, flex_gd,
+                                             EXT4_B2C(sbi, start),
+                                             EXT4_B2C(sbi, start + count - 1));
+               if (err)
+                       goto out;
        }
 
 out:
@@ -952,7 +931,13 @@ errout:
 }
 
 /*
- * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ * If there is no space left in the existing block group descriptors for
+ * the new block group, and no block group descriptors are reserved, then
+ * the meta_bg feature is enabled and es->s_first_meta_bg is set to the
+ * first block group managed via meta_bg; s_first_meta_bg must be a
+ * multiple of EXT4_DESC_PER_BLOCK(sb).
+ * This function is called when the first group of a meta_bg is added, to
+ * bring in the group descriptor block of the newly added meta_bg.
  */
 static int add_new_gdb_meta_bg(struct super_block *sb,
                               handle_t *handle, ext4_group_t group) {
@@ -962,8 +947,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
        unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
        int err;
 
-       gdblock = ext4_meta_bg_first_block_no(sb, group) +
-                  ext4_bg_has_super(sb, group);
+       gdblock = ext4_group_first_block_no(sb, group) +
+                 ext4_bg_has_super(sb, group);
        gdb_bh = ext4_sb_bread(sb, gdblock, 0);
        if (IS_ERR(gdb_bh))
                return PTR_ERR(gdb_bh);
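
A worked example for the block number computed above (figures are an assumption for illustration: 4 KiB blocks and 32-byte descriptors give EXT4_DESC_PER_BLOCK(sb) == 128): meta_bg n covers groups 128n through 128n+127, the `group` passed in is that multiple-of-128 first group, and

    gdblock = ext4_group_first_block_no(sb, group) +
              ext4_bg_has_super(sb, group);

lands the meta_bg's descriptor block immediately after the backup superblock when that group carries one, or at the group's first block when it does not.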
@@ -1087,9 +1072,6 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
        for (i = 0; i < reserved_gdb; i++) {
                int err2;
                data = (__le32 *)primary[i]->b_data;
-               /* printk("reserving backup %lu[%u] = %lu\n",
-                      primary[i]->b_blocknr, gdbackups,
-                      blk + primary[i]->b_blocknr); */
                data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
                err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
                if (!err)
@@ -1191,8 +1173,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
                           ext4_group_first_block_no(sb, group));
                BUFFER_TRACE(bh, "get_write_access");
                if ((err = ext4_journal_get_write_access(handle, sb, bh,
-                                                        EXT4_JTR_NONE)))
+                                                        EXT4_JTR_NONE))) {
+                       brelse(bh);
                        break;
+               }
                lock_buffer(bh);
                memcpy(bh->b_data, data, size);
                if (rest)
@@ -1601,7 +1585,8 @@ exit_journal:
                int gdb_num_end = ((group + flex_gd->count - 1) /
                                   EXT4_DESC_PER_BLOCK(sb));
                int meta_bg = ext4_has_feature_meta_bg(sb);
-               sector_t old_gdb = 0;
+               sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
+                                        ext4_group_first_block_no(sb, 0);
 
                update_backups(sb, ext4_group_first_block_no(sb, 0),
                               (char *)es, sizeof(struct ext4_super_block), 0);
@@ -1610,11 +1595,8 @@ exit_journal:
 
                        gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
                                                     gdb_num);
-                       if (old_gdb == gdb_bh->b_blocknr)
-                               continue;
-                       update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
-                                      gdb_bh->b_size, meta_bg);
-                       old_gdb = gdb_bh->b_blocknr;
+                       update_backups(sb, gdb_bh->b_blocknr - padding_blocks,
+                                      gdb_bh->b_data, gdb_bh->b_size, meta_bg);
                }
        }
 exit:
@@ -1980,9 +1962,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
 
 errout:
        ret = ext4_journal_stop(handle);
-       if (!err)
-               err = ret;
-       return ret;
+       return err ? err : ret;
 
 invalid_resize_inode:
        ext4_error(sb, "corrupted/inconsistent resize inode");
index 42a44990d99c7e96a104dbebfe7cee578129add8..77e2b694c7d5d14a1451795ad2ae5677884b7323 100644 (file)
@@ -768,7 +768,8 @@ static void update_super_work(struct work_struct *work)
         */
        if (!sb_rdonly(sbi->s_sb) && journal) {
                struct buffer_head *sbh = sbi->s_sbh;
-               bool call_notify_err;
+               bool call_notify_err = false;
+
                handle = jbd2_journal_start(journal, 1);
                if (IS_ERR(handle))
                        goto write_directly;
@@ -6444,6 +6445,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
        struct ext4_mount_options old_opts;
        ext4_group_t g;
        int err = 0;
+       int alloc_ctx;
 #ifdef CONFIG_QUOTA
        int enable_quota = 0;
        int i, j;
@@ -6484,7 +6486,16 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
 
        }
 
+       /*
+        * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
+        * two calls to ext4_should_dioread_nolock() to return inconsistent
+        * values, triggering a WARN_ON in ext4_add_complete_io(). Grab
+        * s_writepages_rwsem here to avoid a race between writepages
+        * operations and remount.
+        */
+       alloc_ctx = ext4_writepages_down_write(sb);
        ext4_apply_options(fc, sb);
+       ext4_writepages_up_write(sb, alloc_ctx);
 
        if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
            test_opt(sb, JOURNAL_CHECKSUM)) {
@@ -6702,6 +6713,8 @@ restore_opts:
        if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) &&
            sb_any_quota_suspended(sb))
                dquot_resume(sb, -1);
+
+       alloc_ctx = ext4_writepages_down_write(sb);
        sb->s_flags = old_sb_flags;
        sbi->s_mount_opt = old_opts.s_mount_opt;
        sbi->s_mount_opt2 = old_opts.s_mount_opt2;
@@ -6710,6 +6723,8 @@ restore_opts:
        sbi->s_commit_interval = old_opts.s_commit_interval;
        sbi->s_min_batch_time = old_opts.s_min_batch_time;
        sbi->s_max_batch_time = old_opts.s_max_batch_time;
+       ext4_writepages_up_write(sb, alloc_ctx);
+
        if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
                ext4_release_system_zone(sb);
 #ifdef CONFIG_QUOTA
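
Both the apply path and the restore_opts path now bracket option mutation with the writepages semaphore. Reduced to a sketch (illustrative):

    int alloc_ctx;

    alloc_ctx = ext4_writepages_down_write(sb);
    /* mutate sb->s_flags / sbi->s_mount_opt* out of sight of writeback */
    ext4_writepages_up_write(sb, alloc_ctx);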
index c269a7d29a46533ca107492d067657debaf7506c..01f744cb97a401d0d13305983f5ea5617921c599 100644 (file)
@@ -289,6 +289,8 @@ int jbd2_journal_recover(journal_t *journal)
        journal_superblock_t *  sb;
 
        struct recovery_info    info;
+       errseq_t                wb_err;
+       struct address_space    *mapping;
 
        memset(&info, 0, sizeof(info));
        sb = journal->j_superblock;
@@ -306,6 +308,9 @@ int jbd2_journal_recover(journal_t *journal)
                return 0;
        }
 
+       wb_err = 0;
+       mapping = journal->j_fs_dev->bd_inode->i_mapping;
+       errseq_check_and_advance(&mapping->wb_err, &wb_err);
        err = do_one_pass(journal, &info, PASS_SCAN);
        if (!err)
                err = do_one_pass(journal, &info, PASS_REVOKE);
@@ -327,6 +332,9 @@ int jbd2_journal_recover(journal_t *journal)
 
        jbd2_journal_clear_revoke(journal);
        err2 = sync_blockdev(journal->j_fs_dev);
+       if (!err)
+               err = err2;
+       err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err);
        if (!err)
                err = err2;
        /* Make sure all replayed data is on permanent storage */
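
The recovery path now brackets the replay with the errseq machinery, so an I/O error on the fs device raised during recovery is reported here rather than being consumed elsewhere and lost. The idiom in isolation (sketch):

    errseq_t since = 0;

    /* record the current error baseline before issuing any writes */
    errseq_check_and_advance(&mapping->wb_err, &since);
    /* ... replay journal blocks, sync_blockdev() ... */
    /* pick up exactly the errors raised since the baseline */
    err = errseq_check_and_advance(&mapping->wb_err, &since);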
@@ -632,7 +640,7 @@ static int do_one_pass(journal_t *journal,
                                        success = err;
                                        printk(KERN_ERR
                                                "JBD2: IO error %d recovering "
-                                               "block %ld in log\n",
+                                               "block %lu in log\n",
                                                err, io_block);
                                } else {
                                        unsigned long long blocknr;
@@ -661,7 +669,8 @@ static int do_one_pass(journal_t *journal,
                                                printk(KERN_ERR "JBD2: Invalid "
                                                       "checksum recovering "
                                                       "data block %llu in "
-                                                      "log\n", blocknr);
+                                                      "journal block %lu\n",
+                                                      blocknr, io_block);
                                                block_error = 1;
                                                goto skip_write;
                                        }