ext4: Convert to use mapping->invalidate_lock
author: Jan Kara <jack@suse.cz>
Thu, 4 Feb 2021 17:05:42 +0000 (18:05 +0100)
committer: Jan Kara <jack@suse.cz>
Tue, 13 Jul 2021 12:29:00 +0000 (14:29 +0200)
Convert ext4 to use mapping->invalidate_lock instead of its private
EXT4_I(inode)->i_mmap_sem. This is mostly search-and-replace. This
conversion also fixes a long-standing race between hole punching and the
read(2) / readahead(2) paths that can lead to stale page cache contents.

CC: <linux-ext4@vger.kernel.org>
CC: Ted Tso <tytso@mit.edu>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/super.c
fs/ext4/truncate.h

index 3c51e243450db76494cc6f9f4dd8f713c27fc6f4..7ebaf66b6e313e2839266d9b436ef7a2a42047b8 100644 (file)
@@ -1086,15 +1086,6 @@ struct ext4_inode_info {
         * by other means, so we have i_data_sem.
         */
        struct rw_semaphore i_data_sem;
-       /*
-        * i_mmap_sem is for serializing page faults with truncate / punch hole
-        * operations. We have to make sure that new page cannot be faulted in
-        * a section of the inode that is being punched. We cannot easily use
-        * i_data_sem for this since we need protection for the whole punch
-        * operation and i_data_sem ranks below transaction start so we have
-        * to occasionally drop it.
-        */
-       struct rw_semaphore i_mmap_sem;
        struct inode vfs_inode;
        struct jbd2_inode *jinode;
 
@@ -2972,7 +2963,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
 extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
-extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
 extern void ext4_da_release_space(struct inode *inode, int to_free);
index 92ad64b89d9b52e14a7d1295d5c69a59773953de..c33e0a2cb6c389558fe391ae654c81946e677ae5 100644 (file)
@@ -4474,6 +4474,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
                            loff_t len, int mode)
 {
        struct inode *inode = file_inode(file);
+       struct address_space *mapping = file->f_mapping;
        handle_t *handle = NULL;
        unsigned int max_blocks;
        loff_t new_size = 0;
@@ -4560,17 +4561,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
                 * Prevent page faults from reinstantiating pages we have
                 * released from page cache.
                 */
-               down_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock(mapping);
 
                ret = ext4_break_layouts(inode);
                if (ret) {
-                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(mapping);
                        goto out_mutex;
                }
 
                ret = ext4_update_disksize_before_punch(inode, offset, len);
                if (ret) {
-                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(mapping);
                        goto out_mutex;
                }
                /* Now release the pages and zero block aligned part of pages */
@@ -4579,7 +4580,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 
                ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                             flags);
-               up_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(mapping);
                if (ret)
                        goto out_mutex;
        }
@@ -5221,6 +5222,7 @@ out:
 static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 {
        struct super_block *sb = inode->i_sb;
+       struct address_space *mapping = inode->i_mapping;
        ext4_lblk_t punch_start, punch_stop;
        handle_t *handle;
        unsigned int credits;
@@ -5274,7 +5276,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
         * Prevent page faults from reinstantiating pages we have released from
         * page cache.
         */
-       down_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(mapping);
 
        ret = ext4_break_layouts(inode);
        if (ret)
@@ -5289,15 +5291,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
         * Write tail of the last page before removed range since it will get
         * removed from the page cache below.
         */
-       ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+       ret = filemap_write_and_wait_range(mapping, ioffset, offset);
        if (ret)
                goto out_mmap;
        /*
         * Write data that will be shifted to preserve them when discarding
         * page cache below. We are also protected from pages becoming dirty
-        * by i_mmap_sem.
+        * by i_rwsem and invalidate_lock.
         */
-       ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+       ret = filemap_write_and_wait_range(mapping, offset + len,
                                           LLONG_MAX);
        if (ret)
                goto out_mmap;
@@ -5350,7 +5352,7 @@ out_stop:
        ext4_journal_stop(handle);
        ext4_fc_stop_ineligible(sb);
 out_mmap:
-       up_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
 out_mutex:
        inode_unlock(inode);
        return ret;
@@ -5367,6 +5369,7 @@ out_mutex:
 static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 {
        struct super_block *sb = inode->i_sb;
+       struct address_space *mapping = inode->i_mapping;
        handle_t *handle;
        struct ext4_ext_path *path;
        struct ext4_extent *extent;
@@ -5425,7 +5428,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
         * Prevent page faults from reinstantiating pages we have released from
         * page cache.
         */
-       down_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(mapping);
 
        ret = ext4_break_layouts(inode);
        if (ret)
@@ -5526,7 +5529,7 @@ out_stop:
        ext4_journal_stop(handle);
        ext4_fc_stop_ineligible(sb);
 out_mmap:
-       up_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
 out_mutex:
        inode_unlock(inode);
        return ret;
index 816dedcbd541e025432ee5606bea242512397169..d3b4ed91aa6828b60ec351615148a157417955aa 100644 (file)
@@ -704,22 +704,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
         */
        bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
                (vmf->vma->vm_flags & VM_SHARED);
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        pfn_t pfn;
 
        if (write) {
                sb_start_pagefault(sb);
                file_update_time(vmf->vma->vm_file);
-               down_read(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock_shared(mapping);
 retry:
                handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                               EXT4_DATA_TRANS_BLOCKS(sb));
                if (IS_ERR(handle)) {
-                       up_read(&EXT4_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock_shared(mapping);
                        sb_end_pagefault(sb);
                        return VM_FAULT_SIGBUS;
                }
        } else {
-               down_read(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock_shared(mapping);
        }
        result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
        if (write) {
@@ -731,10 +732,10 @@ retry:
                /* Handling synchronous page fault? */
                if (result & VM_FAULT_NEEDDSYNC)
                        result = dax_finish_sync_fault(vmf, pe_size, pfn);
-               up_read(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock_shared(mapping);
                sb_end_pagefault(sb);
        } else {
-               up_read(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock_shared(mapping);
        }
 
        return result;
@@ -756,7 +757,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
 #endif
 
 static const struct vm_operations_struct ext4_file_vm_ops = {
-       .fault          = ext4_filemap_fault,
+       .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
 };
index d8de607849df3e20a8eae11f53dd019fbcb63185..325c038e7b232b3babf20171477f88d5dd7da420 100644 (file)
@@ -3950,20 +3950,19 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
        return ret;
 }
 
-static void ext4_wait_dax_page(struct ext4_inode_info *ei)
+static void ext4_wait_dax_page(struct inode *inode)
 {
-       up_write(&ei->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
        schedule();
-       down_write(&ei->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
 }
 
 int ext4_break_layouts(struct inode *inode)
 {
-       struct ext4_inode_info *ei = EXT4_I(inode);
        struct page *page;
        int error;
 
-       if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
+       if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
                return -EINVAL;
 
        do {
@@ -3974,7 +3973,7 @@ int ext4_break_layouts(struct inode *inode)
                error = ___wait_var_event(&page->_refcount,
                                atomic_read(&page->_refcount) == 1,
                                TASK_INTERRUPTIBLE, 0, 0,
-                               ext4_wait_dax_page(ei));
+                               ext4_wait_dax_page(inode));
        } while (error == 0);
 
        return error;
@@ -4005,9 +4004,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 
        ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
        if (ext4_has_inline_data(inode)) {
-               down_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock(mapping);
                ret = ext4_convert_inline_data(inode);
-               up_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(mapping);
                if (ret)
                        return ret;
        }
@@ -4058,7 +4057,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
         * Prevent page faults from reinstantiating pages we have released from
         * page cache.
         */
-       down_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(mapping);
 
        ret = ext4_break_layouts(inode);
        if (ret)
@@ -4131,7 +4130,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 out_stop:
        ext4_journal_stop(handle);
 out_dio:
-       up_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
 out_mutex:
        inode_unlock(inode);
        return ret;
@@ -5426,11 +5425,11 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
                        inode_dio_wait(inode);
                }
 
-               down_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock(inode->i_mapping);
 
                rc = ext4_break_layouts(inode);
                if (rc) {
-                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(inode->i_mapping);
                        goto err_out;
                }
 
@@ -5506,7 +5505,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
                                error = rc;
                }
 out_mmap_sem:
-               up_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(inode->i_mapping);
        }
 
        if (!error) {
@@ -5983,10 +5982,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         * data (and journalled aops don't know how to handle these cases).
         */
        if (val) {
-               down_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_lock(inode->i_mapping);
                err = filemap_write_and_wait(inode->i_mapping);
                if (err < 0) {
-                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       filemap_invalidate_unlock(inode->i_mapping);
                        return err;
                }
        }
@@ -6019,7 +6018,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
        percpu_up_write(&sbi->s_writepages_rwsem);
 
        if (val)
-               up_write(&EXT4_I(inode)->i_mmap_sem);
+               filemap_invalidate_unlock(inode->i_mapping);
 
        /* Finally we can mark the inode as dirty. */
 
@@ -6063,7 +6062,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
        sb_start_pagefault(inode->i_sb);
        file_update_time(vma->vm_file);
 
-       down_read(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock_shared(mapping);
 
        err = ext4_convert_inline_data(inode);
        if (err)
@@ -6176,7 +6175,7 @@ retry_alloc:
 out_ret:
        ret = block_page_mkwrite_return(err);
 out:
-       up_read(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock_shared(mapping);
        sb_end_pagefault(inode->i_sb);
        return ret;
 out_error:
@@ -6184,15 +6183,3 @@ out_error:
        ext4_journal_stop(handle);
        goto out;
 }
-
-vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
-{
-       struct inode *inode = file_inode(vmf->vma->vm_file);
-       vm_fault_t ret;
-
-       down_read(&EXT4_I(inode)->i_mmap_sem);
-       ret = filemap_fault(vmf);
-       up_read(&EXT4_I(inode)->i_mmap_sem);
-
-       return ret;
-}
index 6eed6170aded6739dee6f72fb2fdf0b5d1bb5395..4fb5fe083c2bcb75d5cd0c3a8ecd53b314a97729 100644 (file)
@@ -148,7 +148,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
                goto journal_err_out;
        }
 
-       down_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_lock(inode->i_mapping);
        err = filemap_write_and_wait(inode->i_mapping);
        if (err)
                goto err_out;
@@ -256,7 +256,7 @@ err_out1:
        ext4_double_up_write_data_sem(inode, inode_bl);
 
 err_out:
-       up_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(inode->i_mapping);
 journal_err_out:
        unlock_two_nondirectories(inode, inode_bl);
        iput(inode_bl);
index dfa09a277b56fb4a1add7b5c28d8e75030c65023..d6df62fc810c886de1dcfbe375d6b57a4a576ae2 100644 (file)
@@ -90,12 +90,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
 /*
  * Lock ordering
  *
- * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
- * i_mmap_rwsem (inode->i_mmap_rwsem)!
- *
  * page fault path:
- * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
- *   page lock -> i_data_sem (rw)
+ * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
+ *   -> page lock -> i_data_sem (rw)
  *
  * buffered write path:
  * sb_start_write -> i_mutex -> mmap_lock
@@ -103,8 +100,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
  *   i_data_sem (rw)
  *
  * truncate:
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
+ *   page lock
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
  *   i_data_sem (rw)
  *
  * direct IO:
@@ -1360,7 +1358,6 @@ static void init_once(void *foo)
        INIT_LIST_HEAD(&ei->i_orphan);
        init_rwsem(&ei->xattr_sem);
        init_rwsem(&ei->i_data_sem);
-       init_rwsem(&ei->i_mmap_sem);
        inode_init_once(&ei->vfs_inode);
        ext4_fc_init_inode(&ei->vfs_inode);
 }
index bcbe3668c1d4e1bec174014b7fde38ce987155d6..ce84aa2786c7e631fc48fecdafe9a9ccbb2be26e 100644 (file)
  */
 static inline void ext4_truncate_failed_write(struct inode *inode)
 {
+       struct address_space *mapping = inode->i_mapping;
+
        /*
         * We don't need to call ext4_break_layouts() because the blocks we
         * are truncating were never visible to userspace.
         */
-       down_write(&EXT4_I(inode)->i_mmap_sem);
-       truncate_inode_pages(inode->i_mapping, inode->i_size);
+       filemap_invalidate_lock(mapping);
+       truncate_inode_pages(mapping, inode->i_size);
        ext4_truncate(inode);
-       up_write(&EXT4_I(inode)->i_mmap_sem);
+       filemap_invalidate_unlock(mapping);
 }
 
 /*