mm: drop the assumption that VM_SHARED always implies writable
authorLorenzo Stoakes <lstoakes@gmail.com>
Thu, 12 Oct 2023 17:04:28 +0000 (18:04 +0100)
committerAndrew Morton <akpm@linux-foundation.org>
Wed, 18 Oct 2023 21:34:19 +0000 (14:34 -0700)
Patch series "permit write-sealed memfd read-only shared mappings", v4.

The man page for fcntl() describing memfd file seals states the following
about F_SEAL_WRITE:-

    Furthermore, trying to create new shared, writable memory-mappings via
    mmap(2) will also fail with EPERM.

With emphasis on 'writable'.  It turns out in fact that currently the
kernel simply disallows all new shared memory mappings for a memfd with
F_SEAL_WRITE applied, rendering this documentation inaccurate.

This matters because users are therefore unable to obtain a shared mapping
to a memfd after write sealing altogether, which limits their usefulness.
This was reported in the discussion thread [1] originating from a bug
report [2].

This is a product of both using the struct address_space->i_mmap_writable
atomic counter to determine whether writing may be permitted, and the
kernel adjusting this counter when any VM_SHARED mapping is performed and
more generally implicitly assuming VM_SHARED implies writable.

It seems sensible that we should only update this mapping if VM_MAYWRITE
is specified, i.e.  whether it is possible that this mapping could at any
point be written to.

If we do so then all we need to do to permit write seals to function as
documented is to clear VM_MAYWRITE when mapping read-only.  It turns out
this functionality already exists for F_SEAL_FUTURE_WRITE - we can
therefore simply adapt this logic to do the same for F_SEAL_WRITE.

We then hit a chicken and egg situation in mmap_region() where the check
for VM_MAYWRITE occurs before we are able to clear this flag.  To work
around this, perform this check after we invoke call_mmap(), with careful
consideration of error paths.

Thanks to Andy Lutomirski for the suggestion!

[1]: https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/
[2]: https://bugzilla.kernel.org/show_bug.cgi?id=217238

This patch (of 3):

There is a general assumption that VMAs with the VM_SHARED flag set are
writable.  If the VM_MAYWRITE flag is not set, then this is simply not the
case.

Update those checks which affect the struct address_space->i_mmap_writable
field to explicitly test for this by introducing
[vma_]is_shared_maywrite() helper functions.

This remains entirely conservative, as the lack of VM_MAYWRITE guarantees
that the VMA cannot be written to.

Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/fs.h
include/linux/mm.h
kernel/fork.c
mm/filemap.c
mm/madvise.c
mm/mmap.c

index fd539c9fef8ea9021321a7dd3690fcfd0bc5006b..5265186da0e2022d7c355e4d12ced667e40f638c 100644 (file)
@@ -454,7 +454,7 @@ extern const struct address_space_operations empty_aops;
  *   It is also used to block modification of page cache contents through
  *   memory mappings.
  * @gfp_mask: Memory allocation flags to use for allocating pages.
- * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
  * @nr_thps: Number of THPs in the pagecache (non-shmem only).
  * @i_mmap: Tree of private and shared mappings.
  * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
@@ -557,7 +557,7 @@ static inline int mapping_mapped(struct address_space *mapping)
 
 /*
  * Might pages of this file have been modified in userspace?
- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
+ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
  * marks vma as VM_SHARED if it is shared, and the file was opened for
  * writing i.e. vma may be mprotected writable even if now readonly.
  *
index 1bddb151cd5c3866db50c4fede0382ca31a9f3b8..c3b5749ede9d4e13d32b43838d0b18326328c5fb 100644 (file)
@@ -937,6 +937,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
        return vma->vm_flags & VM_ACCESS_FLAGS;
 }
 
+static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+{
+       return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+               (VM_SHARED | VM_MAYWRITE);
+}
+
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+       return is_shared_maywrite(vma->vm_flags);
+}
+
 static inline
 struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
 {
index e45a4457ba8349cacce46f9408e84d4dcb537228..1e6c656e08577ded42155e907d177b39cc00dc26 100644 (file)
@@ -733,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 
                        get_file(file);
                        i_mmap_lock_write(mapping);
-                       if (tmp->vm_flags & VM_SHARED)
+                       if (vma_is_shared_maywrite(tmp))
                                mapping_allow_writable(mapping);
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
index 9ef49255f1a5f522834faa3c79acb5c31acf99a9..9710f43a89acd3ade2c8289b6e9352fcf11cf9e5 100644 (file)
@@ -3618,7 +3618,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
  */
 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
 {
-       if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+       if (vma_is_shared_maywrite(vma))
                return -EINVAL;
        return generic_file_mmap(file, vma);
 }
index a6b48d4796bacaa4d338cff078d4b2d7dd859e5c..cf4d694280e98ae4d65ccd02d2bf1d6c4393a998 100644 (file)
@@ -970,7 +970,7 @@ static long madvise_remove(struct vm_area_struct *vma,
                        return -EINVAL;
        }
 
-       if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
+       if (!vma_is_shared_maywrite(vma))
                return -EACCES;
 
        offset = (loff_t)(start - vma->vm_start)
index 3ea52451623bec464b5307ea09ffc08287e38ac0..0041e3631f6cccaf49d7072191b378aa4f4e85c3 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -107,7 +107,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                struct file *file, struct address_space *mapping)
 {
-       if (vma->vm_flags & VM_SHARED)
+       if (vma_is_shared_maywrite(vma))
                mapping_unmap_writable(mapping);
 
        flush_dcache_mmap_lock(mapping);
@@ -384,7 +384,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
 static void __vma_link_file(struct vm_area_struct *vma,
                            struct address_space *mapping)
 {
-       if (vma->vm_flags & VM_SHARED)
+       if (vma_is_shared_maywrite(vma))
                mapping_allow_writable(mapping);
 
        flush_dcache_mmap_lock(mapping);
@@ -2846,7 +2846,7 @@ cannot_expand:
        vma->vm_pgoff = pgoff;
 
        if (file) {
-               if (vm_flags & VM_SHARED) {
+               if (is_shared_maywrite(vm_flags)) {
                        error = mapping_map_writable(file->f_mapping);
                        if (error)
                                goto free_vma;
@@ -2920,7 +2920,7 @@ cannot_expand:
        mm->map_count++;
        if (vma->vm_file) {
                i_mmap_lock_write(vma->vm_file->f_mapping);
-               if (vma->vm_flags & VM_SHARED)
+               if (vma_is_shared_maywrite(vma))
                        mapping_allow_writable(vma->vm_file->f_mapping);
 
                flush_dcache_mmap_lock(vma->vm_file->f_mapping);
@@ -2937,7 +2937,7 @@ cannot_expand:
 
        /* Once vma denies write, undo our temporary denial count */
 unmap_writable:
-       if (file && vm_flags & VM_SHARED)
+       if (file && is_shared_maywrite(vm_flags))
                mapping_unmap_writable(file->f_mapping);
        file = vma->vm_file;
        ksm_add_vma(vma);
@@ -2985,7 +2985,7 @@ unmap_and_free_vma:
                unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
                             vma->vm_end, vma->vm_end, true);
        }
-       if (file && (vm_flags & VM_SHARED))
+       if (file && is_shared_maywrite(vm_flags))
                mapping_unmap_writable(file->f_mapping);
 free_vma:
        vm_area_free(vma);