Merge tag 'x86_shstk_for_6.6-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...

[sfrench/cifs-2.6.git] / include / linux / mm.h
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 8c0350c1134a90d4eaf4a284669f661bde4542b0..bf5d0b1b16f4341e8851d9bdfef30df631f62226 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -551,13 +551,6 @@ struct vm_fault {
                                          */
  };
  
-/* page entry size for vm->huge_fault() */
-enum page_entry_size {
-       PE_SIZE_PTE = 0,
-       PE_SIZE_PMD,
-       PE_SIZE_PUD,
-};
-
  /*
   * These are the virtual MM functions - opening of an area, closing and
   * unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -581,8 +574,7 @@ struct vm_operations_struct {
         int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
                         unsigned long end, unsigned long newflags);
         vm_fault_t (*fault)(struct vm_fault *vmf);
-       vm_fault_t (*huge_fault)(struct vm_fault *vmf,
-                       enum page_entry_size pe_size);
+       vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
         vm_fault_t (*map_pages)(struct vm_fault *vmf,
                         pgoff_t start_pgoff, pgoff_t end_pgoff);
         unsigned long (*pagesize)(struct vm_area_struct * area);
@@ -660,8 +652,14 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
   */
  static inline bool vma_start_read(struct vm_area_struct *vma)
  {
-       /* Check before locking. A race might cause false locked result. */
-       if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+       /*
+        * Check before locking. A race might cause false locked result.
+        * We can use READ_ONCE() for the mm_lock_seq here, and don't need
+        * ACQUIRE semantics, because this is just a lockless check whose result
+        * we don't rely on for anything - the mm_lock_seq read against which we
+        * need ordering is below.
+        */
+       if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
                 return false;
  
         if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
@@ -672,8 +670,13 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
          * False unlocked result is impossible because we modify and check
          * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
          * modification invalidates all existing locks.
+        *
+        * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
+        * racing with vma_end_write_all(), we only start reading from the VMA
+        * after it has been unlocked.
+        * This pairs with RELEASE semantics in vma_end_write_all().
          */
-       if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+       if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
                 up_read(&vma->vm_lock->lock);
                 return false;
         }
@@ -687,6 +690,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
         rcu_read_unlock();
  }
  
+/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
  static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
  {
         mmap_assert_write_locked(vma->vm_mm);
@@ -695,10 +699,15 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
          * current task is holding mmap_write_lock, both vma->vm_lock_seq and
          * mm->mm_lock_seq can't be concurrently modified.
          */
-       *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+       *mm_lock_seq = vma->vm_mm->mm_lock_seq;
         return (vma->vm_lock_seq == *mm_lock_seq);
  }
  
+/*
+ * Begin writing to a VMA.
+ * Exclude concurrent readers under the per-VMA lock until the currently
+ * write-locked mmap_lock is dropped or downgraded.
+ */
  static inline void vma_start_write(struct vm_area_struct *vma)
  {
         int mm_lock_seq;
@@ -707,30 +716,27 @@ static inline void vma_start_write(struct vm_area_struct *vma)
                 return;
  
         down_write(&vma->vm_lock->lock);
-       vma->vm_lock_seq = mm_lock_seq;
+       /*
+        * We should use WRITE_ONCE() here because we can have concurrent reads
+        * from the early lockless pessimistic check in vma_start_read().
+        * We don't really care about the correctness of that early check, but
+        * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+        */
+       WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
         up_write(&vma->vm_lock->lock);
  }
  
-static inline bool vma_try_start_write(struct vm_area_struct *vma)
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
  {
         int mm_lock_seq;
  
-       if (__is_vma_write_locked(vma, &mm_lock_seq))
-               return true;
-
-       if (!down_write_trylock(&vma->vm_lock->lock))
-               return false;
-
-       vma->vm_lock_seq = mm_lock_seq;
-       up_write(&vma->vm_lock->lock);
-       return true;
+       VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
  }
  
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+static inline void vma_assert_locked(struct vm_area_struct *vma)
  {
-       int mm_lock_seq;
-
-       VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+       if (!rwsem_is_locked(&vma->vm_lock->lock))
+               vma_assert_write_locked(vma);
  }
  
  static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
@@ -741,6 +747,22 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
         vma->detached = detached;
  }
  
+static inline void release_fault_lock(struct vm_fault *vmf)
+{
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+               vma_end_read(vmf->vma);
+       else
+               mmap_read_unlock(vmf->vma->vm_mm);
+}
+
+static inline void assert_fault_locked(struct vm_fault *vmf)
+{
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+               vma_assert_locked(vmf->vma);
+       else
+               mmap_assert_locked(vmf->vma->vm_mm);
+}
+
  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                           unsigned long address);
  
@@ -750,25 +772,40 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
                 { return false; }
  static inline void vma_end_read(struct vm_area_struct *vma) {}
  static inline void vma_start_write(struct vm_area_struct *vma) {}
-static inline bool vma_try_start_write(struct vm_area_struct *vma)
-               { return true; }
-static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+               { mmap_assert_write_locked(vma->vm_mm); }
  static inline void vma_mark_detached(struct vm_area_struct *vma,
                                      bool detached) {}
  
+static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+               unsigned long address)
+{
+       return NULL;
+}
+
+static inline void release_fault_lock(struct vm_fault *vmf)
+{
+       mmap_read_unlock(vmf->vma->vm_mm);
+}
+
+static inline void assert_fault_locked(struct vm_fault *vmf)
+{
+       mmap_assert_locked(vmf->vma->vm_mm);
+}
+
  #endif /* CONFIG_PER_VMA_LOCK */
  
+extern const struct vm_operations_struct vma_dummy_vm_ops;
+
  /*
   * WARNING: vma_init does not initialize vma->vm_lock.
   * Use vm_area_alloc()/vm_area_free() if vma needs locking.
   */
  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
  {
-       static const struct vm_operations_struct dummy_vm_ops = {};
-
         memset(vma, 0, sizeof(*vma));
         vma->vm_mm = mm;
-       vma->vm_ops = &dummy_vm_ops;
+       vma->vm_ops = &vma_dummy_vm_ops;
         INIT_LIST_HEAD(&vma->anon_vma_chain);
         vma_mark_detached(vma, false);
         vma_numab_state_init(vma);
@@ -781,18 +818,22 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
         ACCESS_PRIVATE(vma, __vm_flags) = flags;
  }
  
-/* Use when VMA is part of the VMA tree and modifications need coordination */
+/*
+ * Use when VMA is part of the VMA tree and modifications need coordination.
+ * Note: vm_flags_reset() and vm_flags_reset_once() do not lock the vma;
+ * it must be write-locked explicitly beforehand.
+ */
  static inline void vm_flags_reset(struct vm_area_struct *vma,
                                   vm_flags_t flags)
  {
-       vma_start_write(vma);
+       vma_assert_write_locked(vma);
         vm_flags_init(vma, flags);
  }
  
  static inline void vm_flags_reset_once(struct vm_area_struct *vma,
                                        vm_flags_t flags)
  {
-       vma_start_write(vma);
+       vma_assert_write_locked(vma);
         WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
  }
  
@@ -841,6 +882,31 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
         return !vma->vm_ops;
  }
  
+/*
+ * Indicate if the VMA is a heap for the given task; for
+ * /proc/PID/maps that is the heap of the main task.
+ */
+static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
+{
+       return vma->vm_start <= vma->vm_mm->brk &&
+               vma->vm_end >= vma->vm_mm->start_brk;
+}
+
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
+{
+       /*
+        * We make no effort to guess what a given thread considers to be
+        * its "stack".  It's not even well-defined for programs written
+        * in languages like Go.
+        */
+       return vma->vm_start <= vma->vm_mm->start_stack &&
+              vma->vm_end >= vma->vm_mm->start_stack;
+}
+
  static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
  {
         int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -978,7 +1044,7 @@ struct inode;
   * compound_order() can be called without holding a reference, which means
   * that niceties like page_folio() don't work.  These callers should be
   * prepared to handle wild return values.  For example, PG_head may be
- * set before _folio_order is initialised, or this may be a tail page.
+ * set before the order is initialised, or this may be a tail page.
   * See compaction.c for some good examples.
   */
  static inline unsigned int compound_order(struct page *page)
@@ -987,7 +1053,7 @@ static inline unsigned int compound_order(struct page *page)
  
         if (!test_bit(PG_head, &folio->flags))
                 return 0;
-       return folio->_folio_order;
+       return folio->_flags_1 & 0xff;
  }
  
  /**
@@ -1003,7 +1069,7 @@ static inline unsigned int folio_order(struct folio *folio)
  {
         if (!folio_test_large(folio))
                 return 0;
-       return folio->_folio_order;
+       return folio->_flags_1 & 0xff;
  }
  
  #include <linux/huge_mm.h>
@@ -1074,11 +1140,6 @@ unsigned long vmalloc_to_pfn(const void *addr);
   * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
   * is no special casing required.
   */
-
-#ifndef is_ioremap_addr
-#define is_ioremap_addr(x) is_vmalloc_addr(x)
-#endif
-
  #ifdef CONFIG_MMU
  extern bool is_vmalloc_addr(const void *x);
  extern int is_vmalloc_or_module_addr(const void *x);
@@ -1222,33 +1283,6 @@ void folio_copy(struct folio *dst, struct folio *src);
  
  unsigned long nr_free_buffer_pages(void);
  
-/*
- * Compound pages have a destructor function.  Provide a
- * prototype for that function and accessor functions.
- * These are _only_ valid on the head of a compound page.
- */
-typedef void compound_page_dtor(struct page *);
-
-/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */
-enum compound_dtor_id {
-       NULL_COMPOUND_DTOR,
-       COMPOUND_PAGE_DTOR,
-#ifdef CONFIG_HUGETLB_PAGE
-       HUGETLB_PAGE_DTOR,
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       TRANSHUGE_PAGE_DTOR,
-#endif
-       NR_COMPOUND_DTORS,
-};
-
-static inline void folio_set_compound_dtor(struct folio *folio,
-               enum compound_dtor_id compound_dtor)
-{
-       VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
-       folio->_folio_dtor = compound_dtor;
-}
-
  void destroy_large_folio(struct folio *folio);
  
  /* Returns the number of bytes in this potentially compound page. */
@@ -1284,8 +1318,6 @@ static inline unsigned long thp_size(struct page *page)
         return PAGE_SIZE << thp_order(page);
  }
  
-void free_compound_page(struct page *page);
-
  #ifdef CONFIG_MMU
  /*
   * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
@@ -1301,7 +1333,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  }
  
  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
+void set_pte_range(struct vm_fault *vmf, struct folio *folio,
+               struct page *page, unsigned int nr, unsigned long addr);
  
  vm_fault_t finish_fault(struct vm_fault *vmf);
  vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
@@ -2008,7 +2041,7 @@ static inline long folio_nr_pages(struct folio *folio)
  #ifdef CONFIG_64BIT
         return folio->_folio_nr_pages;
  #else
-       return 1L << folio->_folio_order;
+       return 1L << (folio->_flags_1 & 0xff);
  #endif
  }
  
@@ -2026,7 +2059,7 @@ static inline unsigned long compound_nr(struct page *page)
  #ifdef CONFIG_64BIT
         return folio->_folio_nr_pages;
  #else
-       return 1L << folio->_folio_order;
+       return 1L << (folio->_flags_1 & 0xff);
  #endif
  }
  
@@ -2172,7 +2205,6 @@ static inline void *folio_address(const struct folio *folio)
         return page_address(&folio->page);
  }
  
-extern void *page_rmapping(struct page *page);
  extern pgoff_t __page_file_index(struct page *page);
  
  /*
@@ -2239,18 +2271,6 @@ extern void pagefault_out_of_memory(void);
  #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1))
  #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
  
-/*
- * Flags passed to show_mem() and show_free_areas() to suppress output in
- * various contexts.
- */
-#define SHOW_MEM_FILTER_NODES          (0x0001u)       /* disallowed nodes */
-
-extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
-static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask)
-{
-       __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
-}
-
  /*
   * Parameter block passed down to zap_pte_range in exceptional cases.
   */
@@ -2319,9 +2339,9 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
         zap_page_range_single(vma, vma->vm_start,
                               vma->vm_end - vma->vm_start, NULL);
  }
-void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                 struct vm_area_struct *start_vma, unsigned long start,
-               unsigned long end, bool mm_wr_locked);
+               unsigned long end, unsigned long tree_end, bool mm_wr_locked);
  
  struct mmu_notifier_range;
  
@@ -2768,42 +2788,93 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
  }
  #endif /* CONFIG_MMU */
  
+static inline struct ptdesc *virt_to_ptdesc(const void *x)
+{
+       return page_ptdesc(virt_to_page(x));
+}
+
+static inline void *ptdesc_to_virt(const struct ptdesc *pt)
+{
+       return page_to_virt(ptdesc_page(pt));
+}
+
+static inline void *ptdesc_address(const struct ptdesc *pt)
+{
+       return folio_address(ptdesc_folio(pt));
+}
+
+static inline bool pagetable_is_reserved(struct ptdesc *pt)
+{
+       return folio_test_reserved(ptdesc_folio(pt));
+}
+
+/**
+ * pagetable_alloc - Allocate pagetables
+ * @gfp:    GFP flags
+ * @order:  desired pagetable order
+ *
+ * pagetable_alloc allocates memory for page tables as well as a page table
+ * descriptor to describe that memory.
+ *
+ * Return: The ptdesc describing the allocated page tables.
+ */
+static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+{
+       struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+
+       return page_ptdesc(page);
+}
+
+/**
+ * pagetable_free - Free pagetables
+ * @pt:        The page table descriptor
+ *
+ * pagetable_free frees the memory of all page tables described by a page
+ * table descriptor and the memory for the descriptor itself.
+ */
+static inline void pagetable_free(struct ptdesc *pt)
+{
+       struct page *page = ptdesc_page(pt);
+
+       __free_pages(page, compound_order(page));
+}
+
  #if USE_SPLIT_PTE_PTLOCKS
  #if ALLOC_SPLIT_PTLOCKS
  void __init ptlock_cache_init(void);
-extern bool ptlock_alloc(struct page *page);
-extern void ptlock_free(struct page *page);
+bool ptlock_alloc(struct ptdesc *ptdesc);
+void ptlock_free(struct ptdesc *ptdesc);
  
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
  {
-       return page->ptl;
+       return ptdesc->ptl;
  }
  #else /* ALLOC_SPLIT_PTLOCKS */
  static inline void ptlock_cache_init(void)
  {
  }
  
-static inline bool ptlock_alloc(struct page *page)
+static inline bool ptlock_alloc(struct ptdesc *ptdesc)
  {
         return true;
  }
  
-static inline void ptlock_free(struct page *page)
+static inline void ptlock_free(struct ptdesc *ptdesc)
  {
  }
  
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
  {
-       return &page->ptl;
+       return &ptdesc->ptl;
  }
  #endif /* ALLOC_SPLIT_PTLOCKS */
  
  static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
  {
-       return ptlock_ptr(pmd_page(*pmd));
+       return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
  }
  
-static inline bool ptlock_init(struct page *page)
+static inline bool ptlock_init(struct ptdesc *ptdesc)
  {
         /*
          * prep_new_page() initialize page->private (and therefore page->ptl)
@@ -2812,10 +2883,10 @@ static inline bool ptlock_init(struct page *page)
          * It can happen if arch try to use slab for page table allocation:
          * slab code uses page->slab_cache, which share storage with page->ptl.
          */
-       VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
-       if (!ptlock_alloc(page))
+       VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));
+       if (!ptlock_alloc(ptdesc))
                 return false;
-       spin_lock_init(ptlock_ptr(page));
+       spin_lock_init(ptlock_ptr(ptdesc));
         return true;
  }
  
@@ -2828,24 +2899,28 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
         return &mm->page_table_lock;
  }
  static inline void ptlock_cache_init(void) {}
-static inline bool ptlock_init(struct page *page) { return true; }
-static inline void ptlock_free(struct page *page) {}
+static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
+static inline void ptlock_free(struct ptdesc *ptdesc) {}
  #endif /* USE_SPLIT_PTE_PTLOCKS */
  
-static inline bool pgtable_pte_page_ctor(struct page *page)
+static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
  {
-       if (!ptlock_init(page))
+       struct folio *folio = ptdesc_folio(ptdesc);
+
+       if (!ptlock_init(ptdesc))
                 return false;
-       __SetPageTable(page);
-       inc_lruvec_page_state(page, NR_PAGETABLE);
+       __folio_set_pgtable(folio);
+       lruvec_stat_add_folio(folio, NR_PAGETABLE);
         return true;
  }
  
-static inline void pgtable_pte_page_dtor(struct page *page)
+static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
  {
-       ptlock_free(page);
-       __ClearPageTable(page);
-       dec_lruvec_page_state(page, NR_PAGETABLE);
+       struct folio *folio = ptdesc_folio(ptdesc);
+
+       ptlock_free(ptdesc);
+       __folio_clear_pgtable(folio);
+       lruvec_stat_sub_folio(folio, NR_PAGETABLE);
  }
  
  pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
@@ -2894,28 +2969,33 @@ static inline struct page *pmd_pgtable_page(pmd_t *pmd)
         return virt_to_page((void *)((unsigned long) pmd & mask));
  }
  
+static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
+{
+       return page_ptdesc(pmd_pgtable_page(pmd));
+}
+
  static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
  {
-       return ptlock_ptr(pmd_pgtable_page(pmd));
+       return ptlock_ptr(pmd_ptdesc(pmd));
  }
  
-static inline bool pmd_ptlock_init(struct page *page)
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
  {
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       page->pmd_huge_pte = NULL;
+       ptdesc->pmd_huge_pte = NULL;
  #endif
-       return ptlock_init(page);
+       return ptlock_init(ptdesc);
  }
  
-static inline void pmd_ptlock_free(struct page *page)
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
  {
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
+       VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
  #endif
-       ptlock_free(page);
+       ptlock_free(ptdesc);
  }
  
-#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte)
+#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
  
  #else
  
@@ -2924,8 +3004,8 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
         return &mm->page_table_lock;
  }
  
-static inline bool pmd_ptlock_init(struct page *page) { return true; }
-static inline void pmd_ptlock_free(struct page *page) {}
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {}
  
  #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
  
@@ -2938,20 +3018,24 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
         return ptl;
  }
  
-static inline bool pgtable_pmd_page_ctor(struct page *page)
+static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
  {
-       if (!pmd_ptlock_init(page))
+       struct folio *folio = ptdesc_folio(ptdesc);
+
+       if (!pmd_ptlock_init(ptdesc))
                 return false;
-       __SetPageTable(page);
-       inc_lruvec_page_state(page, NR_PAGETABLE);
+       __folio_set_pgtable(folio);
+       lruvec_stat_add_folio(folio, NR_PAGETABLE);
         return true;
  }
  
-static inline void pgtable_pmd_page_dtor(struct page *page)
+static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
  {
-       pmd_ptlock_free(page);
-       __ClearPageTable(page);
-       dec_lruvec_page_state(page, NR_PAGETABLE);
+       struct folio *folio = ptdesc_folio(ptdesc);
+
+       pmd_ptlock_free(ptdesc);
+       __folio_clear_pgtable(folio);
+       lruvec_stat_sub_folio(folio, NR_PAGETABLE);
  }
  
  /*
@@ -3006,6 +3090,11 @@ static inline void mark_page_reserved(struct page *page)
         adjust_managed_page_count(page, -1);
  }
  
+static inline void free_reserved_ptdesc(struct ptdesc *pt)
+{
+       free_reserved_page(ptdesc_page(pt));
+}
+
  /*
   * Default method to free all the __init memory into the buddy system.
   * The freed pages will be poisoned with pattern "poison" if it's within
@@ -3071,9 +3160,9 @@ extern void mem_init(void);
  extern void __init mmap_init(void);
  
  extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
-static inline void show_mem(unsigned int flags, nodemask_t *nodemask)
+static inline void show_mem(void)
  {
-       __show_mem(flags, nodemask, MAX_NR_ZONES - 1);
+       __show_mem(0, NULL, MAX_NR_ZONES - 1);
  }
  extern long si_mem_available(void);
  extern void si_meminfo(struct sysinfo * val);
@@ -3417,6 +3506,24 @@ static inline vm_fault_t vmf_error(int err)
         return VM_FAULT_SIGBUS;
  }
  
+/*
+ * Convert errno to return value for ->page_mkwrite() calls.
+ *
+ * This should eventually be merged with vmf_error() above, but will need a
+ * careful audit of all vmf_error() callers.
+ */
+static inline vm_fault_t vmf_fs_error(int err)
+{
+       if (err == 0)
+               return VM_FAULT_LOCKED;
+       if (err == -EFAULT || err == -EAGAIN)
+               return VM_FAULT_NOPAGE;
+       if (err == -ENOMEM)
+               return VM_FAULT_OOM;
+       /* -ENOSPC, -EDQUOT, -EIO ... */
+       return VM_FAULT_SIGBUS;
+}
+
  struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                          unsigned int foll_flags);
  
@@ -3435,15 +3542,24 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
   * Indicates whether GUP can follow a PROT_NONE mapped page, or whether
   * a (NUMA hinting) fault is required.
   */
-static inline bool gup_can_follow_protnone(unsigned int flags)
+static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
+                                          unsigned int flags)
  {
         /*
-        * FOLL_FORCE has to be able to make progress even if the VMA is
-        * inaccessible. Further, FOLL_FORCE access usually does not represent
-        * application behaviour and we should avoid triggering NUMA hinting
-        * faults.
+        * If callers don't want to honor NUMA hinting faults, no need to
+        * determine if we would actually have to trigger a NUMA hinting fault.
          */
-       return flags & FOLL_FORCE;
+       if (!(flags & FOLL_HONOR_NUMA_FAULT))
+               return true;
+
+       /*
+        * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs.
+        *
+        * Requiring a fault here even for inaccessible VMAs would mean that
+        * FOLL_FORCE cannot make any progress, because handle_mm_fault()
+        * refuses to process NUMA hinting faults in inaccessible VMAs.
+        */
+       return !vma_is_accessible(vma);
  }
  
  typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
@@ -3514,8 +3630,8 @@ static inline bool debug_pagealloc_enabled(void)
  }
  
  /*
- * For use in fast paths after init_debug_pagealloc() has run, or when a
- * false negative result is not harmful when called too early.
+ * For use in fast paths after mem_debugging_and_hardening_init() has run,
+ * or when a false negative result is not harmful when called too early.
   */
  static inline bool debug_pagealloc_enabled_static(void)
  {
@@ -3670,13 +3786,32 @@ void vmemmap_free(unsigned long start, unsigned long end,
                 struct vmem_altmap *altmap);
  #endif
  
-#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
-static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
-                                          struct dev_pagemap *pgmap)
+#define VMEMMAP_RESERVE_NR     2
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
+                                         struct dev_pagemap *pgmap)
  {
-       return is_power_of_2(sizeof(struct page)) &&
-               pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+       unsigned long nr_pages;
+       unsigned long nr_vmemmap_pages;
+
+       if (!pgmap || !is_power_of_2(sizeof(struct page)))
+               return false;
+
+       nr_pages = pgmap_vmemmap_nr(pgmap);
+       nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT);
+       /*
+        * For vmemmap optimization with DAX we need more than 2 vmemmap
+        * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
+        */
+       return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR);
  }
+/*
+ * If we don't have an architecture override, use the generic rule
+ */
+#ifndef vmemmap_can_optimize
+#define vmemmap_can_optimize __vmemmap_can_optimize
+#endif
+
  #else
  static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
                                            struct dev_pagemap *pgmap)