Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

[sfrench/cifs-2.6.git] / include / linux / mm.h
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 8bbcccbc55654341c4d2b361549e1da4da6d65cb..f3f196e4d66d6f42c74366731890a5d4102bf75b 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -74,6 +74,7 @@ static inline void totalram_pages_add(long count)
  
  extern void * high_memory;
  extern int page_cluster;
+extern const int page_cluster_max;
  
  #ifdef CONFIG_SYSCTL
  extern int sysctl_legacy_va_layout;
@@ -549,7 +550,7 @@ struct vm_operations_struct {
         /*
          * Called by mprotect() to make driver-specific permission
          * checks before mprotect() is finalised.   The VMA must not
-        * be modified.  Returns 0 if eprotect() can proceed.
+        * be modified.  Returns 0 if mprotect() can proceed.
          */
         int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
                         unsigned long end, unsigned long newflags);
@@ -699,8 +700,10 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
   * paths in userfault.
   */
  bool vma_is_shmem(struct vm_area_struct *vma);
+bool vma_is_anon_shmem(struct vm_area_struct *vma);
  #else
  static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
+static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; }
  #endif
  
  int vma_is_stack_for_current(struct vm_area_struct *vma);
@@ -817,8 +820,8 @@ static inline int is_vmalloc_or_module_addr(const void *x)
  /*
   * How many times the entire folio is mapped as a single unit (eg by a
   * PMD or PUD entry).  This is probably not what you want, except for
- * debugging purposes; look at folio_mapcount() or page_mapcount()
- * instead.
+ * debugging purposes - it does not include PTE-mapped sub-pages; look
+ * at folio_mapcount() or page_mapcount() or total_mapcount() instead.
   */
  static inline int folio_entire_mapcount(struct folio *folio)
  {
@@ -828,12 +831,29 @@ static inline int folio_entire_mapcount(struct folio *folio)
  
  /*
   * Mapcount of compound page as a whole, does not include mapped sub-pages.
- *
- * Must be called only for compound pages.
+ * Must be called only on head of compound page.
   */
-static inline int compound_mapcount(struct page *page)
+static inline int head_compound_mapcount(struct page *head)
  {
-       return folio_entire_mapcount(page_folio(page));
+       return atomic_read(compound_mapcount_ptr(head)) + 1;
+}
+
+/*
+ * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages,
+ * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit
+ * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
+ * leaves subpages_mapcount at 0, but avoid surprise if it participates later.
+ */
+#define COMPOUND_MAPPED        0x800000
+#define SUBPAGES_MAPPED        (COMPOUND_MAPPED - 1)
+
+/*
+ * Number of sub-pages mapped by PTE, does not include compound mapcount.
+ * Must be called only on head of compound page.
+ */
+static inline int head_subpages_mapcount(struct page *head)
+{
+       return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED;
  }
  
  /*
@@ -846,11 +866,9 @@ static inline void page_mapcount_reset(struct page *page)
         atomic_set(&(page)->_mapcount, -1);
  }
  
-int __page_mapcount(struct page *page);
-
  /*
   * Mapcount of 0-order page; when compound sub-page, includes
- * compound_mapcount().
+ * compound_mapcount of compound_head of page.
   *
   * Result is undefined for pages which cannot be mapped into userspace.
   * For example SLAB or special types of pages. See function page_has_type().
@@ -858,25 +876,75 @@ int __page_mapcount(struct page *page);
   */
  static inline int page_mapcount(struct page *page)
  {
-       if (unlikely(PageCompound(page)))
-               return __page_mapcount(page);
-       return atomic_read(&page->_mapcount) + 1;
+       int mapcount = atomic_read(&page->_mapcount) + 1;
+
+       if (likely(!PageCompound(page)))
+               return mapcount;
+       page = compound_head(page);
+       return head_compound_mapcount(page) + mapcount;
  }
  
-int folio_mapcount(struct folio *folio);
+int total_compound_mapcount(struct page *head);
  
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int total_mapcount(struct page *page)
+/**
+ * folio_mapcount() - Calculate the number of mappings of this folio.
+ * @folio: The folio.
+ *
+ * A large folio tracks both how many times the entire folio is mapped,
+ * and how many times each individual page in the folio is mapped.
+ * This function calculates the total number of times the folio is
+ * mapped.
+ *
+ * Return: The number of times this folio is mapped.
+ */
+static inline int folio_mapcount(struct folio *folio)
  {
-       return folio_mapcount(page_folio(page));
+       if (likely(!folio_test_large(folio)))
+               return atomic_read(&folio->_mapcount) + 1;
+       return total_compound_mapcount(&folio->page);
  }
  
-#else
  static inline int total_mapcount(struct page *page)
  {
-       return page_mapcount(page);
+       if (likely(!PageCompound(page)))
+               return atomic_read(&page->_mapcount) + 1;
+       return total_compound_mapcount(compound_head(page));
+}
+
+static inline bool folio_large_is_mapped(struct folio *folio)
+{
+       /*
+        * Reading folio_mapcount_ptr() below could be omitted if hugetlb
+        * participated in incrementing subpages_mapcount when compound mapped.
+        */
+       return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 ||
+               atomic_read(folio_mapcount_ptr(folio)) >= 0;
+}
+
+/**
+ * folio_mapped - Is this folio mapped into userspace?
+ * @folio: The folio.
+ *
+ * Return: True if any page in this folio is referenced by user page tables.
+ */
+static inline bool folio_mapped(struct folio *folio)
+{
+       if (likely(!folio_test_large(folio)))
+               return atomic_read(&folio->_mapcount) >= 0;
+       return folio_large_is_mapped(folio);
+}
+
+/*
+ * Return true if this page is mapped into pagetables.
+ * For compound page it returns true if any sub-page of compound page is mapped,
+ * even if this particular sub-page is not itself mapped by any PTE or PMD.
+ */
+static inline bool page_mapped(struct page *page)
+{
+       if (likely(!PageCompound(page)))
+               return atomic_read(&page->_mapcount) >= 0;
+       return folio_large_is_mapped(page_folio(page));
  }
-#endif
  
  static inline struct page *virt_to_head_page(const void *x)
  {
@@ -929,6 +997,13 @@ static inline void set_compound_page_dtor(struct page *page,
         page[1].compound_dtor = compound_dtor;
  }
  
+static inline void folio_set_compound_dtor(struct folio *folio,
+               enum compound_dtor_id compound_dtor)
+{
+       VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
+       folio->_folio_dtor = compound_dtor;
+}
+
  void destroy_large_folio(struct folio *folio);
  
  static inline int head_compound_pincount(struct page *head)
@@ -944,6 +1019,22 @@ static inline void set_compound_order(struct page *page, unsigned int order)
  #endif
  }
  
+/*
+ * folio_set_compound_order is generally passed a non-zero order to
+ * initialize a large folio.  However, hugetlb code abuses this by
+ * passing in zero when 'dissolving' a large folio.
+ */
+static inline void folio_set_compound_order(struct folio *folio,
+               unsigned int order)
+{
+       VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+
+       folio->_folio_order = order;
+#ifdef CONFIG_64BIT
+       folio->_folio_nr_pages = order ? 1U << order : 0;
+#endif
+}
+
  /* Returns the number of pages in this potentially compound page. */
  static inline unsigned long compound_nr(struct page *page)
  {
@@ -1129,7 +1220,7 @@ static inline void get_page(struct page *page)
         folio_get(page_folio(page));
  }
  
-bool __must_check try_grab_page(struct page *page, unsigned int flags);
+int __must_check try_grab_page(struct page *page, unsigned int flags);
  
  static inline __must_check bool try_get_page(struct page *page)
  {
@@ -1179,7 +1270,24 @@ static inline void folio_put_refs(struct folio *folio, int refs)
                 __folio_put(folio);
  }
  
-void release_pages(struct page **pages, int nr);
+/**
+ * release_pages - release an array of pages or folios
+ *
+ * This just releases a simple array of multiple pages, and
+ * accepts various different forms of said page array: either
+ * a regular old boring array of pages, an array of folios, or
+ * an array of encoded page pointers.
+ *
+ * The transparent union syntax for this kind of "any of these
+ * argument types" is all kinds of ugly, so look away.
+ */
+typedef union {
+       struct page **pages;
+       struct folio **folios;
+       struct encoded_page **encoded_pages;
+} release_pages_arg __attribute__ ((__transparent_union__));
+
+void release_pages(release_pages_arg, int nr);
  
  /**
   * folios_put - Decrement the reference count on an array of folios.
@@ -1195,7 +1303,7 @@ void release_pages(struct page **pages, int nr);
   */
  static inline void folios_put(struct folio **folios, unsigned int nr)
  {
-       release_pages((struct page **)folios, nr);
+       release_pages(folios, nr);
  }
  
  static inline void put_page(struct page *page)
@@ -1799,9 +1907,6 @@ static inline pgoff_t page_index(struct page *page)
         return page->index;
  }
  
-bool page_mapped(struct page *page);
-bool folio_mapped(struct folio *folio);
-
  /*
   * Return true only if the page has been allocated with
   * ALLOC_NO_WATERMARKS and the low watermark was not
@@ -1852,6 +1957,25 @@ static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodem
         __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
  }
  
+/*
+ * Parameter block passed down to zap_pte_range in exceptional cases.
+ */
+struct zap_details {
+       struct folio *single_folio;     /* Locked folio to be unmapped */
+       bool even_cows;                 /* Zap COWed private pages too? */
+       zap_flags_t zap_flags;          /* Extra flags for zapping */
+};
+
+/*
+ * Whether to drop the pte markers, for example, the uffd-wp information for
+ * file-backed memory.  This should only be specified when we will completely
+ * drop the page in the mm, either by truncation or unmapping of the vma.  By
+ * default, the flag is not set.
+ */
+#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
+/* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
+#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))
+
  #ifdef CONFIG_MMU
  extern bool can_do_mlock(void);
  #else
@@ -1869,6 +1993,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                   unsigned long size);
  void zap_page_range(struct vm_area_struct *vma, unsigned long address,
                     unsigned long size);
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+                          unsigned long size, struct zap_details *details);
  void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
                 struct vm_area_struct *start_vma, unsigned long start,
                 unsigned long end);
@@ -2004,6 +2130,22 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
  #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
                                             MM_CP_UFFD_WP_RESOLVE)
  
+int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
+static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
+{
+       /*
+        * We want to check manually if we can change individual PTEs writable
+        * if we can't do that automatically for all PTEs in a mapping. For
+        * private mappings, that's always the case when we have write
+        * permissions as we properly have to handle COW.
+        */
+       if (vma->vm_flags & VM_SHARED)
+               return vma_wants_writenotify(vma, vma->vm_page_prot);
+       return !!(vma->vm_flags & VM_WRITE);
+
+}
+bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
+                            pte_t pte);
  extern unsigned long change_protection(struct mmu_gather *tlb,
                               struct vm_area_struct *vma, unsigned long start,
                               unsigned long end, pgprot_t newprot,
@@ -2030,40 +2172,30 @@ static inline bool get_user_page_fast_only(unsigned long addr,
   */
  static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
  {
-       long val = atomic_long_read(&mm->rss_stat.count[member]);
-
-#ifdef SPLIT_RSS_COUNTING
-       /*
-        * counter is updated in asynchronous manner and may go to minus.
-        * But it's never be expected number for users.
-        */
-       if (val < 0)
-               val = 0;
-#endif
-       return (unsigned long)val;
+       return percpu_counter_read_positive(&mm->rss_stat[member]);
  }
  
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
+void mm_trace_rss_stat(struct mm_struct *mm, int member);
  
  static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
  {
-       long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+       percpu_counter_add(&mm->rss_stat[member], value);
  
-       mm_trace_rss_stat(mm, member, count);
+       mm_trace_rss_stat(mm, member);
  }
  
  static inline void inc_mm_counter(struct mm_struct *mm, int member)
  {
-       long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+       percpu_counter_inc(&mm->rss_stat[member]);
  
-       mm_trace_rss_stat(mm, member, count);
+       mm_trace_rss_stat(mm, member);
  }
  
  static inline void dec_mm_counter(struct mm_struct *mm, int member)
  {
-       long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+       percpu_counter_dec(&mm->rss_stat[member]);
  
-       mm_trace_rss_stat(mm, member, count);
+       mm_trace_rss_stat(mm, member);
  }
  
  /* Optimized variant when page is already known not to be PageAnon */
@@ -2153,8 +2285,6 @@ static inline int pte_devmap(pte_t pte)
  }
  #endif
  
-int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
-
  extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                                spinlock_t **ptl);
  static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -2403,7 +2533,7 @@ static inline void pgtable_pte_page_dtor(struct page *page)
  
  #if USE_SPLIT_PMD_PTLOCKS
  
-static struct page *pmd_to_page(pmd_t *pmd)
+static inline struct page *pmd_pgtable_page(pmd_t *pmd)
  {
         unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
         return virt_to_page((void *)((unsigned long) pmd & mask));
@@ -2411,7 +2541,7 @@ static struct page *pmd_to_page(pmd_t *pmd)
  
  static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
  {
-       return ptlock_ptr(pmd_to_page(pmd));
+       return ptlock_ptr(pmd_pgtable_page(pmd));
  }
  
  static inline bool pmd_ptlock_init(struct page *page)
@@ -2430,7 +2560,7 @@ static inline void pmd_ptlock_free(struct page *page)
         ptlock_free(page);
  }
  
-#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte)
+#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte)
  
  #else
  
@@ -2950,7 +3080,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                                  * and return without waiting upon it */
  #define FOLL_NOFAULT   0x80    /* do not fault in pages */
  #define FOLL_HWPOISON  0x100   /* check page is hwpoisoned */
-#define FOLL_MIGRATION 0x400   /* wait for page to replace migration entry */
  #define FOLL_TRIED     0x800   /* a retry, previous pass started an IO */
  #define FOLL_REMOTE    0x2000  /* we are working on non-current tsk/mm */
  #define FOLL_ANON      0x8000  /* don't do file mappings */
@@ -2958,6 +3087,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
  #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
  #define FOLL_PIN       0x40000 /* pages must be released via unpin_user_page */
  #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
+#define FOLL_PCI_P2PDMA        0x100000 /* allow returning PCI P2PDMA pages */
+#define FOLL_INTERRUPTIBLE  0x200000 /* allow interrupts from generic signals */
  
  /*
   * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
@@ -3042,8 +3173,12 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
   * Must be called with the (sub)page that's actually referenced via the
   * page table entry, which might not necessarily be the head page for a
   * PTE-mapped THP.
+ *
+ * If the vma is NULL, we're coming from the GUP-fast path and might have
+ * to fallback to the slow path just to lookup the vma.
   */
-static inline bool gup_must_unshare(unsigned int flags, struct page *page)
+static inline bool gup_must_unshare(struct vm_area_struct *vma,
+                                   unsigned int flags, struct page *page)
  {
         /*
          * FOLL_WRITE is implicitly handled correctly as the page table entry
@@ -3056,8 +3191,25 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page)
          * Note: PageAnon(page) is stable until the page is actually getting
          * freed.
          */
-       if (!PageAnon(page))
-               return false;
+       if (!PageAnon(page)) {
+               /*
+                * We only care about R/O long-term pining: R/O short-term
+                * pinning does not have the semantics to observe successive
+                * changes through the process page tables.
+                */
+               if (!(flags & FOLL_LONGTERM))
+                       return false;
+
+               /* We really need the vma ... */
+               if (!vma)
+                       return true;
+
+               /*
+                * ... because we only care about writable private ("COW")
+                * mappings where we have to break COW early.
+                */
+               return is_cow_mapping(vma->vm_flags);
+       }
  
         /* Paired with a memory barrier in page_try_share_anon_rmap(). */
         if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
@@ -3233,6 +3385,8 @@ void *sparse_buffer_alloc(unsigned long size);
  struct page * __populate_section_memmap(unsigned long pfn,
                 unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
                 struct dev_pagemap *pgmap);
+void pmd_init(void *addr);
+void pud_init(void *addr);
  pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
  p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
  pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
@@ -3244,8 +3398,14 @@ struct vmem_altmap;
  void *vmemmap_alloc_block_buf(unsigned long size, int node,
                               struct vmem_altmap *altmap);
  void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
+void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+                    unsigned long addr, unsigned long next);
+int vmemmap_check_pmd(pmd_t *pmd, int node,
+                     unsigned long addr, unsigned long next);
  int vmemmap_populate_basepages(unsigned long start, unsigned long end,
                                int node, struct vmem_altmap *altmap);
+int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
+                              int node, struct vmem_altmap *altmap);
  int vmemmap_populate(unsigned long start, unsigned long end, int node,
                 struct vmem_altmap *altmap);
  void vmemmap_populate_print_last(void);
@@ -3268,7 +3428,6 @@ enum mf_flags {
  int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
                       unsigned long count, int mf_flags);
  extern int memory_failure(unsigned long pfn, int flags);
-extern void memory_failure_queue(unsigned long pfn, int flags);
  extern void memory_failure_queue_kick(int cpu);
  extern int unpoison_memory(unsigned long pfn);
  extern int sysctl_memory_failure_early_kill;
@@ -3277,12 +3436,42 @@ extern void shake_page(struct page *p);
  extern atomic_long_t num_poisoned_pages __read_mostly;
  extern int soft_offline_page(unsigned long pfn, int flags);
  #ifdef CONFIG_MEMORY_FAILURE
-extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+extern void memory_failure_queue(unsigned long pfn, int flags);
+extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+                                       bool *migratable_cleared);
+void num_poisoned_pages_inc(unsigned long pfn);
+void num_poisoned_pages_sub(unsigned long pfn, long i);
  #else
-static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+static inline void memory_failure_queue(unsigned long pfn, int flags)
+{
+}
+
+static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+                                       bool *migratable_cleared)
  {
         return 0;
  }
+
+static inline void num_poisoned_pages_inc(unsigned long pfn)
+{
+}
+
+static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+{
+}
+#endif
+
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+extern void memblk_nr_poison_inc(unsigned long pfn);
+extern void memblk_nr_poison_sub(unsigned long pfn, long i);
+#else
+static inline void memblk_nr_poison_inc(unsigned long pfn)
+{
+}
+
+static inline void memblk_nr_poison_sub(unsigned long pfn, long i)
+{
+}
  #endif
  
  #ifndef arch_memory_failure
@@ -3467,12 +3656,4 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
  }
  #endif
  
-/*
- * Whether to drop the pte markers, for example, the uffd-wp information for
- * file-backed memory.  This should only be specified when we will completely
- * drop the page in the mm, either by truncation or unmapping of the vma.  By
- * default, the flag is not set.
- */
-#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
-
  #endif /* _LINUX_MM_H */