diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 811d19b5c4f606f4bf6e31df751d376ee000aaef..abe6cfd92ffa0ecbb50a096742f8ca2073d45736 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1035,6 +1035,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
        unsigned long pfn = pmd_pfn(*pmd);
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
+       int ret;
 
        assert_spin_locked(pmd_lockptr(mm, pmd));
 
@@ -1066,8 +1067,9 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
        if (!*pgmap)
                return ERR_PTR(-EFAULT);
        page = pfn_to_page(pfn);
-       if (!try_grab_page(page, flags))
-               page = ERR_PTR(-ENOMEM);
+       ret = try_grab_page(page, flags);
+       if (ret)
+               page = ERR_PTR(ret);
 
        return page;
 }
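
try_grab_page() used to return a bool, so every failure had to be
collapsed into -ENOMEM. It now returns an errno that callers propagate
verbatim via ERR_PTR(). A hedged sketch of the reworked contract (the
exact error set is an assumption; it is not visible in this hunk):

	/*
	 * 0 on success, or a negative errno explaining why the
	 * ref/pin was refused: -ENOMEM on refcount problems, and
	 * possibly other codes (e.g. -EREMOTEIO for P2PDMA pages
	 * a GUP caller may not touch).
	 */
	int __must_check try_grab_page(struct page *page, unsigned int flags);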
@@ -1193,6 +1195,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
        unsigned long pfn = pud_pfn(*pud);
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
+       int ret;
 
        assert_spin_locked(pud_lockptr(mm, pud));
 
@@ -1226,8 +1229,10 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
        if (!*pgmap)
                return ERR_PTR(-EFAULT);
        page = pfn_to_page(pfn);
-       if (!try_grab_page(page, flags))
-               page = ERR_PTR(-ENOMEM);
+
+       ret = try_grab_page(page, flags);
+       if (ret)
+               page = ERR_PTR(ret);
 
        return page;
 }
@@ -1313,9 +1318,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);
 
-       VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
-       VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
-
        if (is_huge_zero_pmd(orig_pmd))
                goto fallback;
 
@@ -1379,7 +1381,7 @@ reuse:
                if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
                        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                spin_unlock(vmf->ptl);
-               return VM_FAULT_WRITE;
+               return 0;
        }
 
 unlock_fallback:
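
Two cleanups meet in do_huge_pmd_wp_page(). The write/unshare sanity
asserts are dropped, presumably because the combination is now validated
once at fault entry rather than in each handler (an assumption; the
validating site is not part of this diff). And the reuse path returns 0
instead of VM_FAULT_WRITE: the PMD has just been mapped writable, so the
caller learns nothing extra from a dedicated return code. Folded into
one line, the dropped asserts said a WP fault is either a write or an
unshare, never both and never neither:

	/* Hedged equivalent of the two removed asserts: */
	VM_BUG_ON(unshare == !!(vmf->flags & FAULT_FLAG_WRITE));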
@@ -1390,6 +1392,36 @@ fallback:
        return VM_FAULT_FALLBACK;
 }
 
+static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
+                                          unsigned long addr, pmd_t pmd)
+{
+       struct page *page;
+
+       if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
+               return false;
+
+       /* Don't touch entries that are not even readable (NUMA hinting). */
+       if (pmd_protnone(pmd))
+               return false;
+
+       /* Do we need write faults for softdirty tracking? */
+       if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+               return false;
+
+       /* Do we need write faults for uffd-wp tracking? */
+       if (userfaultfd_huge_pmd_wp(vma, pmd))
+               return false;
+
+       if (!(vma->vm_flags & VM_SHARED)) {
+               /* See can_change_pte_writable(). */
+               page = vm_normal_page_pmd(vma, addr, pmd);
+               return page && PageAnon(page) && PageAnonExclusive(page);
+       }
+
+       /* See can_change_pte_writable(). */
+       return pmd_dirty(pmd);
+}
+
 /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
 static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
                                        struct vm_area_struct *vma,
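
can_change_pmd_writable() is the PMD-level twin of
can_change_pte_writable() in mm/mprotect.c, which both "See
can_change_pte_writable()" comments point at. For reference, a
condensed, from-memory sketch of that PTE-side helper; treat names and
ordering as approximate:

	static bool can_change_pte_writable(struct vm_area_struct *vma,
					    unsigned long addr, pte_t pte)
	{
		struct page *page;

		if (pte_protnone(pte) || !(vma->vm_flags & VM_WRITE))
			return false;
		/* Write faults still needed for softdirty tracking? */
		if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
			return false;
		/* Write faults still needed for uffd-wp tracking? */
		if (userfaultfd_pte_wp(vma, pte))
			return false;
		if (!(vma->vm_flags & VM_SHARED)) {
			/* Private: only exclusive anon pages are safe. */
			page = vm_normal_page(vma, addr, pte);
			return page && PageAnon(page) && PageAnonExclusive(page);
		}
		/* Shared: a dirty PTE means writenotify already ran. */
		return pte_dirty(pte);
	}

The private-mapping case is the interesting one: a PageAnonExclusive
page is owned by this process alone, so mapping it writable cannot leak
a COW-shared page.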
@@ -1435,6 +1467,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 {
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
+       int ret;
 
        assert_spin_locked(pmd_lockptr(mm, pmd));
 
@@ -1453,14 +1486,15 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
        if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags))
                return NULL;
 
-       if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
+       if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
                return ERR_PTR(-EMLINK);
 
        VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
                        !PageAnonExclusive(page), page);
 
-       if (!try_grab_page(page, flags))
-               return ERR_PTR(-ENOMEM);
+       ret = try_grab_page(page, flags);
+       if (ret)
+               return ERR_PTR(ret);
 
        if (flags & FOLL_TOUCH)
                touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
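
gup_must_unshare() grows a vma argument. The signature implied by this
caller:

	bool gup_must_unshare(struct vm_area_struct *vma,
			      unsigned int flags, struct page *page);

A hedged reading of why: the unshare decision can now consult VMA state,
while GUP-fast, which walks page tables without the mmap lock and has no
VMA at hand, passes NULL and must stay conservative.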
@@ -1481,8 +1515,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        int page_nid = NUMA_NO_NODE;
        int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
-       bool migrated = false;
-       bool was_writable = pmd_savedwrite(oldpmd);
+       bool migrated = false, writable = false;
        int flags = 0;
 
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1492,12 +1525,22 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
        }
 
        pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+
+       /*
+        * Detect now whether the PMD could be writable; this information
+        * is only valid while holding the PT lock.
+        */
+       writable = pmd_write(pmd);
+       if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+           can_change_pmd_writable(vma, vmf->address, pmd))
+               writable = true;
+
        page = vm_normal_page_pmd(vma, haddr, pmd);
        if (!page)
                goto out_map;
 
        /* See similar comment in do_numa_page for explanation */
-       if (!was_writable)
+       if (!writable)
                flags |= TNF_NO_GROUP;
 
        page_nid = page_to_nid(page);
@@ -1516,6 +1559,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
        }
 
        spin_unlock(vmf->ptl);
+       writable = false;
 
        migrated = migrate_misplaced_page(page, vma, target_nid);
        if (migrated) {
@@ -1542,7 +1586,7 @@ out_map:
        /* Restore the PMD */
        pmd = pmd_modify(oldpmd, vma->vm_page_prot);
        pmd = pmd_mkyoung(pmd);
-       if (was_writable)
+       if (writable)
                pmd = pmd_mkwrite(pmd);
        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
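
The NUMA-hinting fault path stops depending on pmd_savedwrite(),
essentially a powerpc special case that remembered at protection time
whether the PMD used to be writable. Writability is instead recomputed
under the PT lock via can_change_pmd_writable(), deliberately forgotten
("writable = false") before the lock is dropped for migration, since the
answer is stale the moment the lock goes away, and re-derived on the
out_map restore path. The shape, as a hedged condensation of the hunks
above:

	spin_lock(ptl);
	writable = pmd_write(pmd) ||
		   (vma_wants_manual_pte_write_upgrade(vma) &&
		    can_change_pmd_writable(vma, addr, pmd));
	/* ... "writable" is only meaningful while ptl is held ... */
	spin_unlock(ptl);
	writable = false;	/* stale once the lock is dropped */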
@@ -1783,11 +1827,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pmd_t oldpmd, entry;
-       bool preserve_write;
-       int ret;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+       int ret = 1;
 
        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 
@@ -1798,9 +1841,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        if (!ptl)
                return 0;
 
-       preserve_write = prot_numa && pmd_write(*pmd);
-       ret = 1;
-
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        if (is_swap_pmd(*pmd)) {
                swp_entry_t entry = pmd_to_swp_entry(*pmd);
@@ -1880,8 +1920,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
 
        entry = pmd_modify(oldpmd, newprot);
-       if (preserve_write)
-               entry = pmd_mk_savedwrite(entry);
        if (uffd_wp) {
                entry = pmd_wrprotect(entry);
                entry = pmd_mkuffd_wp(entry);
@@ -1893,13 +1931,17 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 */
                entry = pmd_clear_uffd_wp(entry);
        }
+
+       /* See change_pte_range(). */
+       if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
+           can_change_pmd_writable(vma, addr, entry))
+               entry = pmd_mkwrite(entry);
+
        ret = HPAGE_PMD_NR;
        set_pmd_at(mm, addr, pmd, entry);
 
        if (huge_pmd_needs_flush(oldpmd, entry))
                tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
-
-       BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
 unlock:
        spin_unlock(ptl);
        return ret;
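
change_huge_pmd() now mirrors change_pte_range(): once the new
protection is built, the PMD is opportunistically upgraded to writable
when MM_CP_TRY_CHANGE_WRITABLE is set and can_change_pmd_writable()
approves, saving an immediate follow-up write fault after mprotect().
With savedwrite gone, the old BUG_ON (a writable anon PMD without
preserve_write) no longer states an invariant; writable anon PMDs are
exactly what the upgrade produces, hence its removal. A hedged sketch of
the caller side that sets the flag, with names from memory:

	/* In mprotect_fixup(), roughly: */
	unsigned long mm_cp_flags = 0;

	if (vma_wants_manual_pte_write_upgrade(vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	change_protection(tlb, vma, start, end, vma->vm_page_prot,
			  mm_cp_flags);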
@@ -2141,7 +2183,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                uffd_wp = pmd_uffd_wp(old_pmd);
 
                VM_BUG_ON_PAGE(!page_count(page), page);
-               page_ref_add(page, HPAGE_PMD_NR - 1);
 
                /*
                 * Without "freeze", we'll simply split the PMD, propagating the
@@ -2161,6 +2202,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
                if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
                        freeze = false;
+               if (!freeze)
+                       page_ref_add(page, HPAGE_PMD_NR - 1);
        }
 
        /*
@@ -2202,66 +2245,37 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        entry = maybe_mkwrite(entry, vma);
                        if (anon_exclusive)
                                SetPageAnonExclusive(page + i);
-                       if (!write)
-                               entry = pte_wrprotect(entry);
                        if (!young)
                                entry = pte_mkold(entry);
+                       /* NOTE: this may set soft-dirty too on some archs */
+                       if (dirty)
+                               entry = pte_mkdirty(entry);
                        /*
-                        * NOTE: we don't do pte_mkdirty when dirty==true
-                        * because it breaks sparc64 which can sigsegv
-                        * random process.  Need to revisit when we figure
-                        * out what is special with sparc64.
+                        * NOTE: this needs to happen after pte_mkdirty,
+                        * because some archs (sparc64, loongarch) could
+                        * set hw write bit when mkdirty.
                         */
+                       if (!write)
+                               entry = pte_wrprotect(entry);
                        if (soft_dirty)
                                entry = pte_mksoft_dirty(entry);
                        if (uffd_wp)
                                entry = pte_mkuffd_wp(entry);
+                       page_add_anon_rmap(page + i, vma, addr, false);
                }
                pte = pte_offset_map(&_pmd, addr);
                BUG_ON(!pte_none(*pte));
                set_pte_at(mm, addr, pte, entry);
-               if (!pmd_migration)
-                       atomic_inc(&page[i]._mapcount);
                pte_unmap(pte);
        }
 
-       if (!pmd_migration) {
-               /*
-                * Set PG_double_map before dropping compound_mapcount to avoid
-                * false-negative page_mapped().
-                */
-               if (compound_mapcount(page) > 1 &&
-                   !TestSetPageDoubleMap(page)) {
-                       for (i = 0; i < HPAGE_PMD_NR; i++)
-                               atomic_inc(&page[i]._mapcount);
-               }
-
-               lock_page_memcg(page);
-               if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
-                       /* Last compound_mapcount is gone. */
-                       __mod_lruvec_page_state(page, NR_ANON_THPS,
-                                               -HPAGE_PMD_NR);
-                       if (TestClearPageDoubleMap(page)) {
-                               /* No need in mapcount reference anymore */
-                               for (i = 0; i < HPAGE_PMD_NR; i++)
-                                       atomic_dec(&page[i]._mapcount);
-                       }
-               }
-               unlock_page_memcg(page);
-
-               /* Above is effectively page_remove_rmap(page, vma, true) */
-               munlock_vma_page(page, vma, true);
-       }
+       if (!pmd_migration)
+               page_remove_rmap(page, vma, true);
+       if (freeze)
+               put_page(page);
 
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
-
-       if (freeze) {
-               for (i = 0; i < HPAGE_PMD_NR; i++) {
-                       page_remove_rmap(page + i, vma, false);
-                       put_page(page + i);
-               }
-       }
 }
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
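
Two reworks overlap in __split_huge_pmd_locked(). First, the dirty bit:
pte_mkdirty() can set the hardware write bit on some architectures
(sparc64, loongarch), which is why the old code never dared to dirty the
split PTEs. The fix is ordering, dirty first, write-protect after:

	/* Old hazard (hedged illustration), wrong order: */
	entry = pte_wrprotect(entry);
	entry = pte_mkdirty(entry);	/* may re-set the HW write bit */

	/* What the hunk does instead: */
	entry = pte_mkdirty(entry);
	entry = pte_wrprotect(entry);

Second, the open-coded compound-mapcount and PageDoubleMap juggling is
replaced by one page_add_anon_rmap() per subpage plus a single compound
page_remove_rmap(), part of the wider mapcount simplification; the
HPAGE_PMD_NR - 1 reference bias is correspondingly taken only when the
subpages really get mapped (!freeze), and a single put_page() balances
the freeze case.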
@@ -2447,13 +2461,14 @@ static void __split_huge_page_tail(struct page *head, int tail,
                         (1L << PG_workingset) |
                         (1L << PG_locked) |
                         (1L << PG_unevictable) |
-#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_USES_PG_ARCH_X
                         (1L << PG_arch_2) |
+                        (1L << PG_arch_3) |
 #endif
                         (1L << PG_dirty) |
                         LRU_GEN_MASK | LRU_REFS_MASK));
 
-       /* ->mapping in first tail page is compound_mapcount */
+       /* ->mapping in first and second tail page is replaced by other uses */
        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
                        page_tail);
        page_tail->mapping = head->mapping;
@@ -2463,6 +2478,10 @@ static void __split_huge_page_tail(struct page *head, int tail,
         * page->private should not be set in tail pages with the exception
         * of swap cache pages that store the swp_entry_t in tail pages.
         * Fix up and warn once if private is unexpectedly set.
+        *
+        * What of 32-bit systems, on which head[1].compound_pincount overlays
+        * head[1].private?  No problem: THP_SWAP is not enabled on 32-bit, and
+        * compound_pincount must be 0 for folio_ref_freeze() to have succeeded.
         */
        if (!folio_test_swapcache(page_folio(head))) {
                VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
@@ -2715,7 +2734,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         * split PMDs
         */
        if (!can_split_folio(folio, &extra_pins)) {
-               ret = -EBUSY;
+               ret = -EAGAIN;
                goto out_unlock;
        }
 
@@ -2765,7 +2784,7 @@ fail:
                        xas_unlock(&xas);
                local_irq_enable();
                remap_page(folio, folio_nr_pages(folio));
-               ret = -EBUSY;
+               ret = -EAGAIN;
        }
 
 out_unlock:
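
Failure to split because of extra references is now reported as -EAGAIN
rather than -EBUSY: the pins that make can_split_folio() or the refcount
freeze fail (e.g. a GUP in flight) are usually transient, so callers can
meaningfully retry. A hedged caller-side sketch; the retry shape is an
assumption:

	ret = split_folio(folio);
	if (ret == -EAGAIN) {
		/* transient pins; back off and retry the split later */
	}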
@@ -3069,28 +3088,28 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
        mapping = candidate->f_mapping;
 
        for (index = off_start; index < off_end; index += nr_pages) {
-               struct page *fpage = pagecache_get_page(mapping, index,
-                                               FGP_ENTRY | FGP_HEAD, 0);
+               struct folio *folio = __filemap_get_folio(mapping, index,
+                                               FGP_ENTRY, 0);
 
                nr_pages = 1;
-               if (xa_is_value(fpage) || !fpage)
+               if (xa_is_value(folio) || !folio)
                        continue;
 
-               if (!is_transparent_hugepage(fpage))
+               if (!folio_test_large(folio))
                        goto next;
 
                total++;
-               nr_pages = thp_nr_pages(fpage);
+               nr_pages = folio_nr_pages(folio);
 
-               if (!trylock_page(fpage))
+               if (!folio_trylock(folio))
                        goto next;
 
-               if (!split_huge_page(fpage))
+               if (!split_folio(folio))
                        split++;
 
-               unlock_page(fpage);
+               folio_unlock(folio);
 next:
-               put_page(fpage);
+               folio_put(folio);
                cond_resched();
        }
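
Finally, the debugfs splitter moves from page to folio APIs.
__filemap_get_folio() with FGP_ENTRY hands back the raw page-cache slot
contents, which may be a shadow or swap value entry rather than a folio;
that is what the xa_is_value() test screens out. folio_test_large()
replaces is_transparent_hugepage(), since any large folio found here is
worth handing to split_folio(). For reference, xa_is_value() is just the
tagged-pointer test (sketch of include/linux/xarray.h):

	static inline bool xa_is_value(const void *entry)
	{
		/* Value entries have bit 0 set; folio pointers do not. */
		return (unsigned long)entry & 1;
	}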