hugetlb: batch PMD split for bulk vmemmap dedup
author	Joao Martins <joao.m.martins@oracle.com>
	Thu, 19 Oct 2023 02:31:08 +0000 (19:31 -0700)
committer	Andrew Morton <akpm@linux-foundation.org>
	Wed, 25 Oct 2023 23:47:07 +0000 (16:47 -0700)
In an effort to minimize the number of TLB flushes, batch all PMD splits
belonging to a range of pages so that only a single (global) TLB flush
needs to be performed.

Add a flags field to the walker and use it to indicate whether the
request is a bulk operation or just a single page remap.  The first
flag (VMEMMAP_SPLIT_NO_TLB_FLUSH) requests that the TLB flush be
skipped when splitting the PMD.
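
In rough terms, the flow added to hugetlb_vmemmap_optimize_folios() by
this patch looks like the sketch below (illustrative only; the names,
calls and signatures are taken from the diff further down):

	/* Pass 1: split every PMD, but defer the TLB flush. */
	list_for_each_entry(folio, folio_list, lru) {
		/*
		 * hugetlb_vmemmap_split() walks with remap_pte == NULL and
		 * VMEMMAP_SPLIT_NO_TLB_FLUSH, so no per-PMD flush is issued.
		 * Bail on the first OOM; the split can still be done later,
		 * during the remap pass.
		 */
		if (hugetlb_vmemmap_split(h, &folio->page) == -ENOMEM)
			break;
	}

	/* One global flush covers every PMD split performed above. */
	flush_tlb_all();

	/* Pass 2: remap as before (error handling elided). */
	list_for_each_entry(folio, folio_list, lru)
		__hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);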

Rebased and updated by Mike Kravetz

Link: https://lkml.kernel.org/r/20231019023113.345257-7-mike.kravetz@oracle.com
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Konrad Dybcio <konradybcio@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Usama Arif <usama.arif@bytedance.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/hugetlb_vmemmap.c

index 4ac521e596db7d42b51e841f702a6b3a52b01898..10739e4285d5e7d2579a56a46c2374b8b11f5785 100644
@@ -27,6 +27,8 @@
  * @reuse_addr:                the virtual address of the @reuse_page page.
  * @vmemmap_pages:     the list head of the vmemmap pages that can be freed
  *                     or is mapped from.
+ * @flags:             used to modify behavior in vmemmap page table walking
+ *                     operations.
  */
 struct vmemmap_remap_walk {
        void                    (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -35,9 +37,13 @@ struct vmemmap_remap_walk {
        struct page             *reuse_page;
        unsigned long           reuse_addr;
        struct list_head        *vmemmap_pages;
+
+/* Skip the TLB flush when we split the PMD */
+#define VMEMMAP_SPLIT_NO_TLB_FLUSH     BIT(0)
+       unsigned long           flags;
 };
 
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
 {
        pmd_t __pmd;
        int i;
@@ -80,7 +86,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
                /* Make pte visible before pmd. See comment in pmd_install(). */
                smp_wmb();
                pmd_populate_kernel(&init_mm, pmd, pgtable);
-               flush_tlb_kernel_range(start, start + PMD_SIZE);
+               if (flush)
+                       flush_tlb_kernel_range(start, start + PMD_SIZE);
        } else {
                pte_free_kernel(&init_mm, pgtable);
        }
@@ -127,11 +134,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
        do {
                int ret;
 
-               ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+               ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
+                               !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH));
                if (ret)
                        return ret;
 
                next = pmd_addr_end(addr, end);
+
+               /*
+                * We are only splitting, not remapping the hugetlb vmemmap
+                * pages.
+                */
+               if (!walk->remap_pte)
+                       continue;
+
                vmemmap_pte_range(pmd, addr, next, walk);
        } while (pmd++, addr = next, addr != end);
 
@@ -198,7 +214,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
                        return ret;
        } while (pgd++, addr = next, addr != end);
 
-       flush_tlb_kernel_range(start, end);
+       if (walk->remap_pte)
+               flush_tlb_kernel_range(start, end);
 
        return 0;
 }
@@ -297,6 +314,36 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
        set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
 }
 
+/**
+ * vmemmap_remap_split - split the huge PMDs backing the vmemmap virtual
+ *                      address range [@start, @end) into PTEs
+ * @start:     start address of the vmemmap virtual address range that we want
+ *             to split.
+ * @end:       end address of the vmemmap virtual address range that we want to
+ *             split.
+ * @reuse:     reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_split(unsigned long start, unsigned long end,
+                               unsigned long reuse)
+{
+       int ret;
+       struct vmemmap_remap_walk walk = {
+               .remap_pte      = NULL,
+               .flags          = VMEMMAP_SPLIT_NO_TLB_FLUSH,
+       };
+
+       /* See the comment in the vmemmap_remap_free(). */
+       BUG_ON(start - reuse != PAGE_SIZE);
+
+       mmap_read_lock(&init_mm);
+       ret = vmemmap_remap_range(reuse, end, &walk);
+       mmap_read_unlock(&init_mm);
+
+       return ret;
+}
+
 /**
  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
  *                     to the page which @reuse is mapped to, then free vmemmap
@@ -320,6 +367,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
                .remap_pte      = vmemmap_remap_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = vmemmap_pages,
+               .flags          = 0,
        };
        int nid = page_to_nid((struct page *)reuse);
        gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
@@ -368,6 +416,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
                        .remap_pte      = vmemmap_restore_pte,
                        .reuse_addr     = reuse,
                        .vmemmap_pages  = vmemmap_pages,
+                       .flags          = 0,
                };
 
                vmemmap_remap_range(reuse, end, &walk);
@@ -419,6 +468,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                .remap_pte      = vmemmap_restore_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = &vmemmap_pages,
+               .flags          = 0,
        };
 
        /* See the comment in the vmemmap_remap_free(). */
@@ -628,11 +678,45 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
        free_vmemmap_page_list(&vmemmap_pages);
 }
 
+static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
+{
+       unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+       unsigned long vmemmap_reuse;
+
+       if (!vmemmap_should_optimize(h, head))
+               return 0;
+
+       vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
+       vmemmap_reuse   = vmemmap_start;
+       vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+       /*
+        * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
+        * @vmemmap_end)
+        */
+       return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
+}
+
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
 {
        struct folio *folio;
        LIST_HEAD(vmemmap_pages);
 
+       list_for_each_entry(folio, folio_list, lru) {
+               int ret = hugetlb_vmemmap_split(h, &folio->page);
+
+               /*
+                * Splitting the PMD requires allocating a page, so fail
+                * early once we encounter the first OOM.  There is no point
+                * in retrying, as the split can be done dynamically on remap
+                * with the memory we get back from the vmemmap deduplication.
+                */
+               if (ret == -ENOMEM)
+                       break;
+       }
+
+       flush_tlb_all();
+
        list_for_each_entry(folio, folio_list, lru) {
                int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
                                                                &vmemmap_pages);