mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask

author Michal Hocko <mhocko@suse.com>

Fri, 2 Nov 2018 22:48:31 +0000 (15:48 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 3 Nov 2018 17:09:37 +0000 (10:09 -0700)
author Michal Hocko <mhocko@suse.com>
Fri, 2 Nov 2018 22:48:31 +0000 (15:48 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 3 Nov 2018 17:09:37 +0000 (10:09 -0700)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h

index 24bcc5eec6b409ec379602156a94d74373ec2633..76f8db0b0e715c016cc00cb95aa9a269f12c075d 100644 (file)
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
  }
  extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
                         struct vm_area_struct *vma, unsigned long addr,
-                       int node, bool hugepage);
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
-       alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
+                       int node);
  #else
  #define alloc_pages(gfp_mask, order) \
                 alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
-       alloc_pages(gfp_mask, order)
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
         alloc_pages(gfp_mask, order)
  #endif
  #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
  #define alloc_page_vma(gfp_mask, vma, addr)                    \
-       alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
+       alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
  #define alloc_page_vma_node(gfp_mask, vma, addr, node)         \
-       alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
+       alloc_pages_vma(gfp_mask, 0, vma, addr, node)
  
  extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
  extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h

index 5228c62af41659bb7d5ae0e7db00969b9f16ef73..bac395f1d00a0f9691b12ec6841f2401a10ca4fc 100644 (file)
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
  struct mempolicy *get_task_policy(struct task_struct *p);
  struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                 unsigned long addr);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+                                               unsigned long addr);
  bool vma_policy_mof(struct vm_area_struct *vma);
  
  extern void numa_default_policy(void);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 4e4ef8fa479d53b7ee7c4c8fcb86985acb790c8a..55478ab3c83be372f9fa4d654f16e32ffdeb1e29 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,21 +629,40 @@ release:
   *         available
   * never: never stall for any thp allocation
   */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
  {
         const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+       gfp_t this_node = 0;
+
+#ifdef CONFIG_NUMA
+       struct mempolicy *pol;
+       /*
+        * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
+        * specified, to express a general desire to stay on the current
+        * node for optimistic allocation attempts. If the defrag mode
+        * and/or madvise hint requires the direct reclaim then we prefer
+        * to fallback to other node rather than node reclaim because that
+        * can lead to excessive reclaim even though there is free memory
+        * on other nodes. We expect that NUMA preferences are specified
+        * by memory policies.
+        */
+       pol = get_vma_policy(vma, addr);
+       if (pol->mode != MPOL_BIND)
+               this_node = __GFP_THISNODE;
+       mpol_cond_put(pol);
+#endif
  
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
                 return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            __GFP_KSWAPD_RECLAIM);
+                                                            __GFP_KSWAPD_RECLAIM | this_node);
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
                 return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            0);
-       return GFP_TRANSHUGE_LIGHT;
+                                                            this_node);
+       return GFP_TRANSHUGE_LIGHT | this_node;
  }
  
  /* Caller must hold page table lock. */
@@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                         pte_free(vma->vm_mm, pgtable);
                 return ret;
         }
-       gfp = alloc_hugepage_direct_gfpmask(vma);
-       page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+       gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+       page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
         if (unlikely(!page)) {
                 count_vm_event(THP_FAULT_FALLBACK);
                 return VM_FAULT_FALLBACK;
@@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
  alloc:
         if (transparent_hugepage_enabled(vma) &&
             !transparent_hugepage_debug_cow()) {
-               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
+               huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+               new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
+                               haddr, numa_node_id());
         } else
                 new_page = NULL;
  
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 58fb833fce0ce7a9d21e6729b044810225f687d6..5837a067124d895f38f6039d9e3739f0a0874fc0 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
         } else if (PageTransHuge(page)) {
                 struct page *thp;
  
-               thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
-                                        HPAGE_PMD_ORDER);
+               thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
+                               address, numa_node_id());
                 if (!thp)
                         return NULL;
                 prep_transhuge_page(thp);
@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
   * freeing by another task.  It is the caller's responsibility to free the
   * extra reference for shared policies.
   */
-static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                                 unsigned long addr)
  {
         struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2011,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
   *     @vma:  Pointer to VMA or NULL if not available.
   *     @addr: Virtual Address of the allocation. Must be inside the VMA.
   *     @node: Which node to prefer for allocation (modulo policy).
- *     @hugepage: for hugepages try only the preferred node if possible
   *
   *     This function allocates a page from the kernel page pool and applies
   *     a NUMA policy associated with the VMA or the current process.
@@ -2022,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
   */
  struct page *
  alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-               unsigned long addr, int node, bool hugepage)
+               unsigned long addr, int node)
  {
         struct mempolicy *pol;
         struct page *page;
@@ -2040,60 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                 goto out;
         }
  
-       if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
-               int hpage_node = node;
-
-               /*
-                * For hugepage allocation and non-interleave policy which
-                * allows the current node (or other explicitly preferred
-                * node) we only try to allocate from the current/preferred
-                * node and don't fall back to other nodes, as the cost of
-                * remote accesses would likely offset THP benefits.
-                *
-                * If the policy is interleave, or does not allow the current
-                * node in its nodemask, we allocate the standard way.
-                */
-               if (pol->mode == MPOL_PREFERRED &&
-                                               !(pol->flags & MPOL_F_LOCAL))
-                       hpage_node = pol->v.preferred_node;
-
-               nmask = policy_nodemask(gfp, pol);
-               if (!nmask || node_isset(hpage_node, *nmask)) {
-                       mpol_cond_put(pol);
-                       /*
-                        * We cannot invoke reclaim if __GFP_THISNODE
-                        * is set. Invoking reclaim with
-                        * __GFP_THISNODE set, would cause THP
-                        * allocations to trigger heavy swapping
-                        * despite there may be tons of free memory
-                        * (including potentially plenty of THP
-                        * already available in the buddy) on all the
-                        * other NUMA nodes.
-                        *
-                        * At most we could invoke compaction when
-                        * __GFP_THISNODE is set (but we would need to
-                        * refrain from invoking reclaim even if
-                        * compaction returned COMPACT_SKIPPED because
-                        * there wasn't not enough memory to succeed
-                        * compaction). For now just avoid
-                        * __GFP_THISNODE instead of limiting the
-                        * allocation path to a strict and single
-                        * compaction invocation.
-                        *
-                        * Supposedly if direct reclaim was enabled by
-                        * the caller, the app prefers THP regardless
-                        * of the node it comes from so this would be
-                        * more desiderable behavior than only
-                        * providing THP originated from the local
-                        * node in such case.
-                        */
-                       if (!(gfp & __GFP_DIRECT_RECLAIM))
-                               gfp |= __GFP_THISNODE;
-                       page = __alloc_pages_node(hpage_node, gfp, order);
-                       goto out;
-               }
-       }
-
         nmask = policy_nodemask(gfp, pol);
         preferred_nid = policy_node(gfp, pol, node);
         page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
diff --git a/mm/shmem.c b/mm/shmem.c

index 56bf122e0bb4ddf7b57548e7e4b4a33bbdf9a9ab..ea26d7a0342d77ac67f47e813a73f125c873a1e5 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1435,7 +1435,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
  
         shmem_pseudo_vma_init(&pvma, info, hindex);
         page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-                       HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+                       HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
         shmem_pseudo_vma_destroy(&pvma);
         if (page)
                 prep_transhuge_page(page);
author	Michal Hocko <mhocko@suse.com>
	Fri, 2 Nov 2018 22:48:31 +0000 (15:48 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 3 Nov 2018 17:09:37 +0000 (10:09 -0700)
include/linux/gfp.h		patch \| blob \| history
include/linux/mempolicy.h		patch \| blob \| history
mm/huge_memory.c		patch \| blob \| history
mm/mempolicy.c		patch \| blob \| history
mm/shmem.c		patch \| blob \| history