diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9a334f5fb730873190a57648bc0f040f91ac0ed6..7c204e3d132b808364fc1e87e3dace180f9ba173 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_owner.h>
 #include "internal.h"
 
-int hugepages_treat_as_movable;
-
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
@@ -926,7 +925,7 @@ retry_cpuset:
 /* Movability of hugepages depends on migration support. */
 static inline gfp_t htlb_alloc_mask(struct hstate *h)
 {
-       if (hugepages_treat_as_movable || hugepage_migration_supported(h))
+       if (hugepage_migration_supported(h))
                return GFP_HIGHUSER_MOVABLE;
        else
                return GFP_HIGHUSER;
@@ -1108,7 +1107,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
        return zone_spans_pfn(zone, last_pfn);
 }
 
-static struct page *alloc_gigantic_page(int nid, struct hstate *h)
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nodemask)
 {
        unsigned int order = huge_page_order(h);
        unsigned long nr_pages = 1 << order;
@@ -1116,11 +1116,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
        struct zonelist *zonelist;
        struct zone *zone;
        struct zoneref *z;
-       gfp_t gfp_mask;
 
-       gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
        zonelist = node_zonelist(nid, gfp_mask);
-       for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
                spin_lock_irqsave(&zone->lock, flags);
 
                pfn = ALIGN(zone->zone_start_pfn, nr_pages);
@@ -1151,41 +1149,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
 static void prep_compound_gigantic_page(struct page *page, unsigned int order);
 
-static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
-{
-       struct page *page;
-
-       page = alloc_gigantic_page(nid, h);
-       if (page) {
-               prep_compound_gigantic_page(page, huge_page_order(h));
-               prep_new_huge_page(h, page, nid);
-       }
-
-       return page;
-}
-
-static int alloc_fresh_gigantic_page(struct hstate *h,
-                               nodemask_t *nodes_allowed)
-{
-       struct page *page = NULL;
-       int nr_nodes, node;
-
-       for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-               page = alloc_fresh_gigantic_page_node(h, node);
-               if (page)
-                       return 1;
-       }
-
-       return 0;
-}
-
 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
 static inline bool gigantic_page_supported(void) { return false; }
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nodemask) { return NULL; }
 static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
                                                unsigned int order) { }
-static inline int alloc_fresh_gigantic_page(struct hstate *h,
-                                       nodemask_t *nodes_allowed) { return 0; }
 #endif
 
 static void update_and_free_page(struct hstate *h, struct page *page)
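
Note that the new !CONFIG_ARCH_HAS_GIGANTIC_PAGE stub for alloc_gigantic_page() is there so the consolidated alloc_fresh_huge_page() added later in this patch can call it unconditionally and simply see NULL when gigantic pages are compiled out. A minimal userspace sketch of that stub idiom, with hypothetical names (HAVE_GIGANTIC stands in for the config option; this is not the kernel API):

#include <stdio.h>
#include <stdlib.h>

/*
 * Build-time stub idiom: when the "gigantic" path is compiled out, keep a
 * trivial stub so the common allocator below needs no #ifdef at the call
 * site.
 */
#ifdef HAVE_GIGANTIC
static void *alloc_gigantic(size_t size)
{
	return malloc(size);		/* stands in for the real range search */
}
#else
static void *alloc_gigantic(size_t size)
{
	(void)size;
	return NULL;			/* compiled out: always fails */
}
#endif

static void *alloc_fresh(size_t size, int gigantic)
{
	/* One call site, no #ifdef: the stub makes the branch harmless. */
	return gigantic ? alloc_gigantic(size) : malloc(size);
}

int main(void)
{
	void *p = alloc_fresh(4096, 1);

	printf("gigantic attempt: %s\n", p ? "ok" : "not supported");
	free(p);
	return 0;
}
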
@@ -1250,6 +1220,28 @@ static void clear_page_huge_active(struct page *page)
        ClearPagePrivate(&page[1]);
 }
 
+/*
+ * Internal hugetlb-specific page flag. Do not use it outside of the hugetlb
+ * code.
+ */
+static inline bool PageHugeTemporary(struct page *page)
+{
+       if (!PageHuge(page))
+               return false;
+
+       return (unsigned long)page[2].mapping == -1U;
+}
+
+static inline void SetPageHugeTemporary(struct page *page)
+{
+       page[2].mapping = (void *)-1U;
+}
+
+static inline void ClearPageHugeTemporary(struct page *page)
+{
+       page[2].mapping = NULL;
+}
+
 void free_huge_page(struct page *page)
 {
        /*
@@ -1284,7 +1276,11 @@ void free_huge_page(struct page *page)
        if (restore_reserve)
                h->resv_huge_pages++;
 
-       if (h->surplus_huge_pages_node[nid]) {
+       if (PageHugeTemporary(page)) {
+               list_del(&page->lru);
+               ClearPageHugeTemporary(page);
+               update_and_free_page(h, page);
+       } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
                list_del(&page->lru);
                update_and_free_page(h, page);
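
PageHugeTemporary has no dedicated page-flag bit; it is encoded by storing a -1 sentinel in the otherwise unused page[2].mapping field of the compound page, and free_huge_page() now tests it first so that temporary pages go straight back to the page allocator instead of onto the hugetlb free lists. A minimal userspace sketch of the same sentinel-in-an-unused-field encoding, with hypothetical toy_* names (not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for struct page: one otherwise-unused pointer field. */
struct toy_page {
	void *mapping;
};

/* Encode a boolean flag as a sentinel value in the unused field. */
static inline bool toy_is_temporary(struct toy_page *tail)
{
	return (unsigned long)tail->mapping == -1UL;
}

static inline void toy_set_temporary(struct toy_page *tail)
{
	tail->mapping = (void *)-1UL;
}

static inline void toy_clear_temporary(struct toy_page *tail)
{
	tail->mapping = NULL;
}

int main(void)
{
	struct toy_page tail = { NULL };

	toy_set_temporary(&tail);
	/* A "free" path would branch on the flag, as free_huge_page now does. */
	if (toy_is_temporary(&tail)) {
		toy_clear_temporary(&tail);
		printf("temporary page: freed back to the page allocator\n");
	} else {
		printf("pooled page: returned to the hugetlb free lists\n");
	}
	return 0;
}
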
@@ -1306,7 +1302,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
        h->nr_huge_pages++;
        h->nr_huge_pages_node[nid]++;
        spin_unlock(&hugetlb_lock);
-       put_page(page); /* free it into the hugepage allocator */
 }
 
 static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1383,41 +1378,70 @@ pgoff_t __basepage_index(struct page *page)
        return (index << compound_order(page_head)) + compound_idx;
 }
 
-static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+static struct page *alloc_buddy_huge_page(struct hstate *h,
+               gfp_t gfp_mask, int nid, nodemask_t *nmask)
 {
+       int order = huge_page_order(h);
        struct page *page;
 
-       page = __alloc_pages_node(nid,
-               htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-                                               __GFP_RETRY_MAYFAIL|__GFP_NOWARN,
-               huge_page_order(h));
-       if (page) {
-               prep_new_huge_page(h, page, nid);
-       }
+       gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+       if (nid == NUMA_NO_NODE)
+               nid = numa_mem_id();
+       page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+       if (page)
+               __count_vm_event(HTLB_BUDDY_PGALLOC);
+       else
+               __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+       return page;
+}
+
+/*
+ * Common helper to allocate a fresh hugetlb page. All specific allocators
+ * should use this function to get new hugetlb pages
+ */
+static struct page *alloc_fresh_huge_page(struct hstate *h,
+               gfp_t gfp_mask, int nid, nodemask_t *nmask)
+{
+       struct page *page;
+
+       if (hstate_is_gigantic(h))
+               page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+       else
+               page = alloc_buddy_huge_page(h, gfp_mask,
+                               nid, nmask);
+       if (!page)
+               return NULL;
+
+       if (hstate_is_gigantic(h))
+               prep_compound_gigantic_page(page, huge_page_order(h));
+       prep_new_huge_page(h, page, page_to_nid(page));
 
        return page;
 }
 
-static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+/*
+ * Allocate a fresh page to the hugetlb allocator pool, interleaving across
+ * the allowed nodes.
+ */
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
 {
        struct page *page;
        int nr_nodes, node;
-       int ret = 0;
+       gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 
        for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-               page = alloc_fresh_huge_page_node(h, node);
-               if (page) {
-                       ret = 1;
+               page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+               if (page)
                        break;
-               }
        }
 
-       if (ret)
-               count_vm_event(HTLB_BUDDY_PGALLOC);
-       else
-               count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+       if (!page)
+               return 0;
 
-       return ret;
+       put_page(page); /* free it into the hugepage allocator */
+
+       return 1;
 }
 
 /*
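
alloc_pool_huge_page() walks the allowed nodes round-robin via for_each_node_mask_to_alloc(), stops at the first node that yields a page, and only then releases the prepared page into the pool with put_page(). A simplified userspace model of that cursor-based interleaving, with hypothetical names and a fixed node count (the real macro keeps its cursor in struct hstate):

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical stand-ins: a rotating cursor plus a per-node allocation try. */
static int next_nid_to_alloc;

static bool try_alloc_on_node(int nid)
{
	/* Pretend only node 2 currently has free memory. */
	return nid == 2;
}

/* Ask each allowed node once, starting at the cursor, stop on success. */
static int alloc_interleaved(const bool allowed[NR_NODES])
{
	for (int i = 0; i < NR_NODES; i++) {
		int nid = (next_nid_to_alloc + i) % NR_NODES;

		if (!allowed[nid])
			continue;
		if (try_alloc_on_node(nid)) {
			/* Advance the cursor so the next call starts elsewhere. */
			next_nid_to_alloc = (nid + 1) % NR_NODES;
			return nid;
		}
	}
	return -1;
}

int main(void)
{
	bool allowed[NR_NODES] = { true, true, true, true };
	int nid = alloc_interleaved(allowed);

	printf("allocated on node %d\n", nid);
	return 0;
}
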
@@ -1525,79 +1549,66 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
        return rc;
 }
 
-static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
-               gfp_t gfp_mask, int nid, nodemask_t *nmask)
-{
-       int order = huge_page_order(h);
-
-       gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
-       if (nid == NUMA_NO_NODE)
-               nid = numa_mem_id();
-       return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
-}
-
-static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+/*
+ * Allocates a fresh surplus page from the page allocator.
+ */
+static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nmask)
 {
-       struct page *page;
-       unsigned int r_nid;
+       struct page *page = NULL;
 
        if (hstate_is_gigantic(h))
                return NULL;
 
+       spin_lock(&hugetlb_lock);
+       if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
+               goto out_unlock;
+       spin_unlock(&hugetlb_lock);
+
+       page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+       if (!page)
+               return NULL;
+
+       spin_lock(&hugetlb_lock);
        /*
-        * Assume we will successfully allocate the surplus page to
-        * prevent racing processes from causing the surplus to exceed
-        * overcommit
-        *
-        * This however introduces a different race, where a process B
-        * tries to grow the static hugepage pool while alloc_pages() is
-        * called by process A. B will only examine the per-node
-        * counters in determining if surplus huge pages can be
-        * converted to normal huge pages in adjust_pool_surplus(). A
-        * won't be able to increment the per-node counter, until the
-        * lock is dropped by B, but B doesn't drop hugetlb_lock until
-        * no more huge pages can be converted from surplus to normal
-        * state (and doesn't try to convert again). Thus, we have a
-        * case where a surplus huge page exists, the pool is grown, and
-        * the surplus huge page still exists after, even though it
-        * should just have been converted to a normal huge page. This
-        * does not leak memory, though, as the hugepage will be freed
-        * once it is out of use. It also does not allow the counters to
-        * go out of whack in adjust_pool_surplus() as we don't modify
-        * the node values until we've gotten the hugepage and only the
-        * per-node value is checked there.
+        * We could have raced with the pool size change.
+        * Double check that and simply deallocate the new page
+        * if we would end up overcommitting the surplus. Abuse
+        * the temporary page flag to work around the nasty
+        * free_huge_page code flow.
         */
-       spin_lock(&hugetlb_lock);
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
-               spin_unlock(&hugetlb_lock);
-               return NULL;
+               SetPageHugeTemporary(page);
+               put_page(page);
+               page = NULL;
        } else {
-               h->nr_huge_pages++;
                h->surplus_huge_pages++;
+               h->nr_huge_pages_node[page_to_nid(page)]++;
        }
+
+out_unlock:
        spin_unlock(&hugetlb_lock);
 
-       page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+       return page;
+}
 
-       spin_lock(&hugetlb_lock);
-       if (page) {
-               INIT_LIST_HEAD(&page->lru);
-               r_nid = page_to_nid(page);
-               set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-               set_hugetlb_cgroup(page, NULL);
-               /*
-                * We incremented the global counters already
-                */
-               h->nr_huge_pages_node[r_nid]++;
-               h->surplus_huge_pages_node[r_nid]++;
-               __count_vm_event(HTLB_BUDDY_PGALLOC);
-       } else {
-               h->nr_huge_pages--;
-               h->surplus_huge_pages--;
-               __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
-       }
-       spin_unlock(&hugetlb_lock);
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nmask)
+{
+       struct page *page;
+
+       if (hstate_is_gigantic(h))
+               return NULL;
+
+       page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+       if (!page)
+               return NULL;
+
+       /*
+        * We do not account these pages as surplus because they are only
+        * temporary and will be released properly on the last reference
+        */
+       SetPageHugeTemporary(page);
 
        return page;
 }
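
alloc_surplus_huge_page() now checks the overcommit limit optimistically, drops hugetlb_lock for the expensive allocation, and re-checks under the lock before committing the surplus accounting; if it lost the race it marks the page temporary and drops it via put_page(). A userspace sketch of the same check/allocate/re-check pattern, using a pthread mutex and malloc as stand-ins (hypothetical names, not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical counters mirroring surplus vs. overcommit accounting. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static long surplus_pages;
static long overcommit_limit = 1;

/* Check, allocate outside the lock, then re-check before committing. */
static void *alloc_surplus(void)
{
	void *page;

	pthread_mutex_lock(&pool_lock);
	if (surplus_pages >= overcommit_limit) {
		pthread_mutex_unlock(&pool_lock);
		return NULL;		/* already at the limit, don't bother */
	}
	pthread_mutex_unlock(&pool_lock);

	page = malloc(4096);		/* the expensive step, done unlocked */
	if (!page)
		return NULL;

	pthread_mutex_lock(&pool_lock);
	if (surplus_pages >= overcommit_limit) {
		/* Lost the race: someone else filled the quota meanwhile. */
		pthread_mutex_unlock(&pool_lock);
		free(page);		/* analogous to the temporary-page free */
		return NULL;
	}
	surplus_pages++;		/* commit the accounting under the lock */
	pthread_mutex_unlock(&pool_lock);
	return page;
}

int main(void)
{
	void *p1 = alloc_surplus();
	void *p2 = alloc_surplus();

	printf("first: %s, second: %s\n", p1 ? "ok" : "refused",
	       p2 ? "ok" : "refused");
	free(p1);
	free(p2);
	return 0;
}
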
@@ -1606,7 +1617,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
  * Use the VMA's mpolicy to allocate a huge page from the buddy.
  */
 static
-struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
 {
        struct page *page;
@@ -1616,17 +1627,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
        nodemask_t *nodemask;
 
        nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
-       page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+       page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
        mpol_cond_put(mpol);
 
        return page;
 }
 
-/*
- * This allocation function is useful in the context where vma is irrelevant.
- * E.g. soft-offlining uses this function because it only cares physical
- * address of error page.
- */
+/* page migration callback function */
 struct page *alloc_huge_page_node(struct hstate *h, int nid)
 {
        gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1641,12 +1648,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
        spin_unlock(&hugetlb_lock);
 
        if (!page)
-               page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
+               page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
 
        return page;
 }
 
-
+/* page migration callback function */
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                nodemask_t *nmask)
 {
@@ -1664,9 +1671,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
        }
        spin_unlock(&hugetlb_lock);
 
-       /* No reservations, try to overcommit */
+       return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
+/* mempolicy aware migration callback */
+struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+               unsigned long address)
+{
+       struct mempolicy *mpol;
+       nodemask_t *nodemask;
+       struct page *page;
+       gfp_t gfp_mask;
+       int node;
+
+       gfp_mask = htlb_alloc_mask(h);
+       node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+       page = alloc_huge_page_nodemask(h, node, nodemask);
+       mpol_cond_put(mpol);
 
-       return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+       return page;
 }
 
 /*
@@ -1694,7 +1717,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
        spin_unlock(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
-               page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+               page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
                                NUMA_NO_NODE, NULL);
                if (!page) {
                        alloc_ok = false;
@@ -2031,7 +2054,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
        page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
        if (!page) {
                spin_unlock(&hugetlb_lock);
-               page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
+               page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                if (!page)
                        goto out_uncharge_cgroup;
                if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
@@ -2074,20 +2097,6 @@ out_subpool_put:
        return ERR_PTR(-ENOSPC);
 }
 
-/*
- * alloc_huge_page()'s wrapper which simply returns the page if allocation
- * succeeds, otherwise NULL. This function is called from new_vma_page(),
- * where no ERR_VALUE is expected to be returned.
- */
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
-                               unsigned long addr, int avoid_reserve)
-{
-       struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
-       if (IS_ERR(page))
-               page = NULL;
-       return page;
-}
-
 int alloc_bootmem_huge_page(struct hstate *h)
        __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
 int __alloc_bootmem_huge_page(struct hstate *h)
@@ -2150,6 +2159,8 @@ static void __init gather_bootmem_prealloc(void)
                prep_compound_huge_page(page, h->order);
                WARN_ON(PageReserved(page));
                prep_new_huge_page(h, page, page_to_nid(page));
+               put_page(page); /* free it into the hugepage allocator */
+
                /*
                 * If we had gigantic hugepages allocated at boot time, we need
                 * to restore the 'stolen' pages to totalram_pages in order to
@@ -2169,7 +2180,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
                if (hstate_is_gigantic(h)) {
                        if (!alloc_bootmem_huge_page(h))
                                break;
-               } else if (!alloc_fresh_huge_page(h,
+               } else if (!alloc_pool_huge_page(h,
                                         &node_states[N_MEMORY]))
                        break;
                cond_resched();
@@ -2289,7 +2300,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
         * First take pages out of surplus state.  Then make up the
         * remaining difference by allocating fresh huge pages.
         *
-        * We might race with __alloc_buddy_huge_page() here and be unable
+        * We might race with alloc_surplus_huge_page() here and be unable
         * to convert a surplus huge page to a normal huge page. That is
         * not critical, though, it just means the overall size of the
         * pool might be one hugepage larger than it needs to be, but
@@ -2312,10 +2323,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
                /* yield cpu to avoid soft lockup */
                cond_resched();
 
-               if (hstate_is_gigantic(h))
-                       ret = alloc_fresh_gigantic_page(h, nodes_allowed);
-               else
-                       ret = alloc_fresh_huge_page(h, nodes_allowed);
+               ret = alloc_pool_huge_page(h, nodes_allowed);
                spin_lock(&hugetlb_lock);
                if (!ret)
                        goto out;
@@ -2335,7 +2343,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
         * By placing pages into the surplus state independent of the
         * overcommit value, we are allowing the surplus pool size to
         * exceed overcommit. There are few sane options here. Since
-        * __alloc_buddy_huge_page() is checking the global counter,
+        * alloc_surplus_huge_page() is checking the global counter,
         * though, we'll note that we're not allowed to exceed surplus
         * and won't grow the pool anywhere else. Not until one of the
         * sysctls are changed, or the surplus pages go out of use.
@@ -2975,20 +2983,32 @@ out:
 
 void hugetlb_report_meminfo(struct seq_file *m)
 {
-       struct hstate *h = &default_hstate;
+       struct hstate *h;
+       unsigned long total = 0;
+
        if (!hugepages_supported())
                return;
-       seq_printf(m,
-                       "HugePages_Total:   %5lu\n"
-                       "HugePages_Free:    %5lu\n"
-                       "HugePages_Rsvd:    %5lu\n"
-                       "HugePages_Surp:    %5lu\n"
-                       "Hugepagesize:   %8lu kB\n",
-                       h->nr_huge_pages,
-                       h->free_huge_pages,
-                       h->resv_huge_pages,
-                       h->surplus_huge_pages,
-                       1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+
+       for_each_hstate(h) {
+               unsigned long count = h->nr_huge_pages;
+
+               total += (PAGE_SIZE << huge_page_order(h)) * count;
+
+               if (h == &default_hstate)
+                       seq_printf(m,
+                                  "HugePages_Total:   %5lu\n"
+                                  "HugePages_Free:    %5lu\n"
+                                  "HugePages_Rsvd:    %5lu\n"
+                                  "HugePages_Surp:    %5lu\n"
+                                  "Hugepagesize:   %8lu kB\n",
+                                  count,
+                                  h->free_huge_pages,
+                                  h->resv_huge_pages,
+                                  h->surplus_huge_pages,
+                                  (PAGE_SIZE << huge_page_order(h)) / 1024);
+       }
+
+       seq_printf(m, "Hugetlb:        %8lu kB\n", total / 1024);
 }
 
 int hugetlb_report_node_meminfo(int nid, char *buf)
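
hugetlb_report_meminfo() now loops over every hstate: the legacy HugePages_* lines still describe only the default hstate, while the new "Hugetlb:" line reports the total memory consumed by huge pages of all sizes. A small userspace sketch that reproduces the output format with hypothetical pool sizes (a 2 MB default hstate and a 1 GB hstate assumed):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Hypothetical pool sizes for two hstates. */
struct toy_hstate {
	unsigned int order;		/* huge_page_order() */
	unsigned long nr, free, rsvd, surp;
};

int main(void)
{
	struct toy_hstate hstates[] = {
		{ 9,  16, 10, 2, 0 },	/* 2 MB pages, the default hstate */
		{ 18,  2,  2, 0, 0 },	/* 1 GB pages */
	};
	unsigned long total = 0;

	for (unsigned i = 0; i < sizeof(hstates) / sizeof(hstates[0]); i++) {
		struct toy_hstate *h = &hstates[i];

		total += (PAGE_SIZE << h->order) * h->nr;
		if (i == 0)		/* only the default hstate keeps the old lines */
			printf("HugePages_Total:   %5lu\n"
			       "HugePages_Free:    %5lu\n"
			       "HugePages_Rsvd:    %5lu\n"
			       "HugePages_Surp:    %5lu\n"
			       "Hugepagesize:   %8lu kB\n",
			       h->nr, h->free, h->rsvd, h->surp,
			       (PAGE_SIZE << h->order) / 1024);
	}
	printf("Hugetlb:        %8lu kB\n", total / 1024);
	return 0;
}
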
@@ -4799,3 +4819,36 @@ void putback_active_hugepage(struct page *page)
        spin_unlock(&hugetlb_lock);
        put_page(page);
 }
+
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+{
+       struct hstate *h = page_hstate(oldpage);
+
+       hugetlb_cgroup_migrate(oldpage, newpage);
+       set_page_owner_migrate_reason(newpage, reason);
+
+       /*
+        * Transfer the temporary state of the new huge page. This is
+        * the reverse of the other transitions because the newpage is
+        * going to stick around while the old one will be freed, so it
+        * takes over the temporary status.
+        *
+        * Also note that we have to transfer the per-node surplus state
+        * here as well, otherwise the global surplus count will not match
+        * the per-node counts.
+        */
+       if (PageHugeTemporary(newpage)) {
+               int old_nid = page_to_nid(oldpage);
+               int new_nid = page_to_nid(newpage);
+
+               SetPageHugeTemporary(oldpage);
+               ClearPageHugeTemporary(newpage);
+
+               spin_lock(&hugetlb_lock);
+               if (h->surplus_huge_pages_node[old_nid]) {
+                       h->surplus_huge_pages_node[old_nid]--;
+                       h->surplus_huge_pages_node[new_nid]++;
+               }
+               spin_unlock(&hugetlb_lock);
+       }
+}
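
Because the newly allocated page is the one that survives migration, move_hugetlb_state() hands the temporary status back to the old page (which is about to be freed) and moves any per-node surplus accounting along with the page. A minimal userspace model of that transfer, with hypothetical toy_* names and plain counters instead of struct hstate:

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 2

/* Hypothetical per-node surplus counters and a per-"page" temporary flag. */
static long surplus_on_node[NR_NODES];

struct toy_hpage {
	bool temporary;
	int nid;
};

/*
 * The new page stays, so it gives up the temporary status to the old page,
 * and per-node surplus accounting follows the page to its new node.
 */
static void toy_move_state(struct toy_hpage *oldp, struct toy_hpage *newp)
{
	if (newp->temporary) {
		oldp->temporary = true;
		newp->temporary = false;

		if (surplus_on_node[oldp->nid]) {
			surplus_on_node[oldp->nid]--;
			surplus_on_node[newp->nid]++;
		}
	}
}

int main(void)
{
	struct toy_hpage oldp = { .temporary = false, .nid = 0 };
	struct toy_hpage newp = { .temporary = true,  .nid = 1 };

	surplus_on_node[0] = 1;
	toy_move_state(&oldp, &newp);

	printf("old: temporary=%d, new: temporary=%d, surplus: node0=%ld node1=%ld\n",
	       oldp.temporary, newp.temporary,
	       surplus_on_node[0], surplus_on_node[1]);
	return 0;
}
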