diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9a334f5fb730873190a57648bc0f040f91ac0ed6..7c204e3d132b808364fc1e87e3dace180f9ba173 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_owner.h>
 #include "internal.h"
 
-int hugepages_treat_as_movable;
-
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
@@ -926,7 +925,7 @@ retry_cpuset:
 /* Movability of hugepages depends on migration support. */
 static inline gfp_t htlb_alloc_mask(struct hstate *h)
 {
-       if (hugepages_treat_as_movable || hugepage_migration_supported(h))
+       if (hugepage_migration_supported(h))
                return GFP_HIGHUSER_MOVABLE;
        else
                return GFP_HIGHUSER;
@@ -1108,7 +1107,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
        return zone_spans_pfn(zone, last_pfn);
 }
 
-static struct page *alloc_gigantic_page(int nid, struct hstate *h)
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nodemask)
 {
        unsigned int order = huge_page_order(h);
        unsigned long nr_pages = 1 << order;
@@ -1116,11 +1116,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
        struct zonelist *zonelist;
        struct zone *zone;
        struct zoneref *z;
-       gfp_t gfp_mask;
 
-       gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
        zonelist = node_zonelist(nid, gfp_mask);
-       for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
                spin_lock_irqsave(&zone->lock, flags);
 
                pfn = ALIGN(zone->zone_start_pfn, nr_pages);
@@ -1151,41 +1149,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
 static void prep_compound_gigantic_page(struct page *page, unsigned int order);
 
-static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
-{
-       struct page *page;
-
-       page = alloc_gigantic_page(nid, h);
-       if (page) {
-               prep_compound_gigantic_page(page, huge_page_order(h));
-               prep_new_huge_page(h, page, nid);
-       }
-
-       return page;
-}
-
-static int alloc_fresh_gigantic_page(struct hstate *h,
-                               nodemask_t *nodes_allowed)
-{
-       struct page *page = NULL;
-       int nr_nodes, node;
-
-       for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-               page = alloc_fresh_gigantic_page_node(h, node);
-               if (page)
-                       return 1;
-       }
-
-       return 0;
-}
-
 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
 static inline bool gigantic_page_supported(void) { return false; }
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nodemask) { return NULL; }
 static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
                                                unsigned int order) { }
-static inline int alloc_fresh_gigantic_page(struct hstate *h,
-                                       nodemask_t *nodes_allowed) { return 0; }
 #endif
 
 static void update_and_free_page(struct hstate *h, struct page *page)
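
Note that the new !CONFIG_ARCH_HAS_GIGANTIC_PAGE stub for alloc_gigantic_page() is there so the consolidated alloc_fresh_huge_page() added later in this patch can call it unconditionally and simply see NULL when gigantic pages are compiled out. A minimal userspace sketch of that stub idiom, with hypothetical names (HAVE_GIGANTIC stands in for the config option; this is not the kernel API):

#include <stdio.h>
#include <stdlib.h>

/*
 * Build-time stub idiom: when the "gigantic" path is compiled out, keep a
 * trivial stub so the common allocator below needs no #ifdef at the call
 * site.
 */
#ifdef HAVE_GIGANTIC
static void *alloc_gigantic(size_t size)
{
	return malloc(size);		/* stands in for the real range search */
}
#else
static void *alloc_gigantic(size_t size)
{
	(void)size;
	return NULL;			/* compiled out: always fails */
}
#endif

static void *alloc_fresh(size_t size, int gigantic)
{
	/* One call site, no #ifdef: the stub makes the branch harmless. */
	return gigantic ? alloc_gigantic(size) : malloc(size);
}

int main(void)
{
	void *p = alloc_fresh(4096, 1);

	printf("gigantic attempt: %s\n", p ? "ok" : "not supported");
	free(p);
	return 0;
}
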
@@ -1250,6 +1220,28 @@ static void clear_page_huge_active(struct page *page)
        ClearPagePrivate(&page[1]);
 }
 
+/*
+ * Internal hugetlb-specific page flag. Do not use it outside of the hugetlb
+ * code.
+ */
+static inline bool PageHugeTemporary(struct page *page)
+{
+       if (!PageHuge(page))
+               return false;
+
+       return (unsigned long)page[2].mapping == -1U;
+}
+
+static inline void SetPageHugeTemporary(struct page *page)
+{
+       page[2].mapping = (void *)-1U;
+}
+
+static inline void ClearPageHugeTemporary(struct page *page)
+{
+       page[2].mapping = NULL;
+}
+
 void free_huge_page(struct page *page)
 {
        /*
@@ -1284,7 +1276,11 @@ void free_huge_page(struct page *page)
        if (restore_reserve)
                h->resv_huge_pages++;
 
-       if (h->surplus_huge_pages_node[nid]) {
+       if (PageHugeTemporary(page)) {
+               list_del(&page->lru);
+               ClearPageHugeTemporary(page);
+               update_and_free_page(h, page);
+       } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
                list_del(&page->lru);
                update_and_free_page(h, page);
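
PageHugeTemporary has no dedicated page-flag bit; it is encoded by storing a -1 sentinel in the otherwise unused page[2].mapping field of the compound page, and free_huge_page() now tests it first so that temporary pages go straight back to the page allocator instead of onto the hugetlb free lists. A minimal userspace sketch of the same sentinel-in-an-unused-field encoding, with hypothetical toy_* names (not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for struct page: one otherwise-unused pointer field. */
struct toy_page {
	void *mapping;
};

/* Encode a boolean flag as a sentinel value in the unused field. */
static inline bool toy_is_temporary(struct toy_page *tail)
{
	return (unsigned long)tail->mapping == -1UL;
}

static inline void toy_set_temporary(struct toy_page *tail)
{
	tail->mapping = (void *)-1UL;
}

static inline void toy_clear_temporary(struct toy_page *tail)
{
	tail->mapping = NULL;
}

int main(void)
{
	struct toy_page tail = { NULL };

	toy_set_temporary(&tail);
	/* A "free" path would branch on the flag, as free_huge_page now does. */
	if (toy_is_temporary(&tail)) {
		toy_clear_temporary(&tail);
		printf("temporary page: freed back to the page allocator\n");
	} else {
		printf("pooled page: returned to the hugetlb free lists\n");
	}
	return 0;
}
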
@@ -1306,7 +1302,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
        h->nr_huge_pages++;
        h->nr_huge_pages_node[nid]++;
        spin_unlock(&hugetlb_lock);
-       put_page(page); /* free it into the hugepage allocator */
 }
 
 static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1383,41 +1378,70 @@ pgoff_t __basepage_index(struct page *page)
        return (index << compound_order(page_head)) + compound_idx;
 }
 
-static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+static struct page *alloc_buddy_huge_page(struct hstate *h,
+               gfp_t gfp_mask, int nid, nodemask_t *nmask)
 {
+       int order = huge_page_order(h);
        struct page *page;
 
-       page = __alloc_pages_node(nid,
-               htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-                                               __GFP_RETRY_MAYFAIL|__GFP_NOWARN,
-               huge_page_order(h));
-       if (page) {
-               prep_new_huge_page(h, page, nid);
-       }
+       gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+       if (nid == NUMA_NO_NODE)
+               nid = numa_mem_id();
+       page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+       if (page)
+               __count_vm_event(HTLB_BUDDY_PGALLOC);
+       else
+               __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+       return page;
+}
+
+/*
+ * Common helper to allocate a fresh hugetlb page. All specific allocators
+ * should use this function to get new hugetlb pages
+ */
+static struct page *alloc_fresh_huge_page(struct hstate *h,
+               gfp_t gfp_mask, int nid, nodemask_t *nmask)
+{
+       struct page *page;
+
+       if (hstate_is_gigantic(h))
+               page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+       else
+               page = alloc_buddy_huge_page(h, gfp_mask,
+                               nid, nmask);
+       if (!page)
+               return NULL;
+
+       if (hstate_is_gigantic(h))
+               prep_compound_gigantic_page(page, huge_page_order(h));
+       prep_new_huge_page(h, page, page_to_nid(page));
 
        return page;
 }
 
-static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+/*
+ * Allocate a fresh page to the hugetlb allocator pool, interleaving across
+ * the allowed nodes.
+ */
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
 {
        struct page *page;
        int nr_nodes, node;
-       int ret = 0;
+       gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 
        for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-               page = alloc_fresh_huge_page_node(h, node);
-               if (page) {
-                       ret = 1;
+               page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+               if (page)
                        break;
-               }
        }
 
-       if (ret)
-               count_vm_event(HTLB_BUDDY_PGALLOC);
-       else
-               count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+       if (!page)
+               return 0;
 
-       return ret;
+       put_page(page); /* free it into the hugepage allocator */
+
+       return 1;
 }
 
 /*
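
alloc_pool_huge_page() walks the allowed nodes round-robin via for_each_node_mask_to_alloc(), stops at the first node that yields a page, and only then releases the prepared page into the pool with put_page(). A simplified userspace model of that cursor-based interleaving, with hypothetical names and a fixed node count (the real macro keeps its cursor in struct hstate):

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical stand-ins: a rotating cursor plus a per-node allocation try. */
static int next_nid_to_alloc;

static bool try_alloc_on_node(int nid)
{
	/* Pretend only node 2 currently has free memory. */
	return nid == 2;
}

/* Ask each allowed node once, starting at the cursor, stop on success. */
static int alloc_interleaved(const bool allowed[NR_NODES])
{
	for (int i = 0; i < NR_NODES; i++) {
		int nid = (next_nid_to_alloc + i) % NR_NODES;

		if (!allowed[nid])
			continue;
		if (try_alloc_on_node(nid)) {
			/* Advance the cursor so the next call starts elsewhere. */
			next_nid_to_alloc = (nid + 1) % NR_NODES;
			return nid;
		}
	}
	return -1;
}

int main(void)
{
	bool allowed[NR_NODES] = { true, true, true, true };
	int nid = alloc_interleaved(allowed);

	printf("allocated on node %d\n", nid);
	return 0;
}
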
@@ -1525,79 +1549,66 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
        return rc;
 }
 
-static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
-               gfp_t gfp_mask, int nid, nodemask_t *nmask)
-{
-       int order = huge_page_order(h);
-
-       gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
-       if (nid == NUMA_NO_NODE)
-               nid = numa_mem_id();
-       return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
-}
-
-static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+/*
+ * Allocates a fresh surplus page from the page allocator.
+ */
+static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nmask)
 {
-       struct page *page;
-       unsigned int r_nid;
+       struct page *page = NULL;
 
        if (hstate_is_gigantic(h))
                return NULL;
 
+       spin_lock(&hugetlb_lock);
+       if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
+               goto out_unlock;
+       spin_unlock(&hugetlb_lock);
+
+       page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+       if (!page)
+               return NULL;
+
+       spin_lock(&hugetlb_lock);
        /*
-        * Assume we will successfully allocate the surplus page to
-        * prevent racing processes from causing the surplus to exceed
-        * overcommit
-        *
-        * This however introduces a different race, where a process B
-        * tries to grow the static hugepage pool while alloc_pages() is
-        * called by process A. B will only examine the per-node
-        * counters in determining if surplus huge pages can be
-        * converted to normal huge pages in adjust_pool_surplus(). A
-        * won't be able to increment the per-node counter, until the
-        * lock is dropped by B, but B doesn't drop hugetlb_lock until
-        * no more huge pages can be converted from surplus to normal
-        * state (and doesn't try to convert again). Thus, we have a
-        * case where a surplus huge page exists, the pool is grown, and
-        * the surplus huge page still exists after, even though it
-        * should just have been converted to a normal huge page. This
-        * does not leak memory, though, as the hugepage will be freed
-        * once it is out of use. It also does not allow the counters to
-        * go out of whack in adjust_pool_surplus() as we don't modify
-        * the node values until we've gotten the hugepage and only the
-        * per-node value is checked there.
+        * We could have raced with the pool size change.
+        * Double check that and simply deallocate the new page
+        * if we would end up overcommitting the surplus. Abuse
+        * the temporary page flag to work around the nasty
+        * free_huge_page code flow.
         */
-       spin_lock(&hugetlb_lock);
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
-               spin_unlock(&hugetlb_lock);
-               return NULL;
+               SetPageHugeTemporary(page);
+               put_page(page);
+               page = NULL;
        } else {
-               h->nr_huge_pages++;
                h->surplus_huge_pages++;
+               h->nr_huge_pages_node[page_to_nid(page)]++;
        }
+
+out_unlock:
        spin_unlock(&hugetlb_lock);
 
-       page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+       return page;
+}
 
-       spin_lock(&hugetlb_lock);
-       if (page) {
-               INIT_LIST_HEAD(&page->lru);
-               r_nid = page_to_nid(page);
-               set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-               set_hugetlb_cgroup(page, NULL);
-               /*
-                * We incremented the global counters already
-                */
-               h->nr_huge_pages_node[r_nid]++;
-               h->surplus_huge_pages_node[r_nid]++;
-               __count_vm_event(HTLB_BUDDY_PGALLOC);
-       } else {
-               h->nr_huge_pages--;
-               h->surplus_huge_pages--;
-               __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
-       }
-       spin_unlock(&hugetlb_lock);
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nmask)
+{
+       struct page *page;
+
+       if (hstate_is_gigantic(h))
+               return NULL;
+
+       page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+       if (!page)
+               return NULL;
+
+       /*
+        * We do not account these pages as surplus because they are only
+        * temporary and will be released properly on the last reference
+        */
+       SetPageHugeTemporary(page);
 
        return page;
 }
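
alloc_surplus_huge_page() now checks the overcommit limit optimistically, drops hugetlb_lock for the expensive allocation, and re-checks under the lock before committing the surplus accounting; if it lost the race it marks the page temporary and drops it via put_page(). A userspace sketch of the same check/allocate/re-check pattern, using a pthread mutex and malloc as stand-ins (hypothetical names, not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical counters mirroring surplus vs. overcommit accounting. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static long surplus_pages;
static long overcommit_limit = 1;

/* Check, allocate outside the lock, then re-check before committing. */
static void *alloc_surplus(void)
{
	void *page;

	pthread_mutex_lock(&pool_lock);
	if (surplus_pages >= overcommit_limit) {
		pthread_mutex_unlock(&pool_lock);
		return NULL;		/* already at the limit, don't bother */
	}
	pthread_mutex_unlock(&pool_lock);

	page = malloc(4096);		/* the expensive step, done unlocked */
	if (!page)
		return NULL;

	pthread_mutex_lock(&pool_lock);
	if (surplus_pages >= overcommit_limit) {
		/* Lost the race: someone else filled the quota meanwhile. */
		pthread_mutex_unlock(&pool_lock);
		free(page);		/* analogous to the temporary-page free */
		return NULL;
	}
	surplus_pages++;		/* commit the accounting under the lock */
	pthread_mutex_unlock(&pool_lock);
	return page;
}

int main(void)
{
	void *p1 = alloc_surplus();
	void *p2 = alloc_surplus();

	printf("first: %s, second: %s\n", p1 ? "ok" : "refused",
	       p2 ? "ok" : "refused");
	free(p1);
	free(p2);
	return 0;
}
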
@@ -1606,7 +1617,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
  * Use the VMA's mpolicy to allocate a huge page from the buddy.
  */
 static
-struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
 {
        struct page *page;
@@ -1616,17 +1627,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
        nodemask_t *nodemask;
 
        nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
-       page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+       page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
        mpol_cond_put(mpol);
 
        return page;
 }
 
-/*
- * This allocation function is useful in the context where vma is irrelevant.
- * E.g. soft-offlining uses this function because it only cares physical
- * address of error page.
- */
+/* page migration callback function */
 struct page *alloc_huge_page_node(struct hstate *h, int nid)
 {
        gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1641,12 +1648,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
        spin_unlock(&hugetlb_lock);
 
        if (!page)
-               page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
+               page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
 
        return page;
 }
 
-
+/* page migration callback function */
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                nodemask_t *nmask)
 {
@@ -1664,9 +1671,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
        }
        spin_unlock(&hugetlb_lock);
 
-       /* No reservations, try to overcommit */
+       return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
+/* mempolicy aware migration callback */
+struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+               unsigned long address)
+{
+       struct mempolicy *mpol;
+       nodemask_t *nodemask;
+       struct page *page;
+       gfp_t gfp_mask;
+       int node;
+
+       gfp_mask = htlb_alloc_mask(h);
+       node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+       page = alloc_huge_page_nodemask(h, node, nodemask);
+       mpol_cond_put(mpol);
 
-       return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+       return page;
 }
 
 /*
@@ -1694,7 +1717,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
        spin_unlock(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
-               page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+               page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
                                NUMA_NO_NODE, NULL);
                if (!page) {
                        alloc_ok = false;
@@ -2031,7 +2054,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
        page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
        if (!page) {
                spin_unlock(&hugetlb_lock);
-               page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
+               page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                if (!page)
                        goto out_uncharge_cgroup;
                if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
@@ -2074,20 +2097,6 @@ out_subpool_put:
        return ERR_PTR(-ENOSPC);
 }
 
-/*
- * alloc_huge_page()'s wrapper which simply returns the page if allocation
- * succeeds, otherwise NULL. This function is called from new_vma_page(),
- * where no ERR_VALUE is expected to be returned.
- */
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
-                               unsigned long addr, int avoid_reserve)
-{
-       struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
-       if (IS_ERR(page))
-               page = NULL;
-       return page;
-}
-
 int alloc_bootmem_huge_page(struct hstate *h)
        __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
 int __alloc_bootmem_huge_page(struct hstate *h)
@@ -2150,6 +2159,8 @@ static void __init gather_bootmem_prealloc(void)
                prep_compound_huge_page(page, h->order);
                WARN_ON(PageReserved(page));
                prep_new_huge_page(h, page, page_to_nid(page));
+               put_page(page); /* free it into the hugepage allocator */
+
                /*
                 * If we had gigantic hugepages allocated at boot time, we need
                 * to restore the 'stolen' pages to totalram_pages in order to
@@ -2169,7 +2180,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
                if (hstate_is_gigantic(h)) {
                        if (!alloc_bootmem_huge_page(h))
                                break;
-               } else if (!alloc_fresh_huge_page(h,
+               } else if (!alloc_pool_huge_page(h,
                                         &node_states[N_MEMORY]))
                        break;
                cond_resched();
@@ -2289,7 +2300,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
         * First take pages out of surplus state.  Then make up the
         * remaining difference by allocating fresh huge pages.
         *
-        * We might race with __alloc_buddy_huge_page() here and be unable
+        * We might race with alloc_surplus_huge_page() here and be unable
         * to convert a surplus huge page to a normal huge page. That is
         * not critical, though, it just means the overall size of the
         * pool might be one hugepage larger than it needs to be, but
@@ -2312,10 +2323,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
                /* yield cpu to avoid soft lockup */
                cond_resched();
 
-               if (hstate_is_gigantic(h))
-                       ret = alloc_fresh_gigantic_page(h, nodes_allowed);
-               else
-                       ret = alloc_fresh_huge_page(h, nodes_allowed);
+               ret = alloc_pool_huge_page(h, nodes_allowed);
                spin_lock(&hugetlb_lock);
                if (!ret)
                        goto out;
@@ -2335,7 +2343,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
         * By placing pages into the surplus state independent of the
         * overcommit value, we are allowing the surplus pool size to
         * exceed overcommit. There are few sane options here. Since
-        * __alloc_buddy_huge_page() is checking the global counter,
+        * alloc_surplus_huge_page() is checking the global counter,
         * though, we'll note that we're not allowed to exceed surplus
         * and won't grow the pool anywhere else. Not until one of the
         * sysctls are changed, or the surplus pages go out of use.
@@ -2975,20 +2983,32 @@ out:
 
 void hugetlb_report_meminfo(struct seq_file *m)
 {
-       struct hstate *h = &default_hstate;
+       struct hstate *h;
+       unsigned long total = 0;
+
        if (!hugepages_supported())
                return;
-       seq_printf(m,
-                       "HugePages_Total:   %5lu\n"
-                       "HugePages_Free:    %5lu\n"
-                       "HugePages_Rsvd:    %5lu\n"
-                       "HugePages_Surp:    %5lu\n"
-                       "Hugepagesize:   %8lu kB\n",
-                       h->nr_huge_pages,
-                       h->free_huge_pages,
-                       h->resv_huge_pages,
-                       h->surplus_huge_pages,
-                       1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+
+       for_each_hstate(h) {
+               unsigned long count = h->nr_huge_pages;
+
+               total += (PAGE_SIZE << huge_page_order(h)) * count;
+
+               if (h == &default_hstate)
+                       seq_printf(m,
+                                  "HugePages_Total:   %5lu\n"
+                                  "HugePages_Free:    %5lu\n"
+                                  "HugePages_Rsvd:    %5lu\n"
+                                  "HugePages_Surp:    %5lu\n"
+                                  "Hugepagesize:   %8lu kB\n",
+                                  count,
+                                  h->free_huge_pages,
+                                  h->resv_huge_pages,
+                                  h->surplus_huge_pages,
+                                  (PAGE_SIZE << huge_page_order(h)) / 1024);
+       }
+
+       seq_printf(m, "Hugetlb:        %8lu kB\n", total / 1024);
 }
 
 int hugetlb_report_node_meminfo(int nid, char *buf)
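
hugetlb_report_meminfo() now loops over every hstate: the legacy HugePages_* lines still describe only the default hstate, while the new "Hugetlb:" line reports the total memory consumed by huge pages of all sizes. A small userspace sketch that reproduces the output format with hypothetical pool sizes (a 2 MB default hstate and a 1 GB hstate assumed):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Hypothetical pool sizes for two hstates. */
struct toy_hstate {
	unsigned int order;		/* huge_page_order() */
	unsigned long nr, free, rsvd, surp;
};

int main(void)
{
	struct toy_hstate hstates[] = {
		{ 9,  16, 10, 2, 0 },	/* 2 MB pages, the default hstate */
		{ 18,  2,  2, 0, 0 },	/* 1 GB pages */
	};
	unsigned long total = 0;

	for (unsigned i = 0; i < sizeof(hstates) / sizeof(hstates[0]); i++) {
		struct toy_hstate *h = &hstates[i];

		total += (PAGE_SIZE << h->order) * h->nr;
		if (i == 0)		/* only the default hstate keeps the old lines */
			printf("HugePages_Total:   %5lu\n"
			       "HugePages_Free:    %5lu\n"
			       "HugePages_Rsvd:    %5lu\n"
			       "HugePages_Surp:    %5lu\n"
			       "Hugepagesize:   %8lu kB\n",
			       h->nr, h->free, h->rsvd, h->surp,
			       (PAGE_SIZE << h->order) / 1024);
	}
	printf("Hugetlb:        %8lu kB\n", total / 1024);
	return 0;
}
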
@@ -4799,3 +4819,36 @@ void putback_active_hugepage(struct page *page)
        spin_unlock(&hugetlb_lock);
        put_page(page);
 }
+
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+{
+       struct hstate *h = page_hstate(oldpage);
+
+       hugetlb_cgroup_migrate(oldpage, newpage);
+       set_page_owner_migrate_reason(newpage, reason);
+
+       /*
+        * Transfer the temporary state of the new huge page. This is
+        * the reverse of the other transitions because the newpage is
+        * going to stick around while the old one will be freed, so it
+        * takes over the temporary status.
+        *
+        * Also note that we have to transfer the per-node surplus state
+        * here as well, otherwise the global surplus count will not match
+        * the per-node counts.
+        */
+       if (PageHugeTemporary(newpage)) {
+               int old_nid = page_to_nid(oldpage);
+               int new_nid = page_to_nid(newpage);
+
+               SetPageHugeTemporary(oldpage);
+               ClearPageHugeTemporary(newpage);
+
+               spin_lock(&hugetlb_lock);
+               if (h->surplus_huge_pages_node[old_nid]) {
+                       h->surplus_huge_pages_node[old_nid]--;
+                       h->surplus_huge_pages_node[new_nid]++;
+               }
+               spin_unlock(&hugetlb_lock);
+       }
+}
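
Because the newly allocated page is the one that survives migration, move_hugetlb_state() hands the temporary status back to the old page (which is about to be freed) and moves any per-node surplus accounting along with the page. A minimal userspace model of that transfer, with hypothetical toy_* names and plain counters instead of struct hstate:

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 2

/* Hypothetical per-node surplus counters and a per-"page" temporary flag. */
static long surplus_on_node[NR_NODES];

struct toy_hpage {
	bool temporary;
	int nid;
};

/*
 * The new page stays, so it gives up the temporary status to the old page,
 * and per-node surplus accounting follows the page to its new node.
 */
static void toy_move_state(struct toy_hpage *oldp, struct toy_hpage *newp)
{
	if (newp->temporary) {
		oldp->temporary = true;
		newp->temporary = false;

		if (surplus_on_node[oldp->nid]) {
			surplus_on_node[oldp->nid]--;
			surplus_on_node[newp->nid]++;
		}
	}
}

int main(void)
{
	struct toy_hpage oldp = { .temporary = false, .nid = 0 };
	struct toy_hpage newp = { .temporary = true,  .nid = 1 };

	surplus_on_node[0] = 1;
	toy_move_state(&oldp, &newp);

	printf("old: temporary=%d, new: temporary=%d, surplus: node0=%ld node1=%ld\n",
	       oldp.temporary, newp.temporary,
	       surplus_on_node[0], surplus_on_node[1]);
	return 0;
}
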