Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1741dd23e7c1f7d4878cf38ff9ba021928f99267..905db9d7962fcb1776c0e7ffb1618fb6e4084a75 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -46,7 +46,6 @@
 #include <linux/stop_machine.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
-#include <xen/xen.h>
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
@@ -205,17 +204,18 @@ static void __free_pages_ok(struct page *page, unsigned int order);
  * TBD: should special case ZONE_DMA32 machines here - in those we normally
  * don't need any ZONE_NORMAL reservation
  */
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
 #ifdef CONFIG_ZONE_DMA
-        256,
+       [ZONE_DMA] = 256,
 #endif
 #ifdef CONFIG_ZONE_DMA32
-        256,
+       [ZONE_DMA32] = 256,
 #endif
+       [ZONE_NORMAL] = 32,
 #ifdef CONFIG_HIGHMEM
-        32,
+       [ZONE_HIGHMEM] = 0,
 #endif
-        32,
+       [ZONE_MOVABLE] = 0,
 };
 
 EXPORT_SYMBOL(totalram_pages);
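
The hunk above replaces positional initializers with C99 designated initializers, so any zone index that is not listed explicitly (ZONE_HIGHMEM and ZONE_MOVABLE here, both set to 0) no longer depends on the order of the #ifdef blocks. A minimal userspace sketch of that language feature; the enum below is a stand-in for illustration, not the kernel's zone enum:

#include <stdio.h>

enum zone { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

/* Elements without a designator are implicitly zero-initialized. */
static int reserve_ratio[MAX_NR_ZONES] = {
        [ZONE_DMA]    = 256,
        [ZONE_DMA32]  = 256,
        [ZONE_NORMAL] = 32,
};

int main(void)
{
        for (int i = 0; i < MAX_NR_ZONES; i++)
                printf("zone %d ratio %d\n", i, reserve_ratio[i]);
        return 0;       /* ZONE_MOVABLE prints 0 */
}
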
@@ -265,17 +265,19 @@ int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
 int watermark_scale_factor = 10;
 
-static unsigned long __meminitdata nr_kernel_pages;
-static unsigned long __meminitdata nr_all_pages;
-static unsigned long __meminitdata dma_reserve;
+static unsigned long nr_kernel_pages __meminitdata;
+static unsigned long nr_all_pages __meminitdata;
+static unsigned long dma_reserve __meminitdata;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-static unsigned long __initdata required_kernelcore;
-static unsigned long __initdata required_movablecore;
-static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
-static bool mirrored_kernelcore;
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long required_kernelcore __initdata;
+static unsigned long required_kernelcore_percent __initdata;
+static unsigned long required_movablecore __initdata;
+static unsigned long required_movablecore_percent __initdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
+static bool mirrored_kernelcore __meminitdata;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -292,40 +294,6 @@ EXPORT_SYMBOL(nr_online_nodes);
 int page_group_by_mobility_disabled __read_mostly;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-
-/*
- * Determine how many pages need to be initialized during early boot
- * (non-deferred initialization).
- * The value of first_deferred_pfn will be set later, once non-deferred pages
- * are initialized, but for now set it ULONG_MAX.
- */
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
-       phys_addr_t start_addr, end_addr;
-       unsigned long max_pgcnt;
-       unsigned long reserved;
-
-       /*
-        * Initialise at least 2G of a node but also take into account that
-        * two large system hashes that can take up 1GB for 0.25TB/node.
-        */
-       max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
-                       (pgdat->node_spanned_pages >> 8));
-
-       /*
-        * Compensate the all the memblock reservations (e.g. crash kernel)
-        * from the initial estimation to make sure we will initialize enough
-        * memory to boot.
-        */
-       start_addr = PFN_PHYS(pgdat->node_start_pfn);
-       end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
-       reserved = memblock_reserved_memory_within(start_addr, end_addr);
-       max_pgcnt += PHYS_PFN(reserved);
-
-       pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
-       pgdat->first_deferred_pfn = ULONG_MAX;
-}
-
 /* Returns true if the struct page for the pfn is uninitialised */
 static inline bool __meminit early_page_uninitialised(unsigned long pfn)
 {
@@ -348,9 +316,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
        /* Always populate low zones for address-constrained allocations */
        if (zone_end < pgdat_end_pfn(pgdat))
                return true;
-       /* Xen PV domains need page structures early */
-       if (xen_pv_domain())
-               return true;
        (*nr_initialised)++;
        if ((*nr_initialised > pgdat->static_init_pgcnt) &&
            (pfn & (PAGES_PER_SECTION - 1)) == 0) {
@@ -361,10 +326,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
        return true;
 }
 #else
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
-}
-
 static inline bool early_page_uninitialised(unsigned long pfn)
 {
        return false;
@@ -1099,6 +1060,15 @@ static bool bulkfree_pcp_prepare(struct page *page)
 }
 #endif /* CONFIG_DEBUG_VM */
 
+static inline void prefetch_buddy(struct page *page)
+{
+       unsigned long pfn = page_to_pfn(page);
+       unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
+       struct page *buddy = page + (buddy_pfn - pfn);
+
+       prefetch(buddy);
+}
+
 /*
  * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
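
prefetch_buddy() leans on the buddy allocator's pfn arithmetic: the order-0 buddy of a page is found by flipping one bit of its pfn, which is what __find_buddy_pfn() computes. A standalone sketch of that XOR calculation, assuming nothing beyond standard C:

#include <stdio.h>

/* Buddy of @pfn at @order: flip the bit that selects which half of the pair. */
static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
        return pfn ^ (1UL << order);
}

int main(void)
{
        /* pfn 10 and pfn 11 are order-0 buddies; block 8..11 pairs with 12..15 at order 2 */
        printf("buddy of 10 at order 0: %lu\n", find_buddy_pfn(10, 0)); /* 11 */
        printf("buddy of 8  at order 2: %lu\n", find_buddy_pfn(8, 2));  /* 12 */
        return 0;
}
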
@@ -1115,13 +1085,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
        int migratetype = 0;
        int batch_free = 0;
+       int prefetch_nr = 0;
        bool isolated_pageblocks;
-
-       spin_lock(&zone->lock);
-       isolated_pageblocks = has_isolate_pageblock(zone);
+       struct page *page, *tmp;
+       LIST_HEAD(head);
 
        while (count) {
-               struct page *page;
                struct list_head *list;
 
                /*
@@ -1143,26 +1112,48 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        batch_free = count;
 
                do {
-                       int mt; /* migratetype of the to-be-freed page */
-
                        page = list_last_entry(list, struct page, lru);
-                       /* must delete as __free_one_page list manipulates */
+                       /* must delete to avoid corrupting pcp list */
                        list_del(&page->lru);
-
-                       mt = get_pcppage_migratetype(page);
-                       /* MIGRATE_ISOLATE page should not go to pcplists */
-                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
-                       /* Pageblock could have been isolated meanwhile */
-                       if (unlikely(isolated_pageblocks))
-                               mt = get_pageblock_migratetype(page);
+                       pcp->count--;
 
                        if (bulkfree_pcp_prepare(page))
                                continue;
 
-                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
-                       trace_mm_page_pcpu_drain(page, 0, mt);
+                       list_add_tail(&page->lru, &head);
+
+                       /*
+                        * We are going to put the page back to the global
+                        * pool, prefetch its buddy to speed up later access
+                        * under zone->lock. It is believed the overhead of
+                        * an additional test and calculating buddy_pfn here
+                        * can be offset by reduced memory latency later. To
+                        * avoid excessive prefetching due to large count, only
+                        * prefetch buddy for the first pcp->batch nr of pages.
+                        */
+                       if (prefetch_nr++ < pcp->batch)
+                               prefetch_buddy(page);
                } while (--count && --batch_free && !list_empty(list));
        }
+
+       spin_lock(&zone->lock);
+       isolated_pageblocks = has_isolate_pageblock(zone);
+
+       /*
+        * Use safe version since after __free_one_page(),
+        * page->lru.next will not point to original list.
+        */
+       list_for_each_entry_safe(page, tmp, &head, lru) {
+               int mt = get_pcppage_migratetype(page);
+               /* MIGRATE_ISOLATE page should not go to pcplists */
+               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+               /* Pageblock could have been isolated meanwhile */
+               if (unlikely(isolated_pageblocks))
+                       mt = get_pageblock_migratetype(page);
+
+               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+               trace_mm_page_pcpu_drain(page, 0, mt);
+       }
        spin_unlock(&zone->lock);
 }
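
The restructuring above drains the pcp lists in two phases: pages are first detached onto a private list while no spinlock is held (the caller has already disabled interrupts), and only the final merge into the buddy free lists runs under zone->lock. A rough userspace analogue of this "collect locally, lock once" pattern; the list and pool names are illustrative, not kernel APIs:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int val; };

static struct node *global_pool;                  /* analogue of the zone free lists */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

/* Drain up to @count nodes from @pcp: detach them without the pool lock,
 * then take the lock once and splice the whole batch in. */
static void drain_batch(struct node **pcp, int count)
{
        struct node *head = NULL;

        while (*pcp && count--) {                 /* @pcp is CPU-local, no lock needed */
                struct node *n = *pcp;
                *pcp = n->next;
                n->next = head;
                head = n;
        }

        pthread_mutex_lock(&pool_lock);           /* one short critical section */
        while (head) {
                struct node *n = head;
                head = n->next;
                n->next = global_pool;
                global_pool = n;
        }
        pthread_mutex_unlock(&pool_lock);
}

int main(void)
{
        struct node *pcp = NULL;

        for (int i = 0; i < 8; i++) {
                struct node *n = malloc(sizeof(*n));
                n->val = i;
                n->next = pcp;
                pcp = n;
        }
        drain_batch(&pcp, 8);
        for (struct node *n = global_pool; n; n = n->next)
                printf("%d ", n->val);
        printf("\n");
        return 0;
}
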
 
@@ -1181,10 +1172,9 @@ static void free_one_page(struct zone *zone,
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
-                               unsigned long zone, int nid, bool zero)
+                               unsigned long zone, int nid)
 {
-       if (zero)
-               mm_zero_struct_page(page);
+       mm_zero_struct_page(page);
        set_page_links(page, zone, nid, pfn);
        init_page_count(page);
        page_mapcount_reset(page);
@@ -1198,12 +1188,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 #endif
 }
 
-static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
-                                       int nid, bool zero)
-{
-       return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
-}
-
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static void __meminit init_reserved_page(unsigned long pfn)
 {
@@ -1222,7 +1206,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
                if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
                        break;
        }
-       __init_single_pfn(pfn, zid, nid, true);
+       __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
 }
 #else
 static inline void init_reserved_page(unsigned long pfn)
@@ -1506,7 +1490,7 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
                } else if (!(pfn & nr_pgmask)) {
                        deferred_free_range(pfn - nr_free, nr_free);
                        nr_free = 1;
-                       cond_resched();
+                       touch_nmi_watchdog();
                } else {
                        nr_free++;
                }
@@ -1535,11 +1519,11 @@ static unsigned long  __init deferred_init_pages(int nid, int zid,
                        continue;
                } else if (!page || !(pfn & nr_pgmask)) {
                        page = pfn_to_page(pfn);
-                       cond_resched();
+                       touch_nmi_watchdog();
                } else {
                        page++;
                }
-               __init_single_page(page, pfn, zid, nid, true);
+               __init_single_page(page, pfn, zid, nid);
                nr_pages++;
        }
        return (nr_pages);
@@ -1552,23 +1536,25 @@ static int __init deferred_init_memmap(void *data)
        int nid = pgdat->node_id;
        unsigned long start = jiffies;
        unsigned long nr_pages = 0;
-       unsigned long spfn, epfn;
+       unsigned long spfn, epfn, first_init_pfn, flags;
        phys_addr_t spa, epa;
        int zid;
        struct zone *zone;
-       unsigned long first_init_pfn = pgdat->first_deferred_pfn;
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
        u64 i;
 
+       /* Bind memory initialisation thread to a local node if possible */
+       if (!cpumask_empty(cpumask))
+               set_cpus_allowed_ptr(current, cpumask);
+
+       pgdat_resize_lock(pgdat, &flags);
+       first_init_pfn = pgdat->first_deferred_pfn;
        if (first_init_pfn == ULONG_MAX) {
+               pgdat_resize_unlock(pgdat, &flags);
                pgdat_init_report_one_done();
                return 0;
        }
 
-       /* Bind memory initialisation thread to a local node if possible */
-       if (!cpumask_empty(cpumask))
-               set_cpus_allowed_ptr(current, cpumask);
-
        /* Sanity check boundaries */
        BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
        BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
@@ -1598,6 +1584,7 @@ static int __init deferred_init_memmap(void *data)
                epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
                deferred_free_pages(nid, zid, spfn, epfn);
        }
+       pgdat_resize_unlock(pgdat, &flags);
 
        /* Sanity check that the next zone really is unpopulated */
        WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
@@ -1608,6 +1595,117 @@ static int __init deferred_init_memmap(void *data)
        pgdat_init_report_one_done();
        return 0;
 }
+
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * Note: We use noinline because this function is needed only during boot, and
+ * it is called from a __ref function _deferred_grow_zone. This way we are
+ * making sure that it is not inlined into permanent text section.
+ */
+static noinline bool __init
+deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+       int zid = zone_idx(zone);
+       int nid = zone_to_nid(zone);
+       pg_data_t *pgdat = NODE_DATA(nid);
+       unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+       unsigned long nr_pages = 0;
+       unsigned long first_init_pfn, spfn, epfn, t, flags;
+       unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+       phys_addr_t spa, epa;
+       u64 i;
+
+       /* Only the last zone may have deferred pages */
+       if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+               return false;
+
+       pgdat_resize_lock(pgdat, &flags);
+
+       /*
+        * If deferred pages have been initialized while we were waiting for
+        * the lock, return true, as the zone was grown.  The caller will retry
+        * this zone.  We won't return to this function since the caller also
+        * has this static branch.
+        */
+       if (!static_branch_unlikely(&deferred_pages)) {
+               pgdat_resize_unlock(pgdat, &flags);
+               return true;
+       }
+
+       /*
+        * If someone grew this zone while we were waiting for spinlock, return
+        * true, as there might be enough pages already.
+        */
+       if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+               pgdat_resize_unlock(pgdat, &flags);
+               return true;
+       }
+
+       first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
+
+       if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+               pgdat_resize_unlock(pgdat, &flags);
+               return false;
+       }
+
+       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+               spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+               epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+
+               while (spfn < epfn && nr_pages < nr_pages_needed) {
+                       t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
+                       first_deferred_pfn = min(t, epfn);
+                       nr_pages += deferred_init_pages(nid, zid, spfn,
+                                                       first_deferred_pfn);
+                       spfn = first_deferred_pfn;
+               }
+
+               if (nr_pages >= nr_pages_needed)
+                       break;
+       }
+
+       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+               spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+               epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
+               deferred_free_pages(nid, zid, spfn, epfn);
+
+               if (first_deferred_pfn == epfn)
+                       break;
+       }
+       pgdat->first_deferred_pfn = first_deferred_pfn;
+       pgdat_resize_unlock(pgdat, &flags);
+
+       return nr_pages > 0;
+}
+
+/*
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call. This is why we have refdata wrapper to avoid warning,
+ * and to ensure that the function body gets unloaded.
+ */
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+       return deferred_grow_zone(zone, order);
+}
+
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 void __init page_alloc_init_late(void)
@@ -1626,6 +1724,12 @@ void __init page_alloc_init_late(void)
        /* Block until all are initialised */
        wait_for_completion(&pgdat_init_all_done_comp);
 
+       /*
+        * We initialized the rest of the deferred pages.  Permanently disable
+        * on-demand struct page initialization.
+        */
+       static_branch_disable(&deferred_pages);
+
        /* Reinit limits that are based on free pages after the kernel is up */
        files_maxfiles_init();
 #endif
@@ -1639,16 +1743,38 @@ void __init page_alloc_init_late(void)
 }
 
 #ifdef CONFIG_CMA
+static void __init adjust_present_page_count(struct page *page, long count)
+{
+       struct zone *zone = page_zone(page);
+
+       /* We don't need to hold a lock since this is the boot-up process */
+       zone->present_pages += count;
+}
+
 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
 void __init init_cma_reserved_pageblock(struct page *page)
 {
        unsigned i = pageblock_nr_pages;
+       unsigned long pfn = page_to_pfn(page);
        struct page *p = page;
+       int nid = page_to_nid(page);
+
+       /*
+        * ZONE_MOVABLE steals present pages from other zones by changing
+        * the page links, which changes what page_zone() returns. Adjust
+        * the previous zone's present page count before that happens.
+        */
+       adjust_present_page_count(page, -pageblock_nr_pages);
 
        do {
                __ClearPageReserved(p);
                set_page_count(p, 0);
-       } while (++p, --i);
+
+               /* Steal pages from other zones */
+               set_page_links(p, ZONE_MOVABLE, nid, pfn);
+       } while (++p, ++pfn, --i);
+
+       adjust_present_page_count(page, pageblock_nr_pages);
 
        set_pageblock_migratetype(page, MIGRATE_CMA);
 
@@ -2418,10 +2544,8 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
        local_irq_save(flags);
        batch = READ_ONCE(pcp->batch);
        to_drain = min(pcp->count, batch);
-       if (to_drain > 0) {
+       if (to_drain > 0)
                free_pcppages_bulk(zone, to_drain, pcp);
-               pcp->count -= to_drain;
-       }
        local_irq_restore(flags);
 }
 #endif
@@ -2443,10 +2567,8 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
        pset = per_cpu_ptr(zone->pageset, cpu);
 
        pcp = &pset->pcp;
-       if (pcp->count) {
+       if (pcp->count)
                free_pcppages_bulk(zone, pcp->count, pcp);
-               pcp->count = 0;
-       }
        local_irq_restore(flags);
 }
 
@@ -2670,7 +2792,6 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
        if (pcp->count >= pcp->high) {
                unsigned long batch = READ_ONCE(pcp->batch);
                free_pcppages_bulk(zone, batch, pcp);
-               pcp->count -= batch;
        }
 }
 
@@ -2768,7 +2889,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
                 * exists.
                 */
                watermark = min_wmark_pages(zone) + (1UL << order);
-               if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+               if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
                        return 0;
 
                __mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -3044,12 +3165,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
        }
 
 
-#ifdef CONFIG_CMA
-       /* If allocation can't use CMA areas don't use free CMA pages */
-       if (!(alloc_flags & ALLOC_CMA))
-               free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
-
        /*
         * Check watermarks for an order-0 allocation request. If these
         * are not met, then a high-order request also cannot go ahead
@@ -3076,10 +3191,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                }
 
 #ifdef CONFIG_CMA
-               if ((alloc_flags & ALLOC_CMA) &&
-                   !list_empty(&area->free_list[MIGRATE_CMA])) {
+               if (!list_empty(&area->free_list[MIGRATE_CMA]))
                        return true;
-               }
 #endif
                if (alloc_harder &&
                        !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
@@ -3099,13 +3212,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
                unsigned long mark, int classzone_idx, unsigned int alloc_flags)
 {
        long free_pages = zone_page_state(z, NR_FREE_PAGES);
-       long cma_pages = 0;
-
-#ifdef CONFIG_CMA
-       /* If allocation can't use CMA areas don't use free CMA pages */
-       if (!(alloc_flags & ALLOC_CMA))
-               cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
 
        /*
         * Fast check for order-0 only. If this fails then the reserves
@@ -3114,7 +3220,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
         * the caller is !atomic then it'll uselessly search the free
         * list. That corner case is then slower but it is harmless.
         */
-       if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+       if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx])
                return true;
 
        return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
@@ -3205,6 +3311,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                                       ac_classzone_idx(ac), alloc_flags)) {
                        int ret;
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+                       /*
+                        * Watermark failed for this zone, but see if we can
+                        * grow this zone if it contains deferred pages.
+                        */
+                       if (static_branch_unlikely(&deferred_pages)) {
+                               if (_deferred_grow_zone(zone, order))
+                                       goto try_this_zone;
+                       }
+#endif
                        /* Checked here to keep the fast path fast */
                        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
@@ -3246,6 +3362,14 @@ try_this_zone:
                                reserve_highatomic_pageblock(page, zone, order);
 
                        return page;
+               } else {
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+                       /* Try again if zone has deferred pages */
+                       if (static_branch_unlikely(&deferred_pages)) {
+                               if (_deferred_grow_zone(zone, order))
+                                       goto try_this_zone;
+                       }
+#endif
                }
        }
 
@@ -3685,16 +3809,18 @@ retry:
        return page;
 }
 
-static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
+static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
+                            const struct alloc_context *ac)
 {
        struct zoneref *z;
        struct zone *zone;
        pg_data_t *last_pgdat = NULL;
+       enum zone_type high_zoneidx = ac->high_zoneidx;
 
-       for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
-                                       ac->high_zoneidx, ac->nodemask) {
+       for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
+                                       ac->nodemask) {
                if (last_pgdat != zone->zone_pgdat)
-                       wakeup_kswapd(zone, order, ac->high_zoneidx);
+                       wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
                last_pgdat = zone->zone_pgdat;
        }
 }
@@ -3730,10 +3856,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
        } else if (unlikely(rt_task(current)) && !in_interrupt())
                alloc_flags |= ALLOC_HARDER;
 
-#ifdef CONFIG_CMA
-       if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
-               alloc_flags |= ALLOC_CMA;
-#endif
        return alloc_flags;
 }
 
@@ -3973,7 +4095,7 @@ retry_cpuset:
                goto nopage;
 
        if (gfp_mask & __GFP_KSWAPD_RECLAIM)
-               wake_all_kswapds(order, ac);
+               wake_all_kswapds(order, gfp_mask, ac);
 
        /*
         * The adjusted alloc_flags might result in immediate success, so try
@@ -4031,7 +4153,7 @@ retry_cpuset:
 retry:
        /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
        if (gfp_mask & __GFP_KSWAPD_RECLAIM)
-               wake_all_kswapds(order, ac);
+               wake_all_kswapds(order, gfp_mask, ac);
 
        reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
        if (reserve_flags)
@@ -4200,9 +4322,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
        if (should_fail_alloc_page(gfp_mask, order))
                return false;
 
-       if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
-               *alloc_flags |= ALLOC_CMA;
-
        return true;
 }
 
@@ -4612,6 +4731,13 @@ long si_mem_available(void)
                     min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
                         wmark_low);
 
+       /*
+        * Part of the kernel memory, which can be released under memory
+        * pressure.
+        */
+       available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
+               PAGE_SHIFT;
+
        if (available < 0)
                available = 0;
        return available;
@@ -5334,6 +5460,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
        pg_data_t *pgdat = NODE_DATA(nid);
        unsigned long pfn;
        unsigned long nr_initialised = 0;
+       struct page *page;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
        struct memblock_region *r = NULL, *tmp;
 #endif
@@ -5386,6 +5513,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 #endif
 
 not_early:
+               page = pfn_to_page(pfn);
+               __init_single_page(page, pfn, zone, nid);
+               if (context == MEMMAP_HOTPLUG)
+                       SetPageReserved(page);
+
                /*
                 * Mark the block movable so that blocks are reserved for
                 * movable at startup. This will force kernel allocations
@@ -5402,15 +5534,8 @@ not_early:
                 * because this is done early in sparse_add_one_section
                 */
                if (!(pfn & (pageblock_nr_pages - 1))) {
-                       struct page *page = pfn_to_page(pfn);
-
-                       __init_single_page(page, pfn, zone, nid,
-                                       context != MEMMAP_HOTPLUG);
                        set_pageblock_migratetype(page, MIGRATE_MOVABLE);
                        cond_resched();
-               } else {
-                       __init_single_pfn(pfn, zone, nid,
-                                       context != MEMMAP_HOTPLUG);
                }
        }
 }
@@ -6079,6 +6204,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 {
        enum zone_type j;
        int nid = pgdat->node_id;
+       unsigned long node_end_pfn = 0;
 
        pgdat_resize_init(pgdat);
 #ifdef CONFIG_NUMA_BALANCING
@@ -6106,9 +6232,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize, freesize, memmap_pages;
                unsigned long zone_start_pfn = zone->zone_start_pfn;
+               unsigned long movable_size = 0;
 
                size = zone->spanned_pages;
                realsize = freesize = zone->present_pages;
+               if (zone_end_pfn(zone) > node_end_pfn)
+                       node_end_pfn = zone_end_pfn(zone);
+
 
                /*
                 * Adjust freesize so that it accounts for how much memory
@@ -6157,12 +6287,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
                zone_seqlock_init(zone);
                zone_pcp_init(zone);
 
-               if (!size)
+               /*
+                * The size of the CMA area is not known yet, so prepare the
+                * usemap memory for the maximum possible span.
+                */
+               if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE &&
+                       pgdat->node_spanned_pages) {
+                       movable_size = node_end_pfn - pgdat->node_start_pfn;
+               }
+
+               if (!size && !movable_size)
                        continue;
 
                set_pageblock_order();
-               setup_usemap(pgdat, zone, zone_start_pfn, size);
-               init_currently_empty_zone(zone, zone_start_pfn, size);
+               if (movable_size) {
+                       zone->zone_start_pfn = pgdat->node_start_pfn;
+                       zone->spanned_pages = movable_size;
+                       setup_usemap(pgdat, zone,
+                               pgdat->node_start_pfn, movable_size);
+                       init_currently_empty_zone(zone,
+                               pgdat->node_start_pfn, movable_size);
+               } else {
+                       setup_usemap(pgdat, zone, zone_start_pfn, size);
+                       init_currently_empty_zone(zone, zone_start_pfn, size);
+               }
                memmap_init(size, nid, j, zone_start_pfn);
        }
 }
@@ -6192,10 +6340,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
                end = pgdat_end_pfn(pgdat);
                end = ALIGN(end, MAX_ORDER_NR_PAGES);
                size =  (end - start) * sizeof(struct page);
-               map = alloc_remap(pgdat->node_id, size);
-               if (!map)
-                       map = memblock_virt_alloc_node_nopanic(size,
-                                                              pgdat->node_id);
+               map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
                pgdat->node_mem_map = map + offset;
        }
        pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
@@ -6244,7 +6389,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 
        alloc_node_mem_map(pgdat);
 
-       reset_deferred_meminit(pgdat);
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+       /*
+        * We start with only one section of pages; more pages are added as
+        * needed until the rest of the deferred pages are initialized.
+        */
+       pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+                                        pgdat->node_spanned_pages);
+       pgdat->first_deferred_pfn = ULONG_MAX;
+#endif
        free_area_init_core(pgdat);
 }
 
@@ -6474,7 +6627,18 @@ static void __init find_zone_movable_pfns_for_nodes(void)
        }
 
        /*
-        * If movablecore=nn[KMG] was specified, calculate what size of
+        * If kernelcore=nn% or movablecore=nn% was specified, calculate the
+        * amount of necessary memory.
+        */
+       if (required_kernelcore_percent)
+               required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
+                                      10000UL;
+       if (required_movablecore_percent)
+               required_movablecore = (totalpages * 100 * required_movablecore_percent) /
+                                       10000UL;
+
+       /*
+        * If movablecore= was specified, calculate what size of
         * kernelcore that corresponds so that memory usable for
         * any allocation type is evenly spread. If both kernelcore
         * and movablecore are specified, then the value of kernelcore
@@ -6714,18 +6878,30 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
        zero_resv_unavail();
 }
 
-static int __init cmdline_parse_core(char *p, unsigned long *core)
+static int __init cmdline_parse_core(char *p, unsigned long *core,
+                                    unsigned long *percent)
 {
        unsigned long long coremem;
+       char *endptr;
+
        if (!p)
                return -EINVAL;
 
-       coremem = memparse(p, &p);
-       *core = coremem >> PAGE_SHIFT;
+       /* Value may be a percentage of total memory, otherwise bytes */
+       coremem = simple_strtoull(p, &endptr, 0);
+       if (*endptr == '%') {
+               /* Paranoid check for percent values greater than 100 */
+               WARN_ON(coremem > 100);
 
-       /* Paranoid check that UL is enough for the coremem value */
-       WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+               *percent = coremem;
+       } else {
+               coremem = memparse(p, &p);
+               /* Paranoid check that UL is enough for the coremem value */
+               WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
 
+               *core = coremem >> PAGE_SHIFT;
+               *percent = 0UL;
+       }
        return 0;
 }
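
cmdline_parse_core() now accepts either a percentage ending in '%' or an absolute size that memparse() scales by an optional K/M/G suffix. A userspace approximation of that parsing, with strtoull and a hand-rolled suffix switch standing in for the kernel's simple_strtoull()/memparse():

#include <stdio.h>
#include <stdlib.h>

/* Parse "nn%" into *percent, otherwise a size with optional K/M/G suffix into *bytes. */
static int parse_core(const char *p, unsigned long long *bytes, unsigned long *percent)
{
        char *end;
        unsigned long long val = strtoull(p, &end, 0);

        if (*end == '%') {
                if (val > 100)
                        return -1;
                *percent = (unsigned long)val;
                *bytes = 0;
        } else {
                switch (*end) {
                case 'G': case 'g': val <<= 10;  /* fall through */
                case 'M': case 'm': val <<= 10;  /* fall through */
                case 'K': case 'k': val <<= 10; break;
                case '\0': break;
                default: return -1;
                }
                *bytes = val;
                *percent = 0;
        }
        return 0;
}

int main(void)
{
        unsigned long long bytes;
        unsigned long percent;

        parse_core("512M", &bytes, &percent);
        printf("512M -> %llu bytes, %lu%%\n", bytes, percent);
        parse_core("30%", &bytes, &percent);
        printf("30%% -> %llu bytes, %lu%%\n", bytes, percent);
        return 0;
}
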
 
@@ -6741,7 +6917,8 @@ static int __init cmdline_parse_kernelcore(char *p)
                return 0;
        }
 
-       return cmdline_parse_core(p, &required_kernelcore);
+       return cmdline_parse_core(p, &required_kernelcore,
+                                 &required_kernelcore_percent);
 }
 
 /*
@@ -6750,7 +6927,8 @@ static int __init cmdline_parse_kernelcore(char *p)
  */
 static int __init cmdline_parse_movablecore(char *p)
 {
-       return cmdline_parse_core(p, &required_movablecore);
+       return cmdline_parse_core(p, &required_movablecore,
+                                 &required_movablecore_percent);
 }
 
 early_param("kernelcore", cmdline_parse_kernelcore);
@@ -6974,13 +7152,15 @@ static void setup_per_zone_lowmem_reserve(void)
                                struct zone *lower_zone;
 
                                idx--;
-
-                               if (sysctl_lowmem_reserve_ratio[idx] < 1)
-                                       sysctl_lowmem_reserve_ratio[idx] = 1;
-
                                lower_zone = pgdat->node_zones + idx;
-                               lower_zone->lowmem_reserve[j] = managed_pages /
-                                       sysctl_lowmem_reserve_ratio[idx];
+
+                               if (sysctl_lowmem_reserve_ratio[idx] < 1) {
+                                       sysctl_lowmem_reserve_ratio[idx] = 0;
+                                       lower_zone->lowmem_reserve[j] = 0;
+                               } else {
+                                       lower_zone->lowmem_reserve[j] =
+                                               managed_pages / sysctl_lowmem_reserve_ratio[idx];
+                               }
                                managed_pages += lower_zone->managed_pages;
                        }
                }
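
With the change above, a ratio below 1 disables the protection entirely (the reserve becomes 0) instead of being clamped to 1; otherwise each lower zone reserves managed_pages / ratio pages against allocations that could have fallen back from a higher zone. A toy calculation under assumed values (4 KiB pages, the default DMA ratio of 256, 4 GiB of higher-zone memory):

#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
        unsigned long long managed_pages = (4ULL << 30) / PAGE_SIZE; /* 4 GiB of higher-zone pages */
        int ratio = 256;                        /* e.g. the DMA entry of lowmem_reserve_ratio */
        unsigned long long reserve = ratio < 1 ? 0 : managed_pages / ratio;

        printf("lowmem_reserve = %llu pages (%llu MiB)\n",
               reserve, reserve * PAGE_SIZE >> 20);   /* 4096 pages, 16 MiB */
        return 0;
}
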
@@ -7594,7 +7774,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
                cc->nr_migratepages -= nr_reclaimed;
 
                ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-                                   NULL, 0, cc->mode, MR_CMA);
+                                   NULL, 0, cc->mode, MR_CONTIG_RANGE);
        }
        if (ret < 0) {
                putback_movable_pages(&cc->migratepages);
@@ -7614,11 +7794,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  * @gfp_mask:  GFP mask to use during compaction
  *
  * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
- * aligned, however it's the caller's responsibility to guarantee that
- * we are the only thread that changes migrate type of pageblocks the
- * pages fall in.
+ * aligned.  The PFN range must belong to a single zone.
  *
- * The PFN range must belong to a single zone.
+ * The first thing this routine does is attempt to MIGRATE_ISOLATE all
+ * pageblocks in the range.  Once isolated, the pageblocks should not
+ * be modified by others.
  *
  * Returns zero on success or negative error code.  On success all
  * pages which PFN is in [start, end) are allocated for the caller and
@@ -7771,7 +7951,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTPLUG
+#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA
 /*
  * The zone indicated has a new number of managed_pages; batch sizes and percpu
  * page high values need to be recalculated.