Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1741dd23e7c1f7d4878cf38ff9ba021928f99267..905db9d7962fcb1776c0e7ffb1618fb6e4084a75 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -46,7 +46,6 @@
 #include <linux/stop_machine.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
-#include <xen/xen.h>
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
@@ -205,17 +204,18 @@ static void __free_pages_ok(struct page *page, unsigned int order);
  * TBD: should special case ZONE_DMA32 machines here - in those we normally
  * don't need any ZONE_NORMAL reservation
  */
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
 #ifdef CONFIG_ZONE_DMA
-        256,
+       [ZONE_DMA] = 256,
 #endif
 #ifdef CONFIG_ZONE_DMA32
-        256,
+       [ZONE_DMA32] = 256,
 #endif
+       [ZONE_NORMAL] = 32,
 #ifdef CONFIG_HIGHMEM
-        32,
+       [ZONE_HIGHMEM] = 0,
 #endif
-        32,
+       [ZONE_MOVABLE] = 0,
 };
 
 EXPORT_SYMBOL(totalram_pages);
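
The hunk above replaces positional initializers with C99 designated initializers, so any zone index that is not listed explicitly (ZONE_HIGHMEM and ZONE_MOVABLE here, both set to 0) no longer depends on the order of the #ifdef blocks. A minimal userspace sketch of that language feature; the enum below is a stand-in for illustration, not the kernel's zone enum:

#include <stdio.h>

enum zone { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

/* Elements without a designator are implicitly zero-initialized. */
static int reserve_ratio[MAX_NR_ZONES] = {
        [ZONE_DMA]    = 256,
        [ZONE_DMA32]  = 256,
        [ZONE_NORMAL] = 32,
};

int main(void)
{
        for (int i = 0; i < MAX_NR_ZONES; i++)
                printf("zone %d ratio %d\n", i, reserve_ratio[i]);
        return 0;       /* ZONE_MOVABLE prints 0 */
}
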
@@ -265,17 +265,19 @@ int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
 int watermark_scale_factor = 10;
 
-static unsigned long __meminitdata nr_kernel_pages;
-static unsigned long __meminitdata nr_all_pages;
-static unsigned long __meminitdata dma_reserve;
+static unsigned long nr_kernel_pages __meminitdata;
+static unsigned long nr_all_pages __meminitdata;
+static unsigned long dma_reserve __meminitdata;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-static unsigned long __initdata required_kernelcore;
-static unsigned long __initdata required_movablecore;
-static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
-static bool mirrored_kernelcore;
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long required_kernelcore __initdata;
+static unsigned long required_kernelcore_percent __initdata;
+static unsigned long required_movablecore __initdata;
+static unsigned long required_movablecore_percent __initdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
+static bool mirrored_kernelcore __meminitdata;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -292,40 +294,6 @@ EXPORT_SYMBOL(nr_online_nodes);
 int page_group_by_mobility_disabled __read_mostly;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-
-/*
- * Determine how many pages need to be initialized during early boot
- * (non-deferred initialization).
- * The value of first_deferred_pfn will be set later, once non-deferred pages
- * are initialized, but for now set it ULONG_MAX.
- */
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
-       phys_addr_t start_addr, end_addr;
-       unsigned long max_pgcnt;
-       unsigned long reserved;
-
-       /*
-        * Initialise at least 2G of a node but also take into account that
-        * two large system hashes that can take up 1GB for 0.25TB/node.
-        */
-       max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
-                       (pgdat->node_spanned_pages >> 8));
-
-       /*
-        * Compensate the all the memblock reservations (e.g. crash kernel)
-        * from the initial estimation to make sure we will initialize enough
-        * memory to boot.
-        */
-       start_addr = PFN_PHYS(pgdat->node_start_pfn);
-       end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
-       reserved = memblock_reserved_memory_within(start_addr, end_addr);
-       max_pgcnt += PHYS_PFN(reserved);
-
-       pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
-       pgdat->first_deferred_pfn = ULONG_MAX;
-}
-
 /* Returns true if the struct page for the pfn is uninitialised */
 static inline bool __meminit early_page_uninitialised(unsigned long pfn)
 {
@@ -348,9 +316,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
        /* Always populate low zones for address-constrained allocations */
        if (zone_end < pgdat_end_pfn(pgdat))
                return true;
-       /* Xen PV domains need page structures early */
-       if (xen_pv_domain())
-               return true;
        (*nr_initialised)++;
        if ((*nr_initialised > pgdat->static_init_pgcnt) &&
            (pfn & (PAGES_PER_SECTION - 1)) == 0) {
@@ -361,10 +326,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
        return true;
 }
 #else
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
-}
-
 static inline bool early_page_uninitialised(unsigned long pfn)
 {
        return false;
@@ -1099,6 +1060,15 @@ static bool bulkfree_pcp_prepare(struct page *page)
 }
 #endif /* CONFIG_DEBUG_VM */
 
+static inline void prefetch_buddy(struct page *page)
+{
+       unsigned long pfn = page_to_pfn(page);
+       unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
+       struct page *buddy = page + (buddy_pfn - pfn);
+
+       prefetch(buddy);
+}
+
 /*
  * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
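
prefetch_buddy() leans on the buddy allocator's pfn arithmetic: the order-0 buddy of a page is found by flipping one bit of its pfn, which is what __find_buddy_pfn() computes. A standalone sketch of that XOR calculation, assuming nothing beyond standard C:

#include <stdio.h>

/* Buddy of @pfn at @order: flip the bit that selects which half of the pair. */
static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
        return pfn ^ (1UL << order);
}

int main(void)
{
        /* pfn 10 and pfn 11 are order-0 buddies; block 8..11 pairs with 12..15 at order 2 */
        printf("buddy of 10 at order 0: %lu\n", find_buddy_pfn(10, 0)); /* 11 */
        printf("buddy of 8  at order 2: %lu\n", find_buddy_pfn(8, 2));  /* 12 */
        return 0;
}
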
@@ -1115,13 +1085,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
        int migratetype = 0;
        int batch_free = 0;
+       int prefetch_nr = 0;
        bool isolated_pageblocks;
-
-       spin_lock(&zone->lock);
-       isolated_pageblocks = has_isolate_pageblock(zone);
+       struct page *page, *tmp;
+       LIST_HEAD(head);
 
        while (count) {
-               struct page *page;
                struct list_head *list;
 
                /*
@@ -1143,26 +1112,48 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        batch_free = count;
 
                do {
-                       int mt; /* migratetype of the to-be-freed page */
-
                        page = list_last_entry(list, struct page, lru);
-                       /* must delete as __free_one_page list manipulates */
+                       /* must delete to avoid corrupting pcp list */
                        list_del(&page->lru);
-
-                       mt = get_pcppage_migratetype(page);
-                       /* MIGRATE_ISOLATE page should not go to pcplists */
-                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
-                       /* Pageblock could have been isolated meanwhile */
-                       if (unlikely(isolated_pageblocks))
-                               mt = get_pageblock_migratetype(page);
+                       pcp->count--;
 
                        if (bulkfree_pcp_prepare(page))
                                continue;
 
-                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
-                       trace_mm_page_pcpu_drain(page, 0, mt);
+                       list_add_tail(&page->lru, &head);
+
+                       /*
+                        * We are going to put the page back to the global
+                        * pool, prefetch its buddy to speed up later access
+                        * under zone->lock. It is believed the overhead of
+                        * an additional test and calculating buddy_pfn here
+                        * can be offset by reduced memory latency later. To
+                        * avoid excessive prefetching due to large count, only
+                        * prefetch buddy for the first pcp->batch nr of pages.
+                        */
+                       if (prefetch_nr++ < pcp->batch)
+                               prefetch_buddy(page);
                } while (--count && --batch_free && !list_empty(list));
        }
+
+       spin_lock(&zone->lock);
+       isolated_pageblocks = has_isolate_pageblock(zone);
+
+       /*
+        * Use safe version since after __free_one_page(),
+        * page->lru.next will not point to original list.
+        */
+       list_for_each_entry_safe(page, tmp, &head, lru) {
+               int mt = get_pcppage_migratetype(page);
+               /* MIGRATE_ISOLATE page should not go to pcplists */
+               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+               /* Pageblock could have been isolated meanwhile */
+               if (unlikely(isolated_pageblocks))
+                       mt = get_pageblock_migratetype(page);
+
+               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+               trace_mm_page_pcpu_drain(page, 0, mt);
+       }
        spin_unlock(&zone->lock);
 }
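
The restructuring above drains the pcp lists in two phases: pages are first detached onto a private list while no spinlock is held (the caller has already disabled interrupts), and only the final merge into the buddy free lists runs under zone->lock. A rough userspace analogue of this "collect locally, lock once" pattern; the list and pool names are illustrative, not kernel APIs:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int val; };

static struct node *global_pool;                  /* analogue of the zone free lists */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

/* Drain up to @count nodes from @pcp: detach them without the pool lock,
 * then take the lock once and splice the whole batch in. */
static void drain_batch(struct node **pcp, int count)
{
        struct node *head = NULL;

        while (*pcp && count--) {                 /* @pcp is CPU-local, no lock needed */
                struct node *n = *pcp;
                *pcp = n->next;
                n->next = head;
                head = n;
        }

        pthread_mutex_lock(&pool_lock);           /* one short critical section */
        while (head) {
                struct node *n = head;
                head = n->next;
                n->next = global_pool;
                global_pool = n;
        }
        pthread_mutex_unlock(&pool_lock);
}

int main(void)
{
        struct node *pcp = NULL;

        for (int i = 0; i < 8; i++) {
                struct node *n = malloc(sizeof(*n));
                n->val = i;
                n->next = pcp;
                pcp = n;
        }
        drain_batch(&pcp, 8);
        for (struct node *n = global_pool; n; n = n->next)
                printf("%d ", n->val);
        printf("\n");
        return 0;
}
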
 
@@ -1181,10 +1172,9 @@ static void free_one_page(struct zone *zone,
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
-                               unsigned long zone, int nid, bool zero)
+                               unsigned long zone, int nid)
 {
-       if (zero)
-               mm_zero_struct_page(page);
+       mm_zero_struct_page(page);
        set_page_links(page, zone, nid, pfn);
        init_page_count(page);
        page_mapcount_reset(page);
@@ -1198,12 +1188,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 #endif
 }
 
-static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
-                                       int nid, bool zero)
-{
-       return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
-}
-
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static void __meminit init_reserved_page(unsigned long pfn)
 {
@@ -1222,7 +1206,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
                if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
                        break;
        }
-       __init_single_pfn(pfn, zid, nid, true);
+       __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
 }
 #else
 static inline void init_reserved_page(unsigned long pfn)
@@ -1506,7 +1490,7 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
                } else if (!(pfn & nr_pgmask)) {
                        deferred_free_range(pfn - nr_free, nr_free);
                        nr_free = 1;
-                       cond_resched();
+                       touch_nmi_watchdog();
                } else {
                        nr_free++;
                }
@@ -1535,11 +1519,11 @@ static unsigned long  __init deferred_init_pages(int nid, int zid,
                        continue;
                } else if (!page || !(pfn & nr_pgmask)) {
                        page = pfn_to_page(pfn);
-                       cond_resched();
+                       touch_nmi_watchdog();
                } else {
                        page++;
                }
-               __init_single_page(page, pfn, zid, nid, true);
+               __init_single_page(page, pfn, zid, nid);
                nr_pages++;
        }
        return (nr_pages);
@@ -1552,23 +1536,25 @@ static int __init deferred_init_memmap(void *data)
        int nid = pgdat->node_id;
        unsigned long start = jiffies;
        unsigned long nr_pages = 0;
-       unsigned long spfn, epfn;
+       unsigned long spfn, epfn, first_init_pfn, flags;
        phys_addr_t spa, epa;
        int zid;
        struct zone *zone;
-       unsigned long first_init_pfn = pgdat->first_deferred_pfn;
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
        u64 i;
 
+       /* Bind memory initialisation thread to a local node if possible */
+       if (!cpumask_empty(cpumask))
+               set_cpus_allowed_ptr(current, cpumask);
+
+       pgdat_resize_lock(pgdat, &flags);
+       first_init_pfn = pgdat->first_deferred_pfn;
        if (first_init_pfn == ULONG_MAX) {
+               pgdat_resize_unlock(pgdat, &flags);
                pgdat_init_report_one_done();
                return 0;
        }
 
-       /* Bind memory initialisation thread to a local node if possible */
-       if (!cpumask_empty(cpumask))
-               set_cpus_allowed_ptr(current, cpumask);
-
        /* Sanity check boundaries */
        BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
        BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
@@ -1598,6 +1584,7 @@ static int __init deferred_init_memmap(void *data)
                epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
                deferred_free_pages(nid, zid, spfn, epfn);
        }
+       pgdat_resize_unlock(pgdat, &flags);
 
        /* Sanity check that the next zone really is unpopulated */
        WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
@@ -1608,6 +1595,117 @@ static int __init deferred_init_memmap(void *data)
        pgdat_init_report_one_done();
        return 0;
 }
+
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * Note: We use noinline because this function is needed only during boot, and
+ * it is called from a __ref function _deferred_grow_zone. This way we are
+ * making sure that it is not inlined into permanent text section.
+ */
+static noinline bool __init
+deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+       int zid = zone_idx(zone);
+       int nid = zone_to_nid(zone);
+       pg_data_t *pgdat = NODE_DATA(nid);
+       unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+       unsigned long nr_pages = 0;
+       unsigned long first_init_pfn, spfn, epfn, t, flags;
+       unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+       phys_addr_t spa, epa;
+       u64 i;
+
+       /* Only the last zone may have deferred pages */
+       if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+               return false;
+
+       pgdat_resize_lock(pgdat, &flags);
+
+       /*
+        * If deferred pages have been initialized while we were waiting for
+        * the lock, return true, as the zone was grown.  The caller will retry
+        * this zone.  We won't return to this function since the caller also
+        * has this static branch.
+        */
+       if (!static_branch_unlikely(&deferred_pages)) {
+               pgdat_resize_unlock(pgdat, &flags);
+               return true;
+       }
+
+       /*
+        * If someone grew this zone while we were waiting for spinlock, return
+        * true, as there might be enough pages already.
+        */
+       if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+               pgdat_resize_unlock(pgdat, &flags);
+               return true;
+       }
+
+       first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
+
+       if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+               pgdat_resize_unlock(pgdat, &flags);
+               return false;
+       }
+
+       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+               spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+               epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+
+               while (spfn < epfn && nr_pages < nr_pages_needed) {
+                       t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
+                       first_deferred_pfn = min(t, epfn);
+                       nr_pages += deferred_init_pages(nid, zid, spfn,
+                                                       first_deferred_pfn);
+                       spfn = first_deferred_pfn;
+               }
+
+               if (nr_pages >= nr_pages_needed)
+                       break;
+       }
+
+       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+               spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+               epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
+               deferred_free_pages(nid, zid, spfn, epfn);
+
+               if (first_deferred_pfn == epfn)
+                       break;
+       }
+       pgdat->first_deferred_pfn = first_deferred_pfn;
+       pgdat_resize_unlock(pgdat, &flags);
+
+       return nr_pages > 0;
+}
+
+/*
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call. This is why we have refdata wrapper to avoid warning,
+ * and to ensure that the function body gets unloaded.
+ */
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+       return deferred_grow_zone(zone, order);
+}
+
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 void __init page_alloc_init_late(void)
@@ -1626,6 +1724,12 @@ void __init page_alloc_init_late(void)
        /* Block until all are initialised */
        wait_for_completion(&pgdat_init_all_done_comp);
 
+       /*
+        * We initialized the rest of the deferred pages.  Permanently disable
+        * on-demand struct page initialization.
+        */
+       static_branch_disable(&deferred_pages);
+
        /* Reinit limits that are based on free pages after the kernel is up */
        files_maxfiles_init();
 #endif
@@ -1639,16 +1743,38 @@ void __init page_alloc_init_late(void)
 }
 
 #ifdef CONFIG_CMA
+static void __init adjust_present_page_count(struct page *page, long count)
+{
+       struct zone *zone = page_zone(page);
+
+       /* We don't need to hold a lock since this is the boot-up process */
+       zone->present_pages += count;
+}
+
 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
 void __init init_cma_reserved_pageblock(struct page *page)
 {
        unsigned i = pageblock_nr_pages;
+       unsigned long pfn = page_to_pfn(page);
        struct page *p = page;
+       int nid = page_to_nid(page);
+
+       /*
+        * ZONE_MOVABLE steals present pages from other zones by changing
+        * the page links, which changes what page_zone() returns. Adjust
+        * the previous zone's present page count before that happens.
+        */
+       adjust_present_page_count(page, -pageblock_nr_pages);
 
        do {
                __ClearPageReserved(p);
                set_page_count(p, 0);
-       } while (++p, --i);
+
+               /* Steal pages from other zones */
+               set_page_links(p, ZONE_MOVABLE, nid, pfn);
+       } while (++p, ++pfn, --i);
+
+       adjust_present_page_count(page, pageblock_nr_pages);
 
        set_pageblock_migratetype(page, MIGRATE_CMA);
 
@@ -2418,10 +2544,8 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
        local_irq_save(flags);
        batch = READ_ONCE(pcp->batch);
        to_drain = min(pcp->count, batch);
-       if (to_drain > 0) {
+       if (to_drain > 0)
                free_pcppages_bulk(zone, to_drain, pcp);
-               pcp->count -= to_drain;
-       }
        local_irq_restore(flags);
 }
 #endif
@@ -2443,10 +2567,8 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
        pset = per_cpu_ptr(zone->pageset, cpu);
 
        pcp = &pset->pcp;
-       if (pcp->count) {
+       if (pcp->count)
                free_pcppages_bulk(zone, pcp->count, pcp);
-               pcp->count = 0;
-       }
        local_irq_restore(flags);
 }
 
@@ -2670,7 +2792,6 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
        if (pcp->count >= pcp->high) {
                unsigned long batch = READ_ONCE(pcp->batch);
                free_pcppages_bulk(zone, batch, pcp);
-               pcp->count -= batch;
        }
 }
 
@@ -2768,7 +2889,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
                 * exists.
                 */
                watermark = min_wmark_pages(zone) + (1UL << order);
-               if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+               if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
                        return 0;
 
                __mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -3044,12 +3165,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
        }
 
 
-#ifdef CONFIG_CMA
-       /* If allocation can't use CMA areas don't use free CMA pages */
-       if (!(alloc_flags & ALLOC_CMA))
-               free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
-
        /*
         * Check watermarks for an order-0 allocation request. If these
         * are not met, then a high-order request also cannot go ahead
@@ -3076,10 +3191,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                }
 
 #ifdef CONFIG_CMA
-               if ((alloc_flags & ALLOC_CMA) &&
-                   !list_empty(&area->free_list[MIGRATE_CMA])) {
+               if (!list_empty(&area->free_list[MIGRATE_CMA]))
                        return true;
-               }
 #endif
                if (alloc_harder &&
                        !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
@@ -3099,13 +3212,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
                unsigned long mark, int classzone_idx, unsigned int alloc_flags)
 {
        long free_pages = zone_page_state(z, NR_FREE_PAGES);
-       long cma_pages = 0;
-
-#ifdef CONFIG_CMA
-       /* If allocation can't use CMA areas don't use free CMA pages */
-       if (!(alloc_flags & ALLOC_CMA))
-               cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
 
        /*
         * Fast check for order-0 only. If this fails then the reserves
@@ -3114,7 +3220,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
         * the caller is !atomic then it'll uselessly search the free
         * list. That corner case is then slower but it is harmless.
         */
-       if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+       if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx])
                return true;
 
        return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
@@ -3205,6 +3311,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                                       ac_classzone_idx(ac), alloc_flags)) {
                        int ret;
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+                       /*
+                        * Watermark failed for this zone, but see if we can
+                        * grow this zone if it contains deferred pages.
+                        */
+                       if (static_branch_unlikely(&deferred_pages)) {
+                               if (_deferred_grow_zone(zone, order))
+                                       goto try_this_zone;
+                       }
+#endif
                        /* Checked here to keep the fast path fast */
                        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
@@ -3246,6 +3362,14 @@ try_this_zone:
                                reserve_highatomic_pageblock(page, zone, order);
 
                        return page;
+               } else {
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+                       /* Try again if zone has deferred pages */
+                       if (static_branch_unlikely(&deferred_pages)) {
+                               if (_deferred_grow_zone(zone, order))
+                                       goto try_this_zone;
+                       }
+#endif
                }
        }
 
@@ -3685,16 +3809,18 @@ retry:
        return page;
 }
 
-static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
+static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
+                            const struct alloc_context *ac)
 {
        struct zoneref *z;
        struct zone *zone;
        pg_data_t *last_pgdat = NULL;
+       enum zone_type high_zoneidx = ac->high_zoneidx;
 
-       for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
-                                       ac->high_zoneidx, ac->nodemask) {
+       for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
+                                       ac->nodemask) {
                if (last_pgdat != zone->zone_pgdat)
-                       wakeup_kswapd(zone, order, ac->high_zoneidx);
+                       wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
                last_pgdat = zone->zone_pgdat;
        }
 }
@@ -3730,10 +3856,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
        } else if (unlikely(rt_task(current)) && !in_interrupt())
                alloc_flags |= ALLOC_HARDER;
 
-#ifdef CONFIG_CMA
-       if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
-               alloc_flags |= ALLOC_CMA;
-#endif
        return alloc_flags;
 }
 
@@ -3973,7 +4095,7 @@ retry_cpuset:
                goto nopage;
 
        if (gfp_mask & __GFP_KSWAPD_RECLAIM)
-               wake_all_kswapds(order, ac);
+               wake_all_kswapds(order, gfp_mask, ac);
 
        /*
         * The adjusted alloc_flags might result in immediate success, so try
@@ -4031,7 +4153,7 @@ retry_cpuset:
 retry:
        /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
        if (gfp_mask & __GFP_KSWAPD_RECLAIM)
-               wake_all_kswapds(order, ac);
+               wake_all_kswapds(order, gfp_mask, ac);
 
        reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
        if (reserve_flags)
@@ -4200,9 +4322,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
        if (should_fail_alloc_page(gfp_mask, order))
                return false;
 
-       if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
-               *alloc_flags |= ALLOC_CMA;
-
        return true;
 }
 
@@ -4612,6 +4731,13 @@ long si_mem_available(void)
                     min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
                         wmark_low);
 
+       /*
+        * Part of the kernel memory, which can be released under memory
+        * pressure.
+        */
+       available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
+               PAGE_SHIFT;
+
        if (available < 0)
                available = 0;
        return available;
@@ -5334,6 +5460,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
        pg_data_t *pgdat = NODE_DATA(nid);
        unsigned long pfn;
        unsigned long nr_initialised = 0;
+       struct page *page;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
        struct memblock_region *r = NULL, *tmp;
 #endif
@@ -5386,6 +5513,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 #endif
 
 not_early:
+               page = pfn_to_page(pfn);
+               __init_single_page(page, pfn, zone, nid);
+               if (context == MEMMAP_HOTPLUG)
+                       SetPageReserved(page);
+
                /*
                 * Mark the block movable so that blocks are reserved for
                 * movable at startup. This will force kernel allocations
@@ -5402,15 +5534,8 @@ not_early:
                 * because this is done early in sparse_add_one_section
                 */
                if (!(pfn & (pageblock_nr_pages - 1))) {
-                       struct page *page = pfn_to_page(pfn);
-
-                       __init_single_page(page, pfn, zone, nid,
-                                       context != MEMMAP_HOTPLUG);
                        set_pageblock_migratetype(page, MIGRATE_MOVABLE);
                        cond_resched();
-               } else {
-                       __init_single_pfn(pfn, zone, nid,
-                                       context != MEMMAP_HOTPLUG);
                }
        }
 }
@@ -6079,6 +6204,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 {
        enum zone_type j;
        int nid = pgdat->node_id;
+       unsigned long node_end_pfn = 0;
 
        pgdat_resize_init(pgdat);
 #ifdef CONFIG_NUMA_BALANCING
@@ -6106,9 +6232,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize, freesize, memmap_pages;
                unsigned long zone_start_pfn = zone->zone_start_pfn;
+               unsigned long movable_size = 0;
 
                size = zone->spanned_pages;
                realsize = freesize = zone->present_pages;
+               if (zone_end_pfn(zone) > node_end_pfn)
+                       node_end_pfn = zone_end_pfn(zone);
+
 
                /*
                 * Adjust freesize so that it accounts for how much memory
@@ -6157,12 +6287,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
                zone_seqlock_init(zone);
                zone_pcp_init(zone);
 
-               if (!size)
+               /*
+                * The size of the CMA area is not known yet, so prepare the
+                * usemap memory for the maximum possible span.
+                */
+               if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE &&
+                       pgdat->node_spanned_pages) {
+                       movable_size = node_end_pfn - pgdat->node_start_pfn;
+               }
+
+               if (!size && !movable_size)
                        continue;
 
                set_pageblock_order();
-               setup_usemap(pgdat, zone, zone_start_pfn, size);
-               init_currently_empty_zone(zone, zone_start_pfn, size);
+               if (movable_size) {
+                       zone->zone_start_pfn = pgdat->node_start_pfn;
+                       zone->spanned_pages = movable_size;
+                       setup_usemap(pgdat, zone,
+                               pgdat->node_start_pfn, movable_size);
+                       init_currently_empty_zone(zone,
+                               pgdat->node_start_pfn, movable_size);
+               } else {
+                       setup_usemap(pgdat, zone, zone_start_pfn, size);
+                       init_currently_empty_zone(zone, zone_start_pfn, size);
+               }
                memmap_init(size, nid, j, zone_start_pfn);
        }
 }
@@ -6192,10 +6340,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
                end = pgdat_end_pfn(pgdat);
                end = ALIGN(end, MAX_ORDER_NR_PAGES);
                size =  (end - start) * sizeof(struct page);
-               map = alloc_remap(pgdat->node_id, size);
-               if (!map)
-                       map = memblock_virt_alloc_node_nopanic(size,
-                                                              pgdat->node_id);
+               map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
                pgdat->node_mem_map = map + offset;
        }
        pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
@@ -6244,7 +6389,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 
        alloc_node_mem_map(pgdat);
 
-       reset_deferred_meminit(pgdat);
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+       /*
+        * We start with only one section of pages; more pages are added as
+        * needed until the rest of the deferred pages are initialized.
+        */
+       pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+                                        pgdat->node_spanned_pages);
+       pgdat->first_deferred_pfn = ULONG_MAX;
+#endif
        free_area_init_core(pgdat);
 }
 
@@ -6474,7 +6627,18 @@ static void __init find_zone_movable_pfns_for_nodes(void)
        }
 
        /*
-        * If movablecore=nn[KMG] was specified, calculate what size of
+        * If kernelcore=nn% or movablecore=nn% was specified, calculate the
+        * amount of necessary memory.
+        */
+       if (required_kernelcore_percent)
+               required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
+                                      10000UL;
+       if (required_movablecore_percent)
+               required_movablecore = (totalpages * 100 * required_movablecore_percent) /
+                                       10000UL;
+
+       /*
+        * If movablecore= was specified, calculate what size of
         * kernelcore that corresponds so that memory usable for
         * any allocation type is evenly spread. If both kernelcore
         * and movablecore are specified, then the value of kernelcore
@@ -6714,18 +6878,30 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
        zero_resv_unavail();
 }
 
-static int __init cmdline_parse_core(char *p, unsigned long *core)
+static int __init cmdline_parse_core(char *p, unsigned long *core,
+                                    unsigned long *percent)
 {
        unsigned long long coremem;
+       char *endptr;
+
        if (!p)
                return -EINVAL;
 
-       coremem = memparse(p, &p);
-       *core = coremem >> PAGE_SHIFT;
+       /* Value may be a percentage of total memory, otherwise bytes */
+       coremem = simple_strtoull(p, &endptr, 0);
+       if (*endptr == '%') {
+               /* Paranoid check for percent values greater than 100 */
+               WARN_ON(coremem > 100);
 
-       /* Paranoid check that UL is enough for the coremem value */
-       WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+               *percent = coremem;
+       } else {
+               coremem = memparse(p, &p);
+               /* Paranoid check that UL is enough for the coremem value */
+               WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
 
+               *core = coremem >> PAGE_SHIFT;
+               *percent = 0UL;
+       }
        return 0;
 }
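
cmdline_parse_core() now accepts either a percentage ending in '%' or an absolute size that memparse() scales by an optional K/M/G suffix. A userspace approximation of that parsing, with strtoull and a hand-rolled suffix switch standing in for the kernel's simple_strtoull()/memparse():

#include <stdio.h>
#include <stdlib.h>

/* Parse "nn%" into *percent, otherwise a size with optional K/M/G suffix into *bytes. */
static int parse_core(const char *p, unsigned long long *bytes, unsigned long *percent)
{
        char *end;
        unsigned long long val = strtoull(p, &end, 0);

        if (*end == '%') {
                if (val > 100)
                        return -1;
                *percent = (unsigned long)val;
                *bytes = 0;
        } else {
                switch (*end) {
                case 'G': case 'g': val <<= 10;  /* fall through */
                case 'M': case 'm': val <<= 10;  /* fall through */
                case 'K': case 'k': val <<= 10; break;
                case '\0': break;
                default: return -1;
                }
                *bytes = val;
                *percent = 0;
        }
        return 0;
}

int main(void)
{
        unsigned long long bytes;
        unsigned long percent;

        parse_core("512M", &bytes, &percent);
        printf("512M -> %llu bytes, %lu%%\n", bytes, percent);
        parse_core("30%", &bytes, &percent);
        printf("30%% -> %llu bytes, %lu%%\n", bytes, percent);
        return 0;
}
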
 
@@ -6741,7 +6917,8 @@ static int __init cmdline_parse_kernelcore(char *p)
                return 0;
        }
 
-       return cmdline_parse_core(p, &required_kernelcore);
+       return cmdline_parse_core(p, &required_kernelcore,
+                                 &required_kernelcore_percent);
 }
 
 /*
@@ -6750,7 +6927,8 @@ static int __init cmdline_parse_kernelcore(char *p)
  */
 static int __init cmdline_parse_movablecore(char *p)
 {
-       return cmdline_parse_core(p, &required_movablecore);
+       return cmdline_parse_core(p, &required_movablecore,
+                                 &required_movablecore_percent);
 }
 
 early_param("kernelcore", cmdline_parse_kernelcore);
@@ -6974,13 +7152,15 @@ static void setup_per_zone_lowmem_reserve(void)
                                struct zone *lower_zone;
 
                                idx--;
-
-                               if (sysctl_lowmem_reserve_ratio[idx] < 1)
-                                       sysctl_lowmem_reserve_ratio[idx] = 1;
-
                                lower_zone = pgdat->node_zones + idx;
-                               lower_zone->lowmem_reserve[j] = managed_pages /
-                                       sysctl_lowmem_reserve_ratio[idx];
+
+                               if (sysctl_lowmem_reserve_ratio[idx] < 1) {
+                                       sysctl_lowmem_reserve_ratio[idx] = 0;
+                                       lower_zone->lowmem_reserve[j] = 0;
+                               } else {
+                                       lower_zone->lowmem_reserve[j] =
+                                               managed_pages / sysctl_lowmem_reserve_ratio[idx];
+                               }
                                managed_pages += lower_zone->managed_pages;
                        }
                }
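
With the change above, a ratio below 1 disables the protection entirely (the reserve becomes 0) instead of being clamped to 1; otherwise each lower zone reserves managed_pages / ratio pages against allocations that could have fallen back from a higher zone. A toy calculation under assumed values (4 KiB pages, the default DMA ratio of 256, 4 GiB of higher-zone memory):

#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
        unsigned long long managed_pages = (4ULL << 30) / PAGE_SIZE; /* 4 GiB of higher-zone pages */
        int ratio = 256;                        /* e.g. the DMA entry of lowmem_reserve_ratio */
        unsigned long long reserve = ratio < 1 ? 0 : managed_pages / ratio;

        printf("lowmem_reserve = %llu pages (%llu MiB)\n",
               reserve, reserve * PAGE_SIZE >> 20);   /* 4096 pages, 16 MiB */
        return 0;
}
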
@@ -7594,7 +7774,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
                cc->nr_migratepages -= nr_reclaimed;
 
                ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-                                   NULL, 0, cc->mode, MR_CMA);
+                                   NULL, 0, cc->mode, MR_CONTIG_RANGE);
        }
        if (ret < 0) {
                putback_movable_pages(&cc->migratepages);
@@ -7614,11 +7794,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  * @gfp_mask:  GFP mask to use during compaction
  *
  * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
- * aligned, however it's the caller's responsibility to guarantee that
- * we are the only thread that changes migrate type of pageblocks the
- * pages fall in.
+ * aligned.  The PFN range must belong to a single zone.
  *
- * The PFN range must belong to a single zone.
+ * The first thing this routine does is attempt to MIGRATE_ISOLATE all
+ * pageblocks in the range.  Once isolated, the pageblocks should not
+ * be modified by others.
  *
  * Returns zero on success or negative error code.  On success all
  * pages which PFN is in [start, end) are allocated for the caller and
@@ -7771,7 +7951,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTPLUG
+#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA
 /*
  * The zone indicated has a new number of managed_pages; batch sizes and percpu
  * page high values need to be recalculated.