mm: stop zeroing memory during allocation in vmemmap

[sfrench/cifs-2.6.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index e6106d7e9eb04d3460692985167811faf943b8e1..39e847cd1484582b6e46ba6807688c789bce9953 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -24,7 +24,6 @@
  #include <linux/memblock.h>
  #include <linux/compiler.h>
  #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
  #include <linux/kasan.h>
  #include <linux/module.h>
  #include <linux/suspend.h>
@@ -1013,7 +1012,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
         VM_BUG_ON_PAGE(PageTail(page), page);
  
         trace_mm_page_free(page, order);
-       kmemcheck_free_shadow(page, order);
  
         /*
          * Check tail pages before head page information is cleared to
@@ -1170,6 +1168,7 @@ static void free_one_page(struct zone *zone,
  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
                                 unsigned long zone, int nid)
  {
+       mm_zero_struct_page(page);
         set_page_links(page, zone, nid, pfn);
         init_page_count(page);
         page_mapcount_reset(page);
@@ -1410,14 +1409,17 @@ void clear_zone_contiguous(struct zone *zone)
  }
  
  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(struct page *page,
-                                       unsigned long pfn, int nr_pages)
+static void __init deferred_free_range(unsigned long pfn,
+                                      unsigned long nr_pages)
  {
-       int i;
+       struct page *page;
+       unsigned long i;
  
-       if (!page)
+       if (!nr_pages)
                 return;
  
+       page = pfn_to_page(pfn);
+
         /* Free a large naturally-aligned chunk if possible */
         if (nr_pages == pageblock_nr_pages &&
             (pfn & (pageblock_nr_pages - 1)) == 0) {
@@ -1443,19 +1445,109 @@ static inline void __init pgdat_init_report_one_done(void)
                 complete(&pgdat_init_all_done_comp);
  }
  
+/*
+ * Helper for deferred_init_range(): free the given range, reset the counters,
+ * and return the number of pages freed.
+ */
+static inline unsigned long __init __def_free(unsigned long *nr_free,
+                                             unsigned long *free_base_pfn,
+                                             struct page **page)
+{
+       unsigned long nr = *nr_free;
+
+       deferred_free_range(*free_base_pfn, nr);
+       *free_base_pfn = 0;
+       *nr_free = 0;
+       *page = NULL;
+
+       return nr;
+}
+
+static unsigned long __init deferred_init_range(int nid, int zid,
+                                               unsigned long start_pfn,
+                                               unsigned long end_pfn)
+{
+       struct mminit_pfnnid_cache nid_init_state = { };
+       unsigned long nr_pgmask = pageblock_nr_pages - 1;
+       unsigned long free_base_pfn = 0;
+       unsigned long nr_pages = 0;
+       unsigned long nr_free = 0;
+       struct page *page = NULL;
+       unsigned long pfn;
+
+       /*
+        * First we check if pfn is valid on architectures where it is possible
+        * to have holes within pageblock_nr_pages. On systems where it is not
+        * possible, this function is optimized out.
+        *
+        * Then, we check if a current large page is valid by only checking the
+        * validity of the head pfn.
+        *
+        * meminit_pfn_in_nid is checked on systems where pfns can interleave
+        * within a node: a pfn is between start and end of a node, but does not
+        * belong to this memory node.
+        *
+        * Finally, we minimize pfn page lookups and scheduler checks by
+        * performing it only once every pageblock_nr_pages.
+        *
+        * We do it in two loops: first we initialize struct page, then free to
+        * buddy allocator, because while we are freeing pages we can access
+        * pages that are ahead (computing buddy page in __free_one_page()).
+        */
+       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+               if (!pfn_valid_within(pfn))
+                       continue;
+               if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
+                       if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+                               if (page && (pfn & nr_pgmask))
+                                       page++;
+                               else
+                                       page = pfn_to_page(pfn);
+                               __init_single_page(page, pfn, zid, nid);
+                               cond_resched();
+                       }
+               }
+       }
+
+       page = NULL;
+       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+               if (!pfn_valid_within(pfn)) {
+                       nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+               } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
+                       nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+               } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+                       nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+               } else if (page && (pfn & nr_pgmask)) {
+                       page++;
+                       nr_free++;
+               } else {
+                       nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+                       page = pfn_to_page(pfn);
+                       free_base_pfn = pfn;
+                       nr_free = 1;
+                       cond_resched();
+               }
+       }
+       /* Free the last block of pages to allocator */
+       nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+
+       return nr_pages;
+}
+
  /* Initialise remaining memory on a node */
  static int __init deferred_init_memmap(void *data)
  {
         pg_data_t *pgdat = data;
         int nid = pgdat->node_id;
-       struct mminit_pfnnid_cache nid_init_state = { };
         unsigned long start = jiffies;
         unsigned long nr_pages = 0;
-       unsigned long walk_start, walk_end;
-       int i, zid;
+       unsigned long spfn, epfn;
+       phys_addr_t spa, epa;
+       int zid;
         struct zone *zone;
         unsigned long first_init_pfn = pgdat->first_deferred_pfn;
         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+       u64 i;
  
         if (first_init_pfn == ULONG_MAX) {
                 pgdat_init_report_one_done();
@@ -1477,83 +1569,12 @@ static int __init deferred_init_memmap(void *data)
                 if (first_init_pfn < zone_end_pfn(zone))
                         break;
         }
+       first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
  
-       for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
-               unsigned long pfn, end_pfn;
-               struct page *page = NULL;
-               struct page *free_base_page = NULL;
-               unsigned long free_base_pfn = 0;
-               int nr_to_free = 0;
-
-               end_pfn = min(walk_end, zone_end_pfn(zone));
-               pfn = first_init_pfn;
-               if (pfn < walk_start)
-                       pfn = walk_start;
-               if (pfn < zone->zone_start_pfn)
-                       pfn = zone->zone_start_pfn;
-
-               for (; pfn < end_pfn; pfn++) {
-                       if (!pfn_valid_within(pfn))
-                               goto free_range;
-
-                       /*
-                        * Ensure pfn_valid is checked every
-                        * pageblock_nr_pages for memory holes
-                        */
-                       if ((pfn & (pageblock_nr_pages - 1)) == 0) {
-                               if (!pfn_valid(pfn)) {
-                                       page = NULL;
-                                       goto free_range;
-                               }
-                       }
-
-                       if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
-                               page = NULL;
-                               goto free_range;
-                       }
-
-                       /* Minimise pfn page lookups and scheduler checks */
-                       if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
-                               page++;
-                       } else {
-                               nr_pages += nr_to_free;
-                               deferred_free_range(free_base_page,
-                                               free_base_pfn, nr_to_free);
-                               free_base_page = NULL;
-                               free_base_pfn = nr_to_free = 0;
-
-                               page = pfn_to_page(pfn);
-                               cond_resched();
-                       }
-
-                       if (page->flags) {
-                               VM_BUG_ON(page_zone(page) != zone);
-                               goto free_range;
-                       }
-
-                       __init_single_page(page, pfn, zid, nid);
-                       if (!free_base_page) {
-                               free_base_page = page;
-                               free_base_pfn = pfn;
-                               nr_to_free = 0;
-                       }
-                       nr_to_free++;
-
-                       /* Where possible, batch up pages for a single free */
-                       continue;
-free_range:
-                       /* Free the current block of pages to allocator */
-                       nr_pages += nr_to_free;
-                       deferred_free_range(free_base_page, free_base_pfn,
-                                                               nr_to_free);
-                       free_base_page = NULL;
-                       free_base_pfn = nr_to_free = 0;
-               }
-               /* Free the last block of pages to allocator */
-               nr_pages += nr_to_free;
-               deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
-
-               first_init_pfn = max(end_pfn, first_init_pfn);
+       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+               spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+               epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+               nr_pages += deferred_init_range(nid, zid, spfn, epfn);
         }
  
         /* Sanity check that the next zone really is unpopulated */
@@ -2669,15 +2690,6 @@ void split_page(struct page *page, unsigned int order)
         VM_BUG_ON_PAGE(PageCompound(page), page);
         VM_BUG_ON_PAGE(!page_count(page), page);
  
-#ifdef CONFIG_KMEMCHECK
-       /*
-        * Split shadow pages too, because free(page[0]) would
-        * otherwise free the whole shadow.
-        */
-       if (kmemcheck_page_is_tracked(page))
-               split_page(virt_to_page(page[0].shadow), order);
-#endif
-
         for (i = 1; i < (1 << order); i++)
                 set_page_refcounted(page + i);
         split_page_owner(page, order);
@@ -4223,9 +4235,6 @@ out:
                 page = NULL;
         }
  
-       if (kmemcheck_enabled && page)
-               kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-
         trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
  
         return page;
@@ -6207,6 +6216,44 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
         free_area_init_core(pgdat);
  }
  
+#ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Only struct pages that are backed by physical memory are zeroed and
+ * initialized by going through __init_single_page(). But, there are some
+ * struct pages which are reserved in memblock allocator and their fields
+ * may be accessed (for example page_to_pfn() on some configuration accesses
+ * flags). We must explicitly zero those struct pages.
+ */
+void __paginginit zero_resv_unavail(void)
+{
+       phys_addr_t start, end;
+       unsigned long pfn;
+       u64 i, pgcnt;
+
+       /*
+        * Loop through ranges that are reserved, but do not have reported
+        * physical memory backing.
+        */
+       pgcnt = 0;
+       for_each_resv_unavail_range(i, &start, &end) {
+               for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
+                       mm_zero_struct_page(pfn_to_page(pfn));
+                       pgcnt++;
+               }
+       }
+
+       /*
+        * Struct pages that do not have backing memory. This could be because
+        * firmware is using some of this memory, or for some other reasons.
+        * Once memblock is changed so that such behaviour is not allowed, i.e.
+        * the list of "reserved" memory must be a subset of the list of
+        * "memory", this code can be removed.
+        */
+       if (pgcnt)
+               pr_info("Reserved but unavailable: %lld pages", pgcnt);
+}
+#endif /* CONFIG_HAVE_MEMBLOCK */
+
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
  
  #if MAX_NUMNODES > 1
@@ -6630,6 +6677,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
                         node_set_state(nid, N_MEMORY);
                 check_for_memory(pgdat, nid);
         }
+       zero_resv_unavail();
  }
  
  static int __init cmdline_parse_core(char *p, unsigned long *core)
@@ -6793,6 +6841,7 @@ void __init free_area_init(unsigned long *zones_size)
  {
         free_area_init_node(0, zones_size,
                         __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+       zero_resv_unavail();
  }
  
  static int page_alloc_cpu_dead(unsigned int cpu)
@@ -7305,18 +7354,17 @@ void *__init alloc_large_system_hash(const char *tablename,
  
         log2qty = ilog2(numentries);
  
-       /*
-        * memblock allocator returns zeroed memory already, so HASH_ZERO is
-        * currently not used when HASH_EARLY is specified.
-        */
         gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
         do {
                 size = bucketsize << log2qty;
-               if (flags & HASH_EARLY)
-                       table = memblock_virt_alloc_nopanic(size, 0);
-               else if (hashdist)
+               if (flags & HASH_EARLY) {
+                       if (flags & HASH_ZERO)
+                               table = memblock_virt_alloc_nopanic(size, 0);
+                       else
+                               table = memblock_virt_alloc_raw(size, 0);
+               } else if (hashdist) {
                         table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
-               else {
+               } else {
                         /*
                          * If bucketsize is not a power-of-two, we may free
                          * some pages at the end of hash table which