mm: take memory hotplug lock within numa_zonelist_order_handler()
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bd65b60939b611e18d3772d26079421d0b4eb495..fc32aa81f3593537cc2b11d5f63b5c5f517097a4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2206,19 +2206,26 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
  * list of requested migratetype, possibly along with other pages from the same
  * block, depending on fragmentation avoidance heuristics. Returns true if
  * fallback was found so that __rmqueue_smallest() can grab it.
+ *
+ * The use of signed ints for order and current_order is a deliberate
+ * deviation from the rest of this file, to make the for loop
+ * condition simpler.
  */
 static inline bool
-__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 {
        struct free_area *area;
-       unsigned int current_order;
+       int current_order;
        struct page *page;
        int fallback_mt;
        bool can_steal;
 
-       /* Find the largest possible block of pages in the other list */
-       for (current_order = MAX_ORDER-1;
-                               current_order >= order && current_order <= MAX_ORDER-1;
+       /*
+        * Find the largest available free page in the other list. This roughly
+        * approximates finding the pageblock with the most free pages, which
+        * would be too costly to do exactly.
+        */
+       for (current_order = MAX_ORDER - 1; current_order >= order;
                                --current_order) {
                area = &(zone->free_area[current_order]);
                fallback_mt = find_suitable_fallback(area, current_order,
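
The signed-type comment above is easier to see with a throwaway example. The helper below is illustrative only (MAX_ORDER is given the default-config value of 11 as an assumption): with an unsigned counter, the removed "current_order <= MAX_ORDER-1" guard was needed because decrementing an unsigned zero wraps around to UINT_MAX instead of going negative, so "current_order >= order" alone could never end the loop for order 0.

    /*
     * Illustrative only: why the descending loop is simpler with a signed
     * type.  With unsigned int, "current_order >= order" is always true
     * when order == 0, and decrementing 0 wraps to UINT_MAX, which is why
     * the old loop carried the extra "current_order <= MAX_ORDER-1" guard.
     */
    #include <stdio.h>

    #define MAX_ORDER 11    /* assumed default-config value */

    static void walk_orders(int order)
    {
        int current_order;

        for (current_order = MAX_ORDER - 1; current_order >= order;
                                            --current_order)
            printf("visiting order %d\n", current_order);
    }

    int main(void)
    {
        walk_orders(0);     /* terminates cleanly even for order 0 */
        return 0;
    }
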
@@ -2226,19 +2233,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
                if (fallback_mt == -1)
                        continue;
 
-               page = list_first_entry(&area->free_list[fallback_mt],
-                                               struct page, lru);
+               /*
+                * If we cannot steal all free pages from the pageblock and the
+                * requested migratetype is movable, it is better to steal and
+                * split the smallest available page instead of the largest one:
+                * even if the next movable allocation falls back into a
+                * different pageblock than this one, it won't cause permanent
+                * fragmentation.
+                */
+               if (!can_steal && start_migratetype == MIGRATE_MOVABLE
+                                       && current_order > order)
+                       goto find_smallest;
 
-               steal_suitable_fallback(zone, page, start_migratetype,
-                                                               can_steal);
+               goto do_steal;
+       }
 
-               trace_mm_page_alloc_extfrag(page, order, current_order,
-                       start_migratetype, fallback_mt);
+       return false;
 
-               return true;
+find_smallest:
+       for (current_order = order; current_order < MAX_ORDER;
+                                                       current_order++) {
+               area = &(zone->free_area[current_order]);
+               fallback_mt = find_suitable_fallback(area, current_order,
+                               start_migratetype, false, &can_steal);
+               if (fallback_mt != -1)
+                       break;
        }
 
-       return false;
+       /*
+        * This should not happen - we already found a suitable fallback
+        * when looking for the largest page.
+        */
+       VM_BUG_ON(current_order == MAX_ORDER);
+
+do_steal:
+       page = list_first_entry(&area->free_list[fallback_mt],
+                                                       struct page, lru);
+
+       steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+
+       trace_mm_page_alloc_extfrag(page, order, current_order,
+               start_migratetype, fallback_mt);
+
+       return true;
+
 }
 
 /*
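
The restructured fallback above searches in two phases: largest suitable block first, and only when a movable request cannot claim the whole pageblock does it drop down to the smallest block that still fits. Below is a toy model of that control flow, illustration only: suitable(), has_free[] and the single can_steal_whole_block flag are invented stand-ins for find_suitable_fallback() and the per-order free_area state, which the kernel re-evaluates on every iteration.

    /*
     * Toy model of the two-phase search above; not kernel code.
     */
    #include <stdbool.h>

    #define NUM_ORDERS 11   /* assumed, like MAX_ORDER in the default config */

    static bool suitable(const bool has_free[], int order)
    {
        return has_free[order];
    }

    /* Returns the order whose block should be split, or -1 for "no fallback". */
    static int pick_fallback_order(const bool has_free[], int order,
                                   bool movable, bool can_steal_whole_block)
    {
        int cur;

        /* Phase 1: prefer the largest available block. */
        for (cur = NUM_ORDERS - 1; cur >= order; cur--) {
            if (!suitable(has_free, cur))
                continue;
            /*
             * Splitting a big block we cannot fully claim would fragment
             * it for nothing, so a movable request drops to phase 2.
             */
            if (!can_steal_whole_block && movable && cur > order)
                break;
            return cur;
        }
        if (cur < order)
            return -1;      /* nothing suitable at any order */

        /* Phase 2: take the smallest block that still satisfies the request. */
        for (cur = order; cur < NUM_ORDERS; cur++)
            if (suitable(has_free, cur))
                return cur;

        return -1;          /* the kernel asserts this cannot happen */
    }
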
@@ -3246,6 +3284,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        /* The OOM killer will not help higher order allocs */
        if (order > PAGE_ALLOC_COSTLY_ORDER)
                goto out;
+       /*
+        * We have already exhausted all our reclaim opportunities without any
+        * success so it is time to admit defeat. We will skip the OOM killer
+        * because it is very likely that the caller has a more reasonable
+        * fallback than shooting a random task.
+        */
+       if (gfp_mask & __GFP_RETRY_MAYFAIL)
+               goto out;
        /* The OOM killer does not needlessly kill tasks for lowmem */
        if (ac->high_zoneidx < ZONE_NORMAL)
                goto out;
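
The new early exit assumes that a __GFP_RETRY_MAYFAIL caller really does have a fallback of its own. A hypothetical caller (not taken from this patch) might look like this:

    /*
     * Hypothetical caller, not from this patch: try hard for a physically
     * contiguous buffer, but accept failure rather than an OOM kill,
     * because vmalloc() is an acceptable fallback.
     */
    #include <linux/slab.h>
    #include <linux/vmalloc.h>
    #include <linux/mm.h>

    static void *alloc_big_buffer(size_t size)
    {
        void *buf;

        buf = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
        if (!buf)
            buf = vmalloc(size);    /* the "more reasonable fallback" */

        return buf;     /* free with kvfree(), whichever allocator served it */
    }
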
@@ -3375,7 +3421,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
        }
 
        /*
-        * !costly requests are much more important than __GFP_REPEAT
+        * !costly requests are much more important than __GFP_RETRY_MAYFAIL
         * costly ones because they are de facto nofail and invoke OOM
         * killer to move on while costly can fail and users are ready
         * to cope with that. 1/4 retries is rather arbitrary but we
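
A minimal sketch of the budgeting idea the comment describes; this is not the kernel's should_compact_retry(), and MAX_RETRIES is an assumed figure.

    /*
     * Illustration only: costly requests get a quarter of the retry budget
     * of !costly ones, since !costly requests are effectively nofail while
     * costly callers are expected to cope with failure.
     */
    #include <stdbool.h>

    #define MAX_RETRIES 16  /* assumed budget for the example */

    static bool should_keep_retrying(bool costly, int retries)
    {
        int budget = MAX_RETRIES;

        if (costly)
            budget /= 4;    /* the "1/4 retries" from the comment */

        return retries < budget;
    }
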
@@ -3882,9 +3928,9 @@ retry:
 
        /*
         * Do not retry costly high order allocations unless they are
-        * __GFP_REPEAT
+        * __GFP_RETRY_MAYFAIL
         */
-       if (costly_order && !(gfp_mask & __GFP_REPEAT))
+       if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
                goto nopage;
 
        if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
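
For scale, a sketch of the "costly" classification this check relies on; PAGE_ALLOC_COSTLY_ORDER is 3 in mainline, so the cutoff is anything above 8 contiguous pages (above 32 KiB with 4 KiB pages).

    /*
     * Sketch of the check above, not a new helper in this patch: costly
     * requests without __GFP_RETRY_MAYFAIL skip further retries.
     */
    #include <linux/types.h>
    #include <linux/gfp.h>
    #include <linux/mmzone.h>

    static bool gives_up_early(unsigned int order, gfp_t gfp_mask)
    {
        bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;

        return costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL);
    }
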
@@ -4845,9 +4891,11 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
                                NUMA_ZONELIST_ORDER_LEN);
                        user_zonelist_order = oldval;
                } else if (oldval != user_zonelist_order) {
+                       mem_hotplug_begin();
                        mutex_lock(&zonelists_mutex);
                        build_all_zonelists(NULL, NULL);
                        mutex_unlock(&zonelists_mutex);
+                       mem_hotplug_done();
                }
        }
 out:
@@ -5240,7 +5288,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 #endif
                /* we have to stop all cpus to guarantee there is no user
                   of zonelist */
-               stop_machine(__build_all_zonelists, pgdat, NULL);
+               stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
                /* cpuset refresh routine should be here */
        }
        vm_total_pages = nr_free_pagecache_pages();
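
The stop_machine_cpuslocked() switch here and the mem_hotplug_begin()/mem_hotplug_done() pair added to the sysctl handler are two halves of one rule: _cpuslocked variants expect the caller to already hold the CPU hotplug lock, and mem_hotplug_begin() is expected to take that lock before the memory hotplug one. A rough sketch of the convention, with the callback and wrapper invented for illustration:

    /*
     * Sketch of the locking convention behind the _cpuslocked naming: the
     * caller must already hold the CPU hotplug (read) lock.  rebuild_cb()
     * and this wrapper are invented for illustration.
     */
    #include <linux/cpu.h>
    #include <linux/stop_machine.h>

    static int rebuild_cb(void *data)
    {
        /* Runs with all CPUs stopped, like __build_all_zonelists() above. */
        return 0;
    }

    static void rebuild_under_hotplug_lock(void)
    {
        cpus_read_lock();   /* what mem_hotplug_begin() provides */
        stop_machine_cpuslocked(rebuild_cb, NULL, NULL);
        cpus_read_unlock();
    }
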