mm: take memory hotplug lock within numa_zonelist_order_handler()
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bd65b60939b611e18d3772d26079421d0b4eb495..fc32aa81f3593537cc2b11d5f63b5c5f517097a4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2206,19 +2206,26 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
  * list of requested migratetype, possibly along with other pages from the same
  * block, depending on fragmentation avoidance heuristics. Returns true if
  * fallback was found so that __rmqueue_smallest() can grab it.
+ *
+ * The use of signed ints for order and current_order is a deliberate
+ * deviation from the rest of this file, to make the for loop
+ * condition simpler.
  */
 static inline bool
-__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 {
        struct free_area *area;
-       unsigned int current_order;
+       int current_order;
        struct page *page;
        int fallback_mt;
        bool can_steal;
 
-       /* Find the largest possible block of pages in the other list */
-       for (current_order = MAX_ORDER-1;
-                               current_order >= order && current_order <= MAX_ORDER-1;
+       /*
+        * Find the largest available free page in the other list. This roughly
+        * approximates finding the pageblock with the most free pages, which
+        * would be too costly to do exactly.
+        */
+       for (current_order = MAX_ORDER - 1; current_order >= order;
                                --current_order) {
                area = &(zone->free_area[current_order]);
                fallback_mt = find_suitable_fallback(area, current_order,
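
The signed-type comment above is easier to see with a throwaway example. The helper below is illustrative only (MAX_ORDER is given the default-config value of 11 as an assumption): with an unsigned counter, the removed "current_order <= MAX_ORDER-1" guard was needed because decrementing an unsigned zero wraps around to UINT_MAX instead of going negative, so "current_order >= order" alone could never end the loop for order 0.

    /*
     * Illustrative only: why the descending loop is simpler with a signed
     * type.  With unsigned int, "current_order >= order" is always true
     * when order == 0, and decrementing 0 wraps to UINT_MAX, which is why
     * the old loop carried the extra "current_order <= MAX_ORDER-1" guard.
     */
    #include <stdio.h>

    #define MAX_ORDER 11    /* assumed default-config value */

    static void walk_orders(int order)
    {
        int current_order;

        for (current_order = MAX_ORDER - 1; current_order >= order;
                                            --current_order)
            printf("visiting order %d\n", current_order);
    }

    int main(void)
    {
        walk_orders(0);     /* terminates cleanly even for order 0 */
        return 0;
    }
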
@@ -2226,19 +2233,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
                if (fallback_mt == -1)
                        continue;
 
-               page = list_first_entry(&area->free_list[fallback_mt],
-                                               struct page, lru);
+               /*
+                * If we cannot steal all free pages from the pageblock and the
+                * requested migratetype is movable, it is better to steal and
+                * split the smallest available page instead of the largest one:
+                * even if the next movable allocation falls back into a
+                * different pageblock than this one, it won't cause permanent
+                * fragmentation.
+                */
+               if (!can_steal && start_migratetype == MIGRATE_MOVABLE
+                                       && current_order > order)
+                       goto find_smallest;
 
-               steal_suitable_fallback(zone, page, start_migratetype,
-                                                               can_steal);
+               goto do_steal;
+       }
 
-               trace_mm_page_alloc_extfrag(page, order, current_order,
-                       start_migratetype, fallback_mt);
+       return false;
 
-               return true;
+find_smallest:
+       for (current_order = order; current_order < MAX_ORDER;
+                                                       current_order++) {
+               area = &(zone->free_area[current_order]);
+               fallback_mt = find_suitable_fallback(area, current_order,
+                               start_migratetype, false, &can_steal);
+               if (fallback_mt != -1)
+                       break;
        }
 
-       return false;
+       /*
+        * This should not happen - we already found a suitable fallback
+        * when looking for the largest page.
+        */
+       VM_BUG_ON(current_order == MAX_ORDER);
+
+do_steal:
+       page = list_first_entry(&area->free_list[fallback_mt],
+                                                       struct page, lru);
+
+       steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+
+       trace_mm_page_alloc_extfrag(page, order, current_order,
+               start_migratetype, fallback_mt);
+
+       return true;
+
 }
 
 /*
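
The restructured fallback above searches in two phases: largest suitable block first, and only when a movable request cannot claim the whole pageblock does it drop down to the smallest block that still fits. Below is a toy model of that control flow, illustration only: suitable(), has_free[] and the single can_steal_whole_block flag are invented stand-ins for find_suitable_fallback() and the per-order free_area state, which the kernel re-evaluates on every iteration.

    /*
     * Toy model of the two-phase search above; not kernel code.
     */
    #include <stdbool.h>

    #define NUM_ORDERS 11   /* assumed, like MAX_ORDER in the default config */

    static bool suitable(const bool has_free[], int order)
    {
        return has_free[order];
    }

    /* Returns the order whose block should be split, or -1 for "no fallback". */
    static int pick_fallback_order(const bool has_free[], int order,
                                   bool movable, bool can_steal_whole_block)
    {
        int cur;

        /* Phase 1: prefer the largest available block. */
        for (cur = NUM_ORDERS - 1; cur >= order; cur--) {
            if (!suitable(has_free, cur))
                continue;
            /*
             * Splitting a big block we cannot fully claim would fragment
             * it for nothing, so a movable request drops to phase 2.
             */
            if (!can_steal_whole_block && movable && cur > order)
                break;
            return cur;
        }
        if (cur < order)
            return -1;      /* nothing suitable at any order */

        /* Phase 2: take the smallest block that still satisfies the request. */
        for (cur = order; cur < NUM_ORDERS; cur++)
            if (suitable(has_free, cur))
                return cur;

        return -1;          /* the kernel asserts this cannot happen */
    }
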
@@ -3246,6 +3284,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        /* The OOM killer will not help higher order allocs */
        if (order > PAGE_ALLOC_COSTLY_ORDER)
                goto out;
+       /*
+        * We have already exhausted all our reclaim opportunities without any
+        * success so it is time to admit defeat. We will skip the OOM killer
+        * because it is very likely that the caller has a more reasonable
+        * fallback than shooting a random task.
+        */
+       if (gfp_mask & __GFP_RETRY_MAYFAIL)
+               goto out;
        /* The OOM killer does not needlessly kill tasks for lowmem */
        if (ac->high_zoneidx < ZONE_NORMAL)
                goto out;
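
The new early exit assumes that a __GFP_RETRY_MAYFAIL caller really does have a fallback of its own. A hypothetical caller (not taken from this patch) might look like this:

    /*
     * Hypothetical caller, not from this patch: try hard for a physically
     * contiguous buffer, but accept failure rather than an OOM kill,
     * because vmalloc() is an acceptable fallback.
     */
    #include <linux/slab.h>
    #include <linux/vmalloc.h>
    #include <linux/mm.h>

    static void *alloc_big_buffer(size_t size)
    {
        void *buf;

        buf = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
        if (!buf)
            buf = vmalloc(size);    /* the "more reasonable fallback" */

        return buf;     /* free with kvfree(), whichever allocator served it */
    }
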
@@ -3375,7 +3421,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
        }
 
        /*
-        * !costly requests are much more important than __GFP_REPEAT
+        * !costly requests are much more important than __GFP_RETRY_MAYFAIL
         * costly ones because they are de facto nofail and invoke OOM
         * killer to move on while costly can fail and users are ready
         * to cope with that. 1/4 retries is rather arbitrary but we
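
A minimal sketch of the budgeting idea the comment describes; this is not the kernel's should_compact_retry(), and MAX_RETRIES is an assumed figure.

    /*
     * Illustration only: costly requests get a quarter of the retry budget
     * of !costly ones, since !costly requests are effectively nofail while
     * costly callers are expected to cope with failure.
     */
    #include <stdbool.h>

    #define MAX_RETRIES 16  /* assumed budget for the example */

    static bool should_keep_retrying(bool costly, int retries)
    {
        int budget = MAX_RETRIES;

        if (costly)
            budget /= 4;    /* the "1/4 retries" from the comment */

        return retries < budget;
    }
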
@@ -3882,9 +3928,9 @@ retry:
 
        /*
         * Do not retry costly high order allocations unless they are
-        * __GFP_REPEAT
+        * __GFP_RETRY_MAYFAIL
         */
-       if (costly_order && !(gfp_mask & __GFP_REPEAT))
+       if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
                goto nopage;
 
        if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
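
For scale, a sketch of the "costly" classification this check relies on; PAGE_ALLOC_COSTLY_ORDER is 3 in mainline, so the cutoff is anything above 8 contiguous pages (above 32 KiB with 4 KiB pages).

    /*
     * Sketch of the check above, not a new helper in this patch: costly
     * requests without __GFP_RETRY_MAYFAIL skip further retries.
     */
    #include <linux/types.h>
    #include <linux/gfp.h>
    #include <linux/mmzone.h>

    static bool gives_up_early(unsigned int order, gfp_t gfp_mask)
    {
        bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;

        return costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL);
    }
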
@@ -4845,9 +4891,11 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
                                NUMA_ZONELIST_ORDER_LEN);
                        user_zonelist_order = oldval;
                } else if (oldval != user_zonelist_order) {
+                       mem_hotplug_begin();
                        mutex_lock(&zonelists_mutex);
                        build_all_zonelists(NULL, NULL);
                        mutex_unlock(&zonelists_mutex);
+                       mem_hotplug_done();
                }
        }
 out:
@@ -5240,7 +5288,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 #endif
                /* we have to stop all cpus to guarantee there is no user
                   of zonelist */
-               stop_machine(__build_all_zonelists, pgdat, NULL);
+               stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
                /* cpuset refresh routine should be here */
        }
        vm_total_pages = nr_free_pagecache_pages();
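
The stop_machine_cpuslocked() switch here and the mem_hotplug_begin()/mem_hotplug_done() pair added to the sysctl handler are two halves of one rule: _cpuslocked variants expect the caller to already hold the CPU hotplug lock, and mem_hotplug_begin() is expected to take that lock before the memory hotplug one. A rough sketch of the convention, with the callback and wrapper invented for illustration:

    /*
     * Sketch of the locking convention behind the _cpuslocked naming: the
     * caller must already hold the CPU hotplug (read) lock.  rebuild_cb()
     * and this wrapper are invented for illustration.
     */
    #include <linux/cpu.h>
    #include <linux/stop_machine.h>

    static int rebuild_cb(void *data)
    {
        /* Runs with all CPUs stopped, like __build_all_zonelists() above. */
        return 0;
    }

    static void rebuild_under_hotplug_lock(void)
    {
        cpus_read_lock();   /* what mem_hotplug_begin() provides */
        stop_machine_cpuslocked(rebuild_cb, NULL, NULL);
        cpus_read_unlock();
    }
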