mm/compaction: pass pgdat to too_many_isolated() instead of zone
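
too_many_isolated() only consults node-wide LRU counters, so take the pgdat
directly rather than deriving it from the zone on every call. The diff below
also carries the related scanner rework from the same series: selective
skip-hint resets, fast free-list searches for both scanners, and locking that
records contention instead of aborting mid-pageblock.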
diff --git a/mm/compaction.c b/mm/compaction.c
index ef29490b0f462349ec90b8672448f80627ef3af6..f171a83707ced436bb2bd4508060a6cd45a95905 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -66,7 +66,7 @@ static unsigned long release_freepages(struct list_head *freelist)
        return high_pfn;
 }
 
-static void map_pages(struct list_head *list)
+static void split_map_pages(struct list_head *list)
 {
        unsigned int i, order, nr_pages;
        struct page *page, *next;
@@ -237,6 +237,70 @@ static bool pageblock_skip_persistent(struct page *page)
        return false;
 }
 
+static bool
+__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
+                                                       bool check_target)
+{
+       struct page *page = pfn_to_online_page(pfn);
+       struct page *end_page;
+       unsigned long block_pfn;
+
+       if (!page)
+               return false;
+       if (zone != page_zone(page))
+               return false;
+       if (pageblock_skip_persistent(page))
+               return false;
+
+       /*
+        * If skip is already cleared, do no further checking once the
+        * restart points have been set.
+        */
+       if (check_source && check_target && !get_pageblock_skip(page))
+               return true;
+
+       /*
+        * If clearing skip for the target scanner, do not select a
+        * non-movable pageblock as the starting point.
+        */
+       if (!check_source && check_target &&
+           get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+               return false;
+
+       /*
+        * Only clear the hint if a sample indicates there is either a
+        * free page or an LRU page in the block. One or other condition
+        * is necessary for the block to be a migration source/target.
+        */
+       block_pfn = pageblock_start_pfn(pfn);
+       pfn = max(block_pfn, zone->zone_start_pfn);
+       page = pfn_to_page(pfn);
+       if (zone != page_zone(page))
+               return false;
+       block_pfn = min(block_pfn + pageblock_nr_pages, zone_end_pfn(zone));
+       end_page = pfn_to_page(block_pfn);
+
+       do {
+               if (pfn_valid_within(pfn)) {
+                       if (check_source && PageLRU(page)) {
+                               clear_pageblock_skip(page);
+                               return true;
+                       }
+
+                       if (check_target && PageBuddy(page)) {
+                               clear_pageblock_skip(page);
+                               return true;
+                       }
+               }
+
+               page += (1 << PAGE_ALLOC_COSTLY_ORDER);
+               pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
+       } while (page < end_page);
+
+       return false;
+}
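
The do/while above samples one page every 2^PAGE_ALLOC_COSTLY_ORDER PFNs
rather than touching every page in the block. A minimal userspace sketch of
the stride arithmetic, assuming PAGE_ALLOC_COSTLY_ORDER = 3 and 512 pages per
pageblock (typical x86-64 values):

    #include <stdio.h>

    #define PAGE_ALLOC_COSTLY_ORDER 3   /* assumed kernel value */
    #define PAGEBLOCK_NR_PAGES      512 /* assumed: 2MB block / 4K pages */

    int main(void)
    {
        unsigned long pfn, samples = 0;

        /* One sample per stride instead of 512 individual checks */
        for (pfn = 0; pfn < PAGEBLOCK_NR_PAGES;
             pfn += 1UL << PAGE_ALLOC_COSTLY_ORDER)
            samples++;

        /* prints "64 samples per 512-page pageblock" */
        printf("%lu samples per %d-page pageblock\n",
               samples, PAGEBLOCK_NR_PAGES);
        return 0;
    }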
+
 /*
  * This function is called to clear all cached information on pageblocks that
  * should be skipped for page isolation when the migrate and free page scanner
@@ -244,30 +308,54 @@ static bool pageblock_skip_persistent(struct page *page)
  */
 static void __reset_isolation_suitable(struct zone *zone)
 {
-       unsigned long start_pfn = zone->zone_start_pfn;
-       unsigned long end_pfn = zone_end_pfn(zone);
-       unsigned long pfn;
+       unsigned long migrate_pfn = zone->zone_start_pfn;
+       unsigned long free_pfn = zone_end_pfn(zone);
+       unsigned long reset_migrate = free_pfn;
+       unsigned long reset_free = migrate_pfn;
+       bool source_set = false;
+       bool free_set = false;
+
+       if (!zone->compact_blockskip_flush)
+               return;
 
        zone->compact_blockskip_flush = false;
 
-       /* Walk the zone and mark every pageblock as suitable for isolation */
-       for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
-               struct page *page;
-
+       /*
+        * Walk the zone and update pageblock skip information. The source
+        * scanner looks for PageLRU while the target scanner looks for
+        * PageBuddy. Once a restart point is found for one scanner, both
+        * PageBuddy and PageLRU are checked as the pageblock is suitable
+        * as both source and target.
+        */
+       for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
+                                       free_pfn -= pageblock_nr_pages) {
                cond_resched();
 
-               page = pfn_to_online_page(pfn);
-               if (!page)
-                       continue;
-               if (zone != page_zone(page))
-                       continue;
-               if (pageblock_skip_persistent(page))
-                       continue;
+               /* Update the migrate PFN */
+               if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
+                   migrate_pfn < reset_migrate) {
+                       source_set = true;
+                       reset_migrate = migrate_pfn;
+                       zone->compact_init_migrate_pfn = reset_migrate;
+                       zone->compact_cached_migrate_pfn[0] = reset_migrate;
+                       zone->compact_cached_migrate_pfn[1] = reset_migrate;
+               }
 
-               clear_pageblock_skip(page);
+               /* Update the free PFN */
+               if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
+                   free_pfn > reset_free) {
+                       free_set = true;
+                       reset_free = free_pfn;
+                       zone->compact_init_free_pfn = reset_free;
+                       zone->compact_cached_free_pfn = reset_free;
+               }
        }
 
-       reset_cached_positions(zone);
+       /* Leave no distance between the scanners if no suitable block was reset */
+       if (reset_migrate >= reset_free) {
+               zone->compact_cached_migrate_pfn[0] = migrate_pfn;
+               zone->compact_cached_migrate_pfn[1] = migrate_pfn;
+               zone->compact_cached_free_pfn = free_pfn;
+       }
 }
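
The reset now walks the zone from both ends at once: the migrate candidate
climbs while the free candidate descends, and the loop stops when they cross,
so each scanner's restart point is found in at most half a zone walk. A toy
sketch of just the pointer movement (one block per step, an assumption made
for clarity; the real loop steps by pageblock_nr_pages):

    #include <stdio.h>

    int main(void)
    {
        /* Toy zone of 8 pageblocks */
        unsigned long migrate = 0, free = 8, steps = 0;

        for (; migrate < free; migrate++, free--)
            steps++;

        /* prints "met after 4 steps (migrate=4 free=4)" */
        printf("met after %lu steps (migrate=%lu free=%lu)\n",
               steps, migrate, free);
        return 0;
    }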
 
 void reset_isolation_suitable(pg_data_t *pgdat)
@@ -285,16 +373,54 @@ void reset_isolation_suitable(pg_data_t *pgdat)
        }
 }
 
+/*
+ * Sets the pageblock skip bit if it was clear. Note that this is a hint as
+ * locks are not required for readers/writers. Returns true if it was already set.
+ */
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+                                                       unsigned long pfn)
+{
+       bool skip;
+
+       /* Do not update if the skip hint is being ignored */
+       if (cc->ignore_skip_hint)
+               return false;
+
+       if (!IS_ALIGNED(pfn, pageblock_nr_pages))
+               return false;
+
+       skip = get_pageblock_skip(page);
+       if (!skip && !cc->no_set_skip_hint)
+               set_pageblock_skip(page);
+
+       return skip;
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+       struct zone *zone = cc->zone;
+
+       pfn = pageblock_end_pfn(pfn);
+
+       /* Set for isolation rather than compaction */
+       if (cc->no_set_skip_hint)
+               return;
+
+       if (pfn > zone->compact_cached_migrate_pfn[0])
+               zone->compact_cached_migrate_pfn[0] = pfn;
+       if (cc->mode != MIGRATE_ASYNC &&
+           pfn > zone->compact_cached_migrate_pfn[1])
+               zone->compact_cached_migrate_pfn[1] = pfn;
+}
+
 /*
  * If no pages were isolated then mark this pageblock to be skipped in the
  * future. The information is later cleared by __reset_isolation_suitable().
  */
 static void update_pageblock_skip(struct compact_control *cc,
-                       struct page *page, unsigned long nr_isolated,
-                       bool migrate_scanner)
+                       struct page *page, unsigned long pfn)
 {
        struct zone *zone = cc->zone;
-       unsigned long pfn;
 
        if (cc->no_set_skip_hint)
                return;
@@ -302,24 +428,11 @@ static void update_pageblock_skip(struct compact_control *cc,
        if (!page)
                return;
 
-       if (nr_isolated)
-               return;
-
        set_pageblock_skip(page);
 
-       pfn = page_to_pfn(page);
-
        /* Update where async and sync compaction should restart */
-       if (migrate_scanner) {
-               if (pfn > zone->compact_cached_migrate_pfn[0])
-                       zone->compact_cached_migrate_pfn[0] = pfn;
-               if (cc->mode != MIGRATE_ASYNC &&
-                   pfn > zone->compact_cached_migrate_pfn[1])
-                       zone->compact_cached_migrate_pfn[1] = pfn;
-       } else {
-               if (pfn < zone->compact_cached_free_pfn)
-                       zone->compact_cached_free_pfn = pfn;
-       }
+       if (pfn < zone->compact_cached_free_pfn)
+               zone->compact_cached_free_pfn = pfn;
 }
 #else
 static inline bool isolation_suitable(struct compact_control *cc,
@@ -334,32 +447,42 @@ static inline bool pageblock_skip_persistent(struct page *page)
 }
 
 static inline void update_pageblock_skip(struct compact_control *cc,
-                       struct page *page, unsigned long nr_isolated,
-                       bool migrate_scanner)
+                       struct page *page, unsigned long pfn)
+{
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+}
+
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+                                                       unsigned long pfn)
 {
+       return false;
 }
 #endif /* CONFIG_COMPACTION */
 
 /*
  * Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. For async compaction, back out if the lock cannot
- * be taken immediately. For sync compaction, spin on the lock if needed.
+ * very heavily contended. For async compaction, trylock and record if the
+ * lock is contended. The lock will still be acquired but compaction will
+ * abort when the current block is finished regardless of success rate.
+ * Sync compaction acquires the lock.
  *
- * Returns true if the lock is held
- * Returns false if the lock is not held and compaction should abort
+ * Always returns true, which makes it easier to track lock state in callers.
  */
-static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
+static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
                                                struct compact_control *cc)
 {
-       if (cc->mode == MIGRATE_ASYNC) {
-               if (!spin_trylock_irqsave(lock, *flags)) {
-                       cc->contended = true;
-                       return false;
-               }
-       } else {
-               spin_lock_irqsave(lock, *flags);
+       /* Track if the lock is contended in async mode */
+       if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
+               if (spin_trylock_irqsave(lock, *flags))
+                       return true;
+
+               cc->contended = true;
        }
 
+       spin_lock_irqsave(lock, *flags);
        return true;
 }
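
The shape of compact_lock_irqsave() is trylock first, note contention, then
block: the caller always ends up holding the lock and checks cc->contended
later to decide whether to finish the current block and abort. A pthread
sketch of the same pattern (illustrative only, not the kernel API):

    #include <pthread.h>
    #include <stdbool.h>

    struct ctl {
        bool async;      /* stands in for cc->mode == MIGRATE_ASYNC */
        bool contended;
    };

    /* Always returns with the lock held; contention is only recorded */
    static void compact_lock(pthread_mutex_t *lock, struct ctl *cc)
    {
        if (cc->async && !cc->contended) {
            if (pthread_mutex_trylock(lock) == 0)
                return;
            cc->contended = true; /* abort after the current block */
        }
        pthread_mutex_lock(lock);
    }

    int main(void)
    {
        pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        struct ctl cc = { .async = true, .contended = false };

        compact_lock(&lock, &cc); /* uncontended: trylock succeeds */
        pthread_mutex_unlock(&lock);
        return 0;
    }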
 
@@ -391,37 +514,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock,
                return true;
        }
 
-       if (need_resched()) {
-               if (cc->mode == MIGRATE_ASYNC) {
-                       cc->contended = true;
-                       return true;
-               }
-               cond_resched();
-       }
-
-       return false;
-}
-
-/*
- * Aside from avoiding lock contention, compaction also periodically checks
- * need_resched() and either schedules in sync compaction or aborts async
- * compaction. This is similar to what compact_unlock_should_abort() does, but
- * is used where no lock is concerned.
- *
- * Returns false when no scheduling was needed, or sync compaction scheduled.
- * Returns true when async compaction should abort.
- */
-static inline bool compact_should_abort(struct compact_control *cc)
-{
-       /* async compaction aborts if contended */
-       if (need_resched()) {
-               if (cc->mode == MIGRATE_ASYNC) {
-                       cc->contended = true;
-                       return true;
-               }
-
-               cond_resched();
-       }
+       cond_resched();
 
        return false;
 }
@@ -435,19 +528,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
                                unsigned long *start_pfn,
                                unsigned long end_pfn,
                                struct list_head *freelist,
+                               unsigned int stride,
                                bool strict)
 {
        int nr_scanned = 0, total_isolated = 0;
-       struct page *cursor, *valid_page = NULL;
+       struct page *cursor;
        unsigned long flags = 0;
        bool locked = false;
        unsigned long blockpfn = *start_pfn;
        unsigned int order;
 
+       /* Strict mode is for isolation, speed is secondary */
+       if (strict)
+               stride = 1;
+
        cursor = pfn_to_page(blockpfn);
 
        /* Isolate free pages. */
-       for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+       for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
                int isolated;
                struct page *page = cursor;
 
@@ -465,9 +563,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
                if (!pfn_valid_within(blockpfn))
                        goto isolate_fail;
 
-               if (!valid_page)
-                       valid_page = page;
-
                /*
                 * For compound pages such as THP and hugetlbfs, we can save
                 * potentially a lot of iterations if we skip them at once.
@@ -495,18 +590,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
                 * recheck as well.
                 */
                if (!locked) {
-                       /*
-                        * The zone lock must be held to isolate freepages.
-                        * Unfortunately this is a very coarse lock and can be
-                        * heavily contended if there are parallel allocations
-                        * or parallel compactions. For async compaction do not
-                        * spin on the lock and we acquire the lock as late as
-                        * possible.
-                        */
-                       locked = compact_trylock_irqsave(&cc->zone->lock,
+                       locked = compact_lock_irqsave(&cc->zone->lock,
                                                                &flags, cc);
-                       if (!locked)
-                               break;
 
                        /* Recheck this is a buddy page under lock */
                        if (!PageBuddy(page))
@@ -565,10 +650,6 @@ isolate_fail:
        if (strict && blockpfn < end_pfn)
                total_isolated = 0;
 
-       /* Update the pageblock-skip if the whole pageblock was scanned */
-       if (blockpfn == end_pfn)
-               update_pageblock_skip(cc, valid_page, total_isolated, false);
-
        cc->total_free_scanned += nr_scanned;
        if (total_isolated)
                count_compact_events(COMPACTISOLATED, total_isolated);
@@ -626,7 +707,7 @@ isolate_freepages_range(struct compact_control *cc,
                        break;
 
                isolated = isolate_freepages_block(cc, &isolate_start_pfn,
-                                               block_end_pfn, &freelist, true);
+                                       block_end_pfn, &freelist, 0, true);
 
                /*
                 * In strict mode, isolate_freepages_block() returns 0 if
@@ -644,7 +725,7 @@ isolate_freepages_range(struct compact_control *cc,
        }
 
        /* __isolate_free_page() does not map the pages */
-       map_pages(&freelist);
+       split_map_pages(&freelist);
 
        if (pfn < end_pfn) {
                /* Loop terminated early, cleanup. */
@@ -657,16 +738,16 @@ isolate_freepages_range(struct compact_control *cc,
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
-static bool too_many_isolated(struct zone *zone)
+static bool too_many_isolated(pg_data_t *pgdat)
 {
        unsigned long active, inactive, isolated;
 
-       inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
-                       node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
-       active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
-                       node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
-       isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
-                       node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
+       inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
+                       node_page_state(pgdat, NR_INACTIVE_ANON);
+       active = node_page_state(pgdat, NR_ACTIVE_FILE) +
+                       node_page_state(pgdat, NR_ACTIVE_ANON);
+       isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
+                       node_page_state(pgdat, NR_ISOLATED_ANON);
 
        return isolated > (inactive + active) / 2;
 }
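
The throttle trips once isolated LRU pages exceed half of the node's
remaining active plus inactive pages. A worked example with made-up counter
values:

    #include <stdbool.h>
    #include <stdio.h>

    /* Same predicate as above, with hypothetical node counters in pages */
    static bool too_many_isolated(unsigned long inactive, unsigned long active,
                                  unsigned long isolated)
    {
        return isolated > (inactive + active) / 2;
    }

    int main(void)
    {
        /* 300000 LRU pages on the node: the limit is 150000 isolated */
        printf("%d\n", too_many_isolated(200000, 100000, 140000)); /* 0 */
        printf("%d\n", too_many_isolated(200000, 100000, 160000)); /* 1 */
        return 0;
    }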
@@ -693,7 +774,7 @@ static unsigned long
 isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        unsigned long end_pfn, isolate_mode_t isolate_mode)
 {
-       struct zone *zone = cc->zone;
+       pg_data_t *pgdat = cc->zone->zone_pgdat;
        unsigned long nr_scanned = 0, nr_isolated = 0;
        struct lruvec *lruvec;
        unsigned long flags = 0;
@@ -702,13 +783,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
        unsigned long start_pfn = low_pfn;
        bool skip_on_failure = false;
        unsigned long next_skip_pfn = 0;
+       bool skip_updated = false;
 
        /*
         * Ensure that there are not too many pages isolated from the LRU
         * list by either parallel reclaimers or compaction. If there are,
         * delay for some time until fewer pages are isolated
         */
-       while (unlikely(too_many_isolated(zone))) {
+       while (unlikely(too_many_isolated(pgdat))) {
                /* async migration should just abort */
                if (cc->mode == MIGRATE_ASYNC)
                        return 0;
@@ -719,8 +801,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        return 0;
        }
 
-       if (compact_should_abort(cc))
-               return 0;
+       cond_resched();
 
        if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
                skip_on_failure = true;
@@ -758,8 +839,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                 * if contended.
                 */
                if (!(low_pfn % SWAP_CLUSTER_MAX)
-                   && compact_unlock_should_abort(zone_lru_lock(zone), flags,
-                                                               &locked, cc))
+                   && compact_unlock_should_abort(&pgdat->lru_lock,
+                                           flags, &locked, cc))
                        break;
 
                if (!pfn_valid_within(low_pfn))
@@ -768,8 +849,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
                page = pfn_to_page(low_pfn);
 
-               if (!valid_page)
+               /*
+                * Check if the pageblock has already been marked skipped.
+                * Only the aligned PFN is checked as the caller isolates
+                * COMPACT_CLUSTER_MAX at a time so the second call must
+                * not falsely conclude that the block should be skipped.
+                */
+               if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
+                       if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
+                               low_pfn = end_pfn;
+                               goto isolate_abort;
+                       }
                        valid_page = page;
+               }
 
                /*
                 * Skip if free. We read page order here without zone lock
@@ -818,7 +910,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        if (unlikely(__PageMovable(page)) &&
                                        !PageIsolated(page)) {
                                if (locked) {
-                                       spin_unlock_irqrestore(zone_lru_lock(zone),
+                                       spin_unlock_irqrestore(&pgdat->lru_lock,
                                                                        flags);
                                        locked = false;
                                }
@@ -848,10 +940,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
                /* If we already hold the lock, we can skip some rechecking */
                if (!locked) {
-                       locked = compact_trylock_irqsave(zone_lru_lock(zone),
+                       locked = compact_lock_irqsave(&pgdat->lru_lock,
                                                                &flags, cc);
-                       if (!locked)
-                               break;
+
+                       /* Try to get exclusive access under lock */
+                       if (!skip_updated) {
+                               skip_updated = true;
+                               if (test_and_set_skip(cc, page, low_pfn))
+                                       goto isolate_abort;
+                       }
 
                        /* Recheck PageLRU and PageCompound under lock */
                        if (!PageLRU(page))
@@ -868,7 +965,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        }
                }
 
-               lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+               lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
                /* Try isolate the page */
                if (__isolate_lru_page(page, isolate_mode) != 0)
@@ -887,16 +984,13 @@ isolate_success:
                nr_isolated++;
 
                /*
-                * Record where we could have freed pages by migration and not
-                * yet flushed them to buddy allocator.
-                * - this is the lowest page that was isolated and likely be
-                * then freed by migration.
+                * Avoid isolating too much unless this block is being
+                * rescanned (e.g. dirty/writeback pages, parallel allocation)
+                * or a lock is contended. For contention, isolate quickly to
+                * potentially remove one source of contention.
                 */
-               if (!cc->last_migrated_pfn)
-                       cc->last_migrated_pfn = low_pfn;
-
-               /* Avoid isolating too much */
-               if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+               if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
+                   !cc->rescan && !cc->contended) {
                        ++low_pfn;
                        break;
                }
@@ -913,12 +1007,11 @@ isolate_fail:
                 */
                if (nr_isolated) {
                        if (locked) {
-                               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+                               spin_unlock_irqrestore(&pgdat->lru_lock, flags);
                                locked = false;
                        }
                        putback_movable_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
-                       cc->last_migrated_pfn = 0;
                        nr_isolated = 0;
                }
 
@@ -939,15 +1032,23 @@ isolate_fail:
        if (unlikely(low_pfn > end_pfn))
                low_pfn = end_pfn;
 
+isolate_abort:
        if (locked)
-               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+               spin_unlock_irqrestore(&pgdat->lru_lock, flags);
 
        /*
-        * Update the pageblock-skip information and cached scanner pfn,
-        * if the whole pageblock was scanned without isolating any page.
+        * Update the cached scanner pfn once the pageblock has been scanned.
+        * Pages will either be migrated, in which case there is no point
+        * scanning in the near future, or migration failed, in which case
+        * the failure reason may persist. The block is marked for skipping
+        * if there were no pages isolated in the block or if the block is
+        * rescanned twice in a row.
         */
-       if (low_pfn == end_pfn)
-               update_pageblock_skip(cc, valid_page, nr_isolated, true);
+       if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
+               if (valid_page && !skip_updated)
+                       set_pageblock_skip(valid_page);
+               update_cached_migrate(cc, low_pfn);
+       }
 
        trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
                                                nr_scanned, nr_isolated);
@@ -1013,6 +1114,9 @@ static bool suitable_migration_source(struct compact_control *cc,
 {
        int block_mt;
 
+       if (pageblock_skip_persistent(page))
+               return false;
+
        if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
                return true;
 
@@ -1050,6 +1154,12 @@ static bool suitable_migration_target(struct compact_control *cc,
        return false;
 }
 
+static inline unsigned int
+freelist_scan_limit(struct compact_control *cc)
+{
+       return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1;
+}
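
freelist_scan_limit() halves the search budget with every consecutive
fast-search failure and bottoms out at one entry. Assuming COMPACT_CLUSTER_MAX
is 32 (it equals SWAP_CLUSTER_MAX on common configs), the decay looks like
this:

    #include <stdio.h>

    #define COMPACT_CLUSTER_MAX 32 /* assumed value of SWAP_CLUSTER_MAX */

    int main(void)
    {
        unsigned int fail;

        /* prints 33 17 9 5 3 2 1 */
        for (fail = 0; fail <= 6; fail++)
            printf("%u ", (COMPACT_CLUSTER_MAX >> fail) + 1);
        printf("\n");
        return 0;
    }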
+
 /*
  * Test whether the free scanner has reached the same or lower pageblock than
  * the migration scanner, and compaction should thus terminate.
@@ -1060,6 +1170,248 @@ static inline bool compact_scanners_met(struct compact_control *cc)
                <= (cc->migrate_pfn >> pageblock_order);
 }
 
+/*
+ * Used when scanning for a suitable migration target which scans freelists
+ * in reverse. Reorders the list such that the unscanned pages are scanned
+ * first on the next iteration of the free scanner.
+ */
+static void
+move_freelist_head(struct list_head *freelist, struct page *freepage)
+{
+       LIST_HEAD(sublist);
+
+       if (!list_is_last(freelist, &freepage->lru)) {
+               list_cut_before(&sublist, freelist, &freepage->lru);
+               if (!list_empty(&sublist))
+                       list_splice_tail(&sublist, freelist);
+       }
+}
+
+/*
+ * Similar to move_freelist_head except used by the migration scanner
+ * when scanning forward. It's possible for these list operations to
+ * move against each other if they search the free list exactly in
+ * lockstep.
+ */
+static void
+move_freelist_tail(struct list_head *freelist, struct page *freepage)
+{
+       LIST_HEAD(sublist);
+
+       if (!list_is_first(freelist, &freepage->lru)) {
+               list_cut_position(&sublist, freelist, &freepage->lru);
+               if (!list_empty(&sublist))
+                       list_splice_tail(&sublist, freelist);
+       }
+}
+
+static void
+fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
+{
+       unsigned long start_pfn, end_pfn;
+       struct page *page = pfn_to_page(pfn);
+
+       /* Do not search around if there are enough pages already */
+       if (cc->nr_freepages >= cc->nr_migratepages)
+               return;
+
+       /* Minimise scanning during async compaction */
+       if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC)
+               return;
+
+       /* Pageblock boundaries */
+       start_pfn = pageblock_start_pfn(pfn);
+       end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone));
+
+       /* Scan before */
+       if (start_pfn != pfn) {
+               isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false);
+               if (cc->nr_freepages >= cc->nr_migratepages)
+                       return;
+       }
+
+       /* Scan after */
+       start_pfn = pfn + nr_isolated;
+       if (start_pfn != end_pfn)
+               isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+
+       /* Skip this pageblock in the future as it's full or nearly full */
+       if (cc->nr_freepages < cc->nr_migratepages)
+               set_pageblock_skip(page);
+}
+
+/* Search orders in round-robin fashion */
+static int next_search_order(struct compact_control *cc, int order)
+{
+       order--;
+       if (order < 0)
+               order = cc->order - 1;
+
+       /* Search wrapped around? */
+       if (order == cc->search_order) {
+               cc->search_order--;
+               if (cc->search_order < 0)
+                       cc->search_order = cc->order - 1;
+               return -1;
+       }
+
+       return order;
+}
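
next_search_order() walks the orders downwards from cc->search_order, wraps
to cc->order - 1, and returns -1 once the walk arrives back where it started.
A standalone sketch of one full rotation, assuming cc->order = 5 and a
starting search_order of 2:

    #include <stdio.h>

    struct cc { int order, search_order; };

    /* Copy of the round-robin helper above, for illustration */
    static int next_search_order(struct cc *cc, int order)
    {
        order--;
        if (order < 0)
            order = cc->order - 1;

        if (order == cc->search_order) { /* wrapped around */
            cc->search_order--;
            if (cc->search_order < 0)
                cc->search_order = cc->order - 1;
            return -1;
        }
        return order;
    }

    int main(void)
    {
        struct cc cc = { .order = 5, .search_order = 2 }; /* assumed */
        int order;

        for (order = cc.search_order; order >= 0;
             order = next_search_order(&cc, order))
            printf("%d ", order); /* prints "2 1 0 4 3" */
        printf("\n");
        return 0;
    }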
+
+static unsigned long
+fast_isolate_freepages(struct compact_control *cc)
+{
+       unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
+       unsigned int nr_scanned = 0;
+       unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
+       unsigned long nr_isolated = 0;
+       unsigned long distance;
+       struct page *page = NULL;
+       bool scan_start = false;
+       int order;
+
+       /* Full compaction passes in a negative order */
+       if (cc->order <= 0)
+               return cc->free_pfn;
+
+       /*
+        * If starting the scan, use a deeper search and fall back to the
+        * highest PFN found if a suitable one is not found.
+        */
+       if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
+               limit = pageblock_nr_pages >> 1;
+               scan_start = true;
+       }
+
+       /*
+        * Preferred point is in the top quarter of the scan space but take
+        * a pfn from the top half if the search is problematic.
+        */
+       distance = (cc->free_pfn - cc->migrate_pfn);
+       low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
+       min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));
+
+       if (WARN_ON_ONCE(min_pfn > low_pfn))
+               low_pfn = min_pfn;
+
+       /*
+        * Search starts from the last successful isolation order or the next
+        * order to search after a previous failure
+        */
+       cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);
+
+       for (order = cc->search_order;
+            !page && order >= 0;
+            order = next_search_order(cc, order)) {
+               struct free_area *area = &cc->zone->free_area[order];
+               struct list_head *freelist;
+               struct page *freepage;
+               unsigned long flags;
+               unsigned int order_scanned = 0;
+
+               if (!area->nr_free)
+                       continue;
+
+               spin_lock_irqsave(&cc->zone->lock, flags);
+               freelist = &area->free_list[MIGRATE_MOVABLE];
+               list_for_each_entry_reverse(freepage, freelist, lru) {
+                       unsigned long pfn;
+
+                       order_scanned++;
+                       nr_scanned++;
+                       pfn = page_to_pfn(freepage);
+
+                       if (pfn >= highest)
+                               highest = pageblock_start_pfn(pfn);
+
+                       if (pfn >= low_pfn) {
+                               cc->fast_search_fail = 0;
+                               cc->search_order = order;
+                               page = freepage;
+                               break;
+                       }
+
+                       if (pfn >= min_pfn && pfn > high_pfn) {
+                               high_pfn = pfn;
+
+                               /* Shorten the scan if a candidate is found */
+                               limit >>= 1;
+                       }
+
+                       if (order_scanned >= limit)
+                               break;
+               }
+
+               /* Use a minimum pfn if a preferred one was not found */
+               if (!page && high_pfn) {
+                       page = pfn_to_page(high_pfn);
+
+                       /* Update freepage for the list reorder below */
+                       freepage = page;
+               }
+
+               /* Reorder so that a future search skips recent pages */
+               move_freelist_head(freelist, freepage);
+
+               /* Isolate the page if available */
+               if (page) {
+                       if (__isolate_free_page(page, order)) {
+                               set_page_private(page, order);
+                               nr_isolated = 1 << order;
+                               cc->nr_freepages += nr_isolated;
+                               list_add_tail(&page->lru, &cc->freepages);
+                               count_compact_events(COMPACTISOLATED, nr_isolated);
+                       } else {
+                               /* If isolation fails, abort the search */
+                               order = -1;
+                               page = NULL;
+                       }
+               }
+
+               spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+               /*
+                * Smaller scan on next order so the total scan is related
+                * to freelist_scan_limit.
+                */
+               if (order_scanned >= limit)
+                       limit = max(1U, limit >> 1);
+       }
+
+       if (!page) {
+               cc->fast_search_fail++;
+               if (scan_start) {
+                       /*
+                        * Use the highest PFN found above min. If one was
+                        * not found, be pessimistic for direct compaction
+                        * and use the min mark.
+                        */
+                       if (highest) {
+                               page = pfn_to_page(highest);
+                               cc->free_pfn = highest;
+                       } else {
+                               if (cc->direct_compaction) {
+                                       page = pfn_to_page(min_pfn);
+                                       cc->free_pfn = min_pfn;
+                               }
+                       }
+               }
+       }
+
+       if (highest && highest >= cc->zone->compact_cached_free_pfn) {
+               highest -= pageblock_nr_pages;
+               cc->zone->compact_cached_free_pfn = highest;
+       }
+
+       cc->total_free_scanned += nr_scanned;
+       if (!page)
+               return cc->free_pfn;
+
+       low_pfn = page_to_pfn(page);
+       fast_isolate_around(cc, low_pfn, nr_isolated);
+       return low_pfn;
+}
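
The window arithmetic near the top of fast_isolate_freepages(): candidates
are preferred from the top quarter of the span between the scanners, with the
top half as the fallback. A worked example with assumed scanner positions
(pageblock rounding omitted for clarity):

    #include <stdio.h>

    int main(void)
    {
        /* Assumed scanner positions in page frame numbers */
        unsigned long free_pfn = 2000000, migrate_pfn = 1000000;
        unsigned long distance = free_pfn - migrate_pfn;

        /* prints "low_pfn ~ 1750000, min_pfn ~ 1500000" */
        printf("low_pfn ~ %lu, min_pfn ~ %lu\n",
               free_pfn - (distance >> 2), free_pfn - (distance >> 1));
        return 0;
    }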
+
 /*
  * Based on information in the current compact_control, find blocks
  * suitable for isolating free pages from and then isolate them.
@@ -1073,6 +1425,12 @@ static void isolate_freepages(struct compact_control *cc)
        unsigned long block_end_pfn;    /* end of current pageblock */
        unsigned long low_pfn;       /* lowest pfn scanner is able to scan */
        struct list_head *freelist = &cc->freepages;
+       unsigned int stride;
+
+       /* Try a small search of the free lists for a candidate */
+       isolate_start_pfn = fast_isolate_freepages(cc);
+       if (cc->nr_freepages)
+               goto splitmap;
 
        /*
         * Initialise the free scanner. The starting point is where we last
@@ -1086,10 +1444,11 @@ static void isolate_freepages(struct compact_control *cc)
         * is using.
         */
        isolate_start_pfn = cc->free_pfn;
-       block_start_pfn = pageblock_start_pfn(cc->free_pfn);
+       block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
        block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
                                                zone_end_pfn(zone));
        low_pfn = pageblock_end_pfn(cc->migrate_pfn);
+       stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;
 
        /*
         * Isolate free pages until enough are available to migrate the
@@ -1100,14 +1459,14 @@ static void isolate_freepages(struct compact_control *cc)
                                block_end_pfn = block_start_pfn,
                                block_start_pfn -= pageblock_nr_pages,
                                isolate_start_pfn = block_start_pfn) {
+               unsigned long nr_isolated;
+
                /*
                 * This can iterate a massively long zone without finding any
-                * suitable migration targets, so periodically check if we need
-                * to schedule, or even abort async compaction.
+                * suitable migration targets, so periodically check resched.
                 */
-               if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
-                                               && compact_should_abort(cc))
-                       break;
+               if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+                       cond_resched();
 
                page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
                                                                        zone);
@@ -1123,15 +1482,15 @@ static void isolate_freepages(struct compact_control *cc)
                        continue;
 
                /* Found a block suitable for isolating free pages from. */
-               isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
-                                       freelist, false);
+               nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+                                       block_end_pfn, freelist, stride, false);
 
-               /*
-                * If we isolated enough freepages, or aborted due to lock
-                * contention, terminate.
-                */
-               if ((cc->nr_freepages >= cc->nr_migratepages)
-                                                       || cc->contended) {
+               /* Update the skip hint if the full pageblock was scanned */
+               if (isolate_start_pfn == block_end_pfn)
+                       update_pageblock_skip(cc, page, block_start_pfn);
+
+               /* Are enough freepages isolated? */
+               if (cc->nr_freepages >= cc->nr_migratepages) {
                        if (isolate_start_pfn >= block_end_pfn) {
                                /*
                                 * Restart at previous pageblock if more
@@ -1148,10 +1507,14 @@ static void isolate_freepages(struct compact_control *cc)
                         */
                        break;
                }
-       }
 
-       /* __isolate_free_page() does not map the pages */
-       map_pages(freelist);
+               /* Adjust stride depending on isolation */
+               if (nr_isolated) {
+                       stride = 1;
+                       continue;
+               }
+               stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
+       }
 
        /*
         * Record where the free scanner will restart next time. Either we
@@ -1160,6 +1523,10 @@ static void isolate_freepages(struct compact_control *cc)
         * and the loop terminated due to isolate_start_pfn < low_pfn
         */
        cc->free_pfn = isolate_start_pfn;
+
+splitmap:
+       /* __isolate_free_page() does not map the pages */
+       split_map_pages(freelist);
 }
 
 /*
@@ -1172,13 +1539,8 @@ static struct page *compaction_alloc(struct page *migratepage,
        struct compact_control *cc = (struct compact_control *)data;
        struct page *freepage;
 
-       /*
-        * Isolate free pages if necessary, and if we are not aborting due to
-        * contention.
-        */
        if (list_empty(&cc->freepages)) {
-               if (!cc->contended)
-                       isolate_freepages(cc);
+               isolate_freepages(cc);
 
                if (list_empty(&cc->freepages))
                        return NULL;
@@ -1217,6 +1579,147 @@ typedef enum {
  */
 int sysctl_compact_unevictable_allowed __read_mostly = 1;
 
+static inline void
+update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
+{
+       if (cc->fast_start_pfn == ULONG_MAX)
+               return;
+
+       if (!cc->fast_start_pfn)
+               cc->fast_start_pfn = pfn;
+
+       cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
+}
+
+static inline unsigned long
+reinit_migrate_pfn(struct compact_control *cc)
+{
+       if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
+               return cc->migrate_pfn;
+
+       cc->migrate_pfn = cc->fast_start_pfn;
+       cc->fast_start_pfn = ULONG_MAX;
+
+       return cc->migrate_pfn;
+}
+
+/*
+ * Briefly search the free lists for a migration source that already has
+ * some free pages to reduce the number of pages that need migration
+ * before a pageblock is free.
+ */
+static unsigned long fast_find_migrateblock(struct compact_control *cc)
+{
+       unsigned int limit = freelist_scan_limit(cc);
+       unsigned int nr_scanned = 0;
+       unsigned long distance;
+       unsigned long pfn = cc->migrate_pfn;
+       unsigned long high_pfn;
+       int order;
+
+       /* Skip hints are relied on to avoid repeats on the fast search */
+       if (cc->ignore_skip_hint)
+               return pfn;
+
+       /*
+        * If the migrate_pfn is not at the start of a zone or the start
+        * of a pageblock then assume this is a continuation of a previous
+        * scan restarted due to COMPACT_CLUSTER_MAX.
+        */
+       if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
+               return pfn;
+
+       /*
+        * For smaller orders, just linearly scan as the number of pages
+        * to migrate should be relatively small and does not necessarily
+        * justify freeing up a large block for a small allocation.
+        */
+       if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
+               return pfn;
+
+       /*
+        * Only allow kcompactd and direct requests for movable pages to
+        * quickly clear out a MOVABLE pageblock for allocation. This
+        * reduces the risk that a large movable pageblock is freed for
+        * an unmovable/reclaimable small allocation.
+        */
+       if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
+               return pfn;
+
+       /*
+        * When starting the migration scanner, pick any pageblock within the
+        * first half of the search space. Otherwise try and pick a pageblock
+        * within the first eighth to reduce the chances that a migration
+        * target later becomes a source.
+        */
+       distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
+       if (cc->migrate_pfn != cc->zone->zone_start_pfn)
+               distance >>= 2;
+       high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
+
+       for (order = cc->order - 1;
+            order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
+            order--) {
+               struct free_area *area = &cc->zone->free_area[order];
+               struct list_head *freelist;
+               unsigned long flags;
+               struct page *freepage;
+
+               if (!area->nr_free)
+                       continue;
+
+               spin_lock_irqsave(&cc->zone->lock, flags);
+               freelist = &area->free_list[MIGRATE_MOVABLE];
+               list_for_each_entry(freepage, freelist, lru) {
+                       unsigned long free_pfn;
+
+                       nr_scanned++;
+                       free_pfn = page_to_pfn(freepage);
+                       if (free_pfn < high_pfn) {
+                               /*
+                                * Avoid if skipped recently. Ideally it would
+                                * move to the tail but even safe iteration of
+                                * the list assumes an entry is deleted, not
+                                * reordered.
+                                */
+                               if (get_pageblock_skip(freepage)) {
+                                       if (list_is_last(freelist, &freepage->lru))
+                                               break;
+
+                                       continue;
+                               }
+
+                               /* Reorder so that a future search skips recent pages */
+                               move_freelist_tail(freelist, freepage);
+
+                               update_fast_start_pfn(cc, free_pfn);
+                               pfn = pageblock_start_pfn(free_pfn);
+                               cc->fast_search_fail = 0;
+                               set_pageblock_skip(freepage);
+                               break;
+                       }
+
+                       if (nr_scanned >= limit) {
+                               cc->fast_search_fail++;
+                               move_freelist_tail(freelist, freepage);
+                               break;
+                       }
+               }
+               spin_unlock_irqrestore(&cc->zone->lock, flags);
+       }
+
+       cc->total_migrate_scanned += nr_scanned;
+
+       /*
+        * If fast scanning failed then use a cached entry for a page block
+        * that had free pages as the basis for starting a linear scan.
+        */
+       if (pfn == cc->migrate_pfn)
+               pfn = reinit_migrate_pfn(cc);
+
+       return pfn;
+}
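
The migration-side search window mirrors the free-side one: half the span
between the scanners when starting at the zone boundary, narrowed to an
eighth on a continued scan. Under the same kind of assumed PFNs:

    #include <stdio.h>

    int main(void)
    {
        /* Assumed positions; zone_start picked so the scan is a restart */
        unsigned long free_pfn = 2000000, migrate_pfn = 1200000;
        unsigned long zone_start = 1000000;
        unsigned long distance = (free_pfn - migrate_pfn) >> 1;

        if (migrate_pfn != zone_start)
            distance >>= 2; /* continued scan: first eighth only */

        /* prints "search up to pfn 1300000" */
        printf("search up to pfn %lu\n", migrate_pfn + distance);
        return 0;
    }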
+
 /*
  * Isolate all pages that can be migrated from the first suitable block,
  * starting at the block pointed to by the migrate scanner pfn within
@@ -1232,16 +1735,25 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
        const isolate_mode_t isolate_mode =
                (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
                (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
+       bool fast_find_block;
 
        /*
         * Start at where we last stopped, or beginning of the zone as
-        * initialized by compact_zone()
+        * initialized by compact_zone(). The first failure will use
+        * the lowest PFN as the starting point for linear scanning.
         */
-       low_pfn = cc->migrate_pfn;
+       low_pfn = fast_find_migrateblock(cc);
        block_start_pfn = pageblock_start_pfn(low_pfn);
        if (block_start_pfn < zone->zone_start_pfn)
                block_start_pfn = zone->zone_start_pfn;
 
+       /*
+        * fast_find_migrateblock() marks a pageblock as skipped, so to
+        * avoid the isolation_suitable() check below, record whether the
+        * fast search was successful.
+        */
+       fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
+
        /* Only scan within a pageblock boundary */
        block_end_pfn = pageblock_end_pfn(low_pfn);
 
@@ -1250,6 +1762,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
         * Do not cross the free scanner.
         */
        for (; block_end_pfn <= cc->free_pfn;
+                       fast_find_block = false,
                        low_pfn = block_end_pfn,
                        block_start_pfn = block_end_pfn,
                        block_end_pfn += pageblock_nr_pages) {
@@ -1257,34 +1770,45 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                /*
                 * This can potentially iterate a massively long zone with
                 * many pageblocks unsuitable, so periodically check if we
-                * need to schedule, or even abort async compaction.
+                * need to schedule.
                 */
-               if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
-                                               && compact_should_abort(cc))
-                       break;
+               if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+                       cond_resched();
 
                page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
                                                                        zone);
                if (!page)
                        continue;
 
-               /* If isolation recently failed, do not retry */
-               if (!isolation_suitable(cc, page))
+               /*
+                * If isolation recently failed, do not retry. Only check the
+                * pageblock once, as COMPACT_CLUSTER_MAX causes a pageblock
+                * to be visited multiple times. Assume the skip hint was
+                * checked before the block was marked "skip", so that other
+                * compaction instances do not scan the same block.
+                */
+               if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
+                   !fast_find_block && !isolation_suitable(cc, page))
                        continue;
 
                /*
-                * For async compaction, also only scan in MOVABLE blocks.
-                * Async compaction is optimistic to see if the minimum amount
-                * of work satisfies the allocation.
+                * For async compaction, also only scan in MOVABLE blocks
+                * without huge pages. Async compaction is optimistic to see
+                * if the minimum amount of work satisfies the allocation.
+                * The cached PFN is updated as it's possible that all
+                * remaining blocks between source and target are unsuitable
+                * and the compaction scanners fail to meet.
                 */
-               if (!suitable_migration_source(cc, page))
+               if (!suitable_migration_source(cc, page)) {
+                       update_cached_migrate(cc, block_end_pfn);
                        continue;
+               }
 
                /* Perform the isolation */
                low_pfn = isolate_migratepages_block(cc, low_pfn,
                                                block_end_pfn, isolate_mode);
 
-               if (!low_pfn || cc->contended)
+               if (!low_pfn)
                        return ISOLATE_ABORT;
 
                /*
@@ -1310,19 +1834,16 @@ static inline bool is_via_compact_memory(int order)
        return order == -1;
 }
 
-static enum compact_result __compact_finished(struct zone *zone,
-                                               struct compact_control *cc)
+static enum compact_result __compact_finished(struct compact_control *cc)
 {
        unsigned int order;
        const int migratetype = cc->migratetype;
-
-       if (cc->contended || fatal_signal_pending(current))
-               return COMPACT_CONTENDED;
+       int ret;
 
        /* Compaction run completes if the migrate and free scanner meet */
        if (compact_scanners_met(cc)) {
                /* Let the next compaction start anew. */
-               reset_cached_positions(zone);
+               reset_cached_positions(cc->zone);
 
                /*
                 * Mark that the PG_migrate_skip information should be cleared
@@ -1331,7 +1852,7 @@ static enum compact_result __compact_finished(struct zone *zone,
                 * based on an allocation request.
                 */
                if (cc->direct_compaction)
-                       zone->compact_blockskip_flush = true;
+                       cc->zone->compact_blockskip_flush = true;
 
                if (cc->whole_zone)
                        return COMPACT_COMPLETE;
@@ -1342,20 +1863,19 @@ static enum compact_result __compact_finished(struct zone *zone,
        if (is_via_compact_memory(cc->order))
                return COMPACT_CONTINUE;
 
-       if (cc->finishing_block) {
-               /*
-                * We have finished the pageblock, but better check again that
-                * we really succeeded.
-                */
-               if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
-                       cc->finishing_block = false;
-               else
-                       return COMPACT_CONTINUE;
-       }
+       /*
+        * Always finish scanning a pageblock to reduce the possibility of
+        * fallbacks in the future. This is particularly important when
+        * the migration source is unmovable/reclaimable but it's not worth
+        * special casing.
+        */
+       if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+               return COMPACT_CONTINUE;
 
        /* Direct compactor: Is a suitable page free? */
+       ret = COMPACT_NO_SUITABLE_PAGE;
        for (order = cc->order; order < MAX_ORDER; order++) {
-               struct free_area *area = &zone->free_area[order];
+               struct free_area *area = &cc->zone->free_area[order];
                bool can_steal;
 
                /* Job done if page is free of the right migratetype */
@@ -1393,21 +1913,23 @@ static enum compact_result __compact_finished(struct zone *zone,
                                return COMPACT_SUCCESS;
                        }
 
-                       cc->finishing_block = true;
-                       return COMPACT_CONTINUE;
+                       ret = COMPACT_CONTINUE;
+                       break;
                }
        }
 
-       return COMPACT_NO_SUITABLE_PAGE;
+       if (cc->contended || fatal_signal_pending(current))
+               ret = COMPACT_CONTENDED;
+
+       return ret;
 }
 
-static enum compact_result compact_finished(struct zone *zone,
-                       struct compact_control *cc)
+static enum compact_result compact_finished(struct compact_control *cc)
 {
        int ret;
 
-       ret = __compact_finished(zone, cc);
-       trace_mm_compaction_finished(zone, cc->order, ret);
+       ret = __compact_finished(cc);
+       trace_mm_compaction_finished(cc->zone, cc->order, ret);
        if (ret == COMPACT_NO_SUITABLE_PAGE)
                ret = COMPACT_CONTINUE;
 
@@ -1534,15 +2056,18 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
        return false;
 }
 
-static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
+static enum compact_result
+compact_zone(struct compact_control *cc, struct capture_control *capc)
 {
        enum compact_result ret;
-       unsigned long start_pfn = zone->zone_start_pfn;
-       unsigned long end_pfn = zone_end_pfn(zone);
+       unsigned long start_pfn = cc->zone->zone_start_pfn;
+       unsigned long end_pfn = zone_end_pfn(cc->zone);
+       unsigned long last_migrated_pfn;
        const bool sync = cc->mode != MIGRATE_ASYNC;
+       bool update_cached;
 
        cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
-       ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
+       ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
                                                        cc->classzone_idx);
        /* Compaction is likely to fail */
        if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
@@ -1555,8 +2080,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
         * Clear pageblock skip if there were failures recently and compaction
         * is about to be retried after being deferred.
         */
-       if (compaction_restarting(zone, cc->order))
-               __reset_isolation_suitable(zone);
+       if (compaction_restarting(cc->zone, cc->order))
+               __reset_isolation_suitable(cc->zone);
 
        /*
         * Setup to move all movable pages to the end of the zone. Used cached
@@ -1564,43 +2089,76 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
         * want to compact the whole zone), but check that it is initialised
         * by ensuring the values are within zone boundaries.
         */
+       cc->fast_start_pfn = 0;
        if (cc->whole_zone) {
                cc->migrate_pfn = start_pfn;
                cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
        } else {
-               cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
-               cc->free_pfn = zone->compact_cached_free_pfn;
+               cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
+               cc->free_pfn = cc->zone->compact_cached_free_pfn;
                if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
                        cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
-                       zone->compact_cached_free_pfn = cc->free_pfn;
+                       cc->zone->compact_cached_free_pfn = cc->free_pfn;
                }
                if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
                        cc->migrate_pfn = start_pfn;
-                       zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
-                       zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+                       cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+                       cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
                }
 
-               if (cc->migrate_pfn == start_pfn)
+               if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
                        cc->whole_zone = true;
        }
 
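Aside, not part of the patch: the cached scanner positions may be stale, so each is reset to its zone boundary when it falls outside [start_pfn, end_pfn); the patch additionally treats any migration start at or before compact_init_migrate_pfn as covering the whole zone. A compact model of the clamping, with plain variables in place of the zone fields (the pageblock alignment of the free-scanner reset is omitted):

#include <stdbool.h>

static void clamp_scanners(unsigned long start, unsigned long end,
			   unsigned long init_migrate,
			   unsigned long *migrate, unsigned long *free,
			   bool *whole_zone)
{
	if (*free < start || *free >= end)
		*free = end - 1;	/* real code: pageblock_start_pfn(end - 1) */
	if (*migrate < start || *migrate >= end)
		*migrate = start;

	/* Starting early enough is as good as covering the whole zone. */
	if (*migrate <= init_migrate)
		*whole_zone = true;
}
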
-       cc->last_migrated_pfn = 0;
+       last_migrated_pfn = 0;
+
+       /*
+        * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
+        * the basis that some migrations will fail in ASYNC mode. However,
+        * if the cached PFNs match and pageblocks are skipped due to having
+        * no isolation candidates, then the sync state does not matter.
+        * Until a pageblock with isolation candidates is found, keep the
+        * cached PFNs in sync to avoid revisiting the same blocks.
+        */
+       update_cached = !sync &&
+               cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
 
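Aside, not part of the patch: slot 0 of compact_cached_migrate_pfn serves async compaction and slot 1 sync (the array is indexed by the sync bool above), because async passes give up on pages, e.g. dirty or under-writeback ones, that sync passes can still migrate. While pageblocks are rejected for having no candidates at all, that distinction is moot, so an async pass mirrors its progress into the sync slot until it first isolates something. A toy model of the lockstep:

#include <stdbool.h>

struct zone_cache {
	unsigned long migrate_pfn[2];	/* [0] = async, [1] = sync */
};

/* A block with no isolation candidates is empty in either mode:
 * let the sync scanner inherit the async scanner's progress. */
static void on_empty_block(struct zone_cache *zc, const bool *update_cached)
{
	if (*update_cached)
		zc->migrate_pfn[1] = zc->migrate_pfn[0];
}

/* From the first successful isolation on, the modes may diverge. */
static void on_isolated(bool *update_cached)
{
	*update_cached = false;
}
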
        trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
                                cc->free_pfn, end_pfn, sync);
 
        migrate_prep_local();
 
-       while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+       while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
                int err;
+               unsigned long start_pfn = cc->migrate_pfn;
+
+               /*
+                * Avoid multiple rescans which can happen if a page cannot be
+                * isolated (dirty/writeback in async mode) or if the migrated
+                * pages are being allocated before the pageblock is cleared.
+                * The first rescan will capture the entire pageblock for
+                * migration. If it fails, it'll be marked skip and scanning
+                * will proceed as normal.
+                */
+               cc->rescan = false;
+               if (pageblock_start_pfn(last_migrated_pfn) ==
+                   pageblock_start_pfn(start_pfn)) {
+                       cc->rescan = true;
+               }
 
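Aside, not part of the patch: pageblock_start_pfn() rounds a PFN down to its pageblock boundary, so the test above simply asks whether the migration scanner is starting inside the same pageblock it last migrated from; if so, this pass is flagged as a rescan and the whole block is taken rather than nibbled at again. The arithmetic, assuming the common order-9 (512-page) pageblock of x86-64:

#include <stdbool.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* 1 << pageblock_order, typical on x86-64 */

static unsigned long pageblock_start(unsigned long pfn)
{
	return pfn & ~(PAGEBLOCK_NR_PAGES - 1);	/* round_down(pfn, 512) */
}

/* e.g. last = 1000 and start = 1023 both round down to 512:
 * same pageblock, so this pass is a rescan. */
static bool is_rescan(unsigned long last_migrated_pfn, unsigned long start_pfn)
{
	return pageblock_start(last_migrated_pfn) == pageblock_start(start_pfn);
}
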
-               switch (isolate_migratepages(zone, cc)) {
+               switch (isolate_migratepages(cc->zone, cc)) {
                case ISOLATE_ABORT:
                        ret = COMPACT_CONTENDED;
                        putback_movable_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
+                       last_migrated_pfn = 0;
                        goto out;
                case ISOLATE_NONE:
+                       if (update_cached) {
+                               cc->zone->compact_cached_migrate_pfn[1] =
+                                       cc->zone->compact_cached_migrate_pfn[0];
+                       }
+
                        /*
                         * We haven't isolated and migrated anything, but
                         * there might still be unflushed migrations from
@@ -1608,6 +2166,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
                         */
                        goto check_drain;
                case ISOLATE_SUCCESS:
+                       update_cached = false;
+                       last_migrated_pfn = start_pfn;
                        ;
                }
 
@@ -1639,8 +2199,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
                                cc->migrate_pfn = block_end_pfn(
                                                cc->migrate_pfn - 1, cc->order);
                                /* Draining pcplists is useless in this case */
-                               cc->last_migrated_pfn = 0;
-
+                               last_migrated_pfn = 0;
                        }
                }
 
@@ -1652,21 +2211,26 @@ check_drain:
                 * compact_finished() can detect immediately if allocation
                 * would succeed.
                 */
-               if (cc->order > 0 && cc->last_migrated_pfn) {
+               if (cc->order > 0 && last_migrated_pfn) {
                        int cpu;
                        unsigned long current_block_start =
                                block_start_pfn(cc->migrate_pfn, cc->order);
 
-                       if (cc->last_migrated_pfn < current_block_start) {
+                       if (last_migrated_pfn < current_block_start) {
                                cpu = get_cpu();
                                lru_add_drain_cpu(cpu);
-                               drain_local_pages(zone);
+                               drain_local_pages(cc->zone);
                                put_cpu();
                                /* No more flushing until we migrate again */
-                               cc->last_migrated_pfn = 0;
+                               last_migrated_pfn = 0;
                        }
                }
 
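Aside, not part of the patch: pages freed by migration sit on per-CPU lists where the watermark checks in compact_finished() cannot see them; the drain pushes them into the buddy allocator, but only once the scanner has left the cc->order-aligned block they came from, since draining any earlier would be repeated for the same block. A sketch of the trigger (block_start() mirrors the kernel's block_start_pfn() macro; the caller additionally requires order > 0):

#include <stdbool.h>

static unsigned long block_start(unsigned long pfn, unsigned int order)
{
	return pfn & ~((1UL << order) - 1);	/* round_down(pfn, 1 << order) */
}

/* Drain at most once per order-aligned block, and only after leaving it. */
static bool should_drain(unsigned long last_migrated_pfn,
			 unsigned long migrate_pfn, unsigned int order)
{
	return last_migrated_pfn != 0 &&
	       last_migrated_pfn < block_start(migrate_pfn, order);
}
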
+               /* Stop if a page has been captured */
+               if (capc && capc->page) {
+                       ret = COMPACT_SUCCESS;
+                       break;
+               }
        }
 
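Aside, not part of the patch: the break above is the consumer side of capture. While this task compacts, the page allocator's free path can hand a freshly freed page of a suitable order straight into capc->page, and the main loop stops the moment that happens instead of finishing the pageblock. A single-threaded toy of the handoff; in the kernel the slot is filled concurrently from the free path, which is the entire point:

#include <stddef.h>

struct capture_control {
	void *page;	/* set by the "free" side when a match shows up */
};

static void *compact_until_captured(struct capture_control *capc,
				    int (*one_step)(struct capture_control *))
{
	while (one_step(capc)) {
		if (capc && capc->page)
			return capc->page;	/* stop early: direct handoff */
	}
	return NULL;
}
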
 out:
@@ -1685,8 +2249,8 @@ out:
                 * Only go back, not forward. The cached pfn might already
                 * have been reset to zone end in compact_finished()
                 */
-               if (free_pfn > zone->compact_cached_free_pfn)
-                       zone->compact_cached_free_pfn = free_pfn;
+               if (free_pfn > cc->zone->compact_cached_free_pfn)
+                       cc->zone->compact_cached_free_pfn = free_pfn;
        }
 
        count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
@@ -1700,7 +2264,8 @@ out:
 
 static enum compact_result compact_zone_order(struct zone *zone, int order,
                gfp_t gfp_mask, enum compact_priority prio,
-               unsigned int alloc_flags, int classzone_idx)
+               unsigned int alloc_flags, int classzone_idx,
+               struct page **capture)
 {
        enum compact_result ret;
        struct compact_control cc = {
@@ -1709,6 +2274,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
                .total_migrate_scanned = 0,
                .total_free_scanned = 0,
                .order = order,
+               .search_order = order,
                .gfp_mask = gfp_mask,
                .zone = zone,
                .mode = (prio == COMPACT_PRIO_ASYNC) ?
@@ -1720,14 +2286,24 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
                .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
                .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
        };
+       struct capture_control capc = {
+               .cc = &cc,
+               .page = NULL,
+       };
+
+       if (capture)
+               current->capture_control = &capc;
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);
 
-       ret = compact_zone(zone, &cc);
+       ret = compact_zone(&cc, &capc);
 
        VM_BUG_ON(!list_empty(&cc.freepages));
        VM_BUG_ON(!list_empty(&cc.migratepages));
 
+       *capture = capc.page;
+       current->capture_control = NULL;
+
        return ret;
 }
 
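Aside, not part of the patch: compact_zone_order() publishes the on-stack capture slot through current->capture_control only when the caller asked for capture, runs the zone, then copies the result out and always clears the pointer so no reference to dead stack memory survives. Note that *capture is stored unconditionally; the kernel's callers always pass a non-NULL pointer here. The set/run/clear shape as a stand-alone sketch (task_slot stands in for current->capture_control):

#include <stddef.h>

struct capture_control { void *page; };

static struct capture_control *task_slot;	/* ~ current->capture_control */

/* Stub standing in for compact_zone(); the real one races the free path. */
static int run_compaction(struct capture_control *capc)
{
	(void)capc;
	return 0;
}

static int compact_with_capture(int want_capture, void **capture)
{
	struct capture_control capc = { .page = NULL };
	int ret;

	if (want_capture)
		task_slot = &capc;	/* publish for the free path to fill */

	ret = run_compaction(&capc);

	*capture = capc.page;	/* stored unconditionally, as in the patch */
	task_slot = NULL;	/* never leave a dangling on-stack pointer */
	return ret;
}
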
@@ -1745,7 +2321,7 @@ int sysctl_extfrag_threshold = 500;
  */
 enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
                unsigned int alloc_flags, const struct alloc_context *ac,
-               enum compact_priority prio)
+               enum compact_priority prio, struct page **capture)
 {
        int may_perform_io = gfp_mask & __GFP_IO;
        struct zoneref *z;
@@ -1773,7 +2349,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
                }
 
                status = compact_zone_order(zone, order, gfp_mask, prio,
-                                       alloc_flags, ac_classzone_idx(ac));
+                               alloc_flags, ac_classzone_idx(ac), capture);
                rc = max(status, rc);
 
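Aside, not part of the patch: try_to_compact_pages() walks the zonelist and keeps the most promising status seen so far, which works because enum compact_result is ordered with better outcomes given larger values. A reduced sketch of the accumulation (the real enum has more states; the relative order of these four matches the kernel's):

/* Abridged: larger value == closer to a usable allocation. */
enum result { SKIPPED, DEFERRED, COMPLETE, SUCCESS };

static enum result best_of(const enum result *status, int nzones)
{
	enum result rc = SKIPPED;

	for (int i = 0; i < nzones; i++)
		rc = status[i] > rc ? status[i] : rc;	/* rc = max(status, rc) */
	return rc;
}
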
                /* The allocation should succeed, stop compacting */
@@ -1841,7 +2417,7 @@ static void compact_node(int nid)
                INIT_LIST_HEAD(&cc.freepages);
                INIT_LIST_HEAD(&cc.migratepages);
 
-               compact_zone(zone, &cc);
+               compact_zone(&cc, NULL);
 
                VM_BUG_ON(!list_empty(&cc.freepages));
                VM_BUG_ON(!list_empty(&cc.migratepages));
@@ -1876,14 +2452,6 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
        return 0;
 }
 
-int sysctl_extfrag_handler(struct ctl_table *table, int write,
-                       void __user *buffer, size_t *length, loff_t *ppos)
-{
-       proc_dointvec_minmax(table, write, buffer, length, ppos);
-
-       return 0;
-}
-
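Aside, not part of the patch: the deleted handler was a pure pass-through that also swallowed the status of proc_dointvec_minmax(), so userspace never saw a validation error; presumably the matching sysctl table entry now names proc_dointvec_minmax directly, which removes the boilerplate and propagates failures. The shape of the problem in miniature:

/* The removed wrapper's shape: call the helper, drop its verdict. */
static int wrapped(int (*helper)(void))
{
	helper();	/* an error here is silently discarded */
	return 0;
}

/* Pointing at the helper directly keeps the error. */
static int direct(int (*helper)(void))
{
	return helper();
}
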
 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
 static ssize_t sysfs_compact_node(struct device *dev,
                        struct device_attribute *attr,
@@ -1948,6 +2516,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
        struct zone *zone;
        struct compact_control cc = {
                .order = pgdat->kcompactd_max_order,
+               .search_order = pgdat->kcompactd_max_order,
                .total_migrate_scanned = 0,
                .total_free_scanned = 0,
                .classzone_idx = pgdat->kcompactd_classzone_idx,
@@ -1983,7 +2552,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 
                if (kthread_should_stop())
                        return;
-               status = compact_zone(zone, &cc);
+               status = compact_zone(&cc, NULL);
 
                if (status == COMPACT_SUCCESS) {
                        compaction_defer_reset(zone, cc.order, false);