vmscan: protect reading of reclaim_stat with lru_lock
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da435f34f567e8a23358ec18ccd452ad..a3d669f8e25ec2a96b7ea9925c2618971e2208df 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
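
CREATE_TRACE_POINTS may be defined in exactly one compilation unit before the trace header is included: it turns the TRACE_EVENT() declarations in the new include/trace/events/vmscan.h into real tracepoint definitions, while every other includer gets only the prototypes. As a sketch of what that header declares (hedged; the series' actual definitions may differ in detail), the kswapd sleep event used later in this patch looks roughly like:

TRACE_EVENT(mm_vmscan_kswapd_sleep,

	TP_PROTO(int nid),

	TP_ARGS(nid),

	TP_STRUCT__entry(
		__field(int, nid)
	),

	TP_fast_assign(
		__entry->nid = nid;
	),

	/* rendered as "nid=0" etc. in the trace buffer */
	TP_printk("nid=%d", __entry->nid)
);
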
 struct scan_control {
        /* Incremented by the number of inactive pages that were scanned */
        unsigned long nr_scanned;
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                        /* synchronous write or broken a_ops? */
                        ClearPageReclaim(page);
                }
+               trace_mm_vmscan_writepage(page,
+                       trace_reclaim_flags(page, sync_writeback));
                inc_zone_page_state(page, NR_VMSCAN_WRITE);
                return PAGE_SUCCESS;
        }
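
trace_reclaim_flags() is a helper macro from the same trace header; it packs the page's cache type and the writeback mode into a single flags word for the event. A sketch of the idea, assuming the RECLAIM_WB_* flag names from the header added by this series:

/* Hedged sketch of the helper used above. */
#define trace_reclaim_flags(page, sync) ( \
	(page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
	(sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC)   \
	)
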
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
        return PAGEREF_RECLAIM;
 }
 
+static noinline_for_stack void free_page_list(struct list_head *free_pages)
+{
+       struct pagevec freed_pvec;
+       struct page *page, *tmp;
+
+       pagevec_init(&freed_pvec, 1);
+
+       list_for_each_entry_safe(page, tmp, free_pages, lru) {
+               list_del(&page->lru);
+               if (!pagevec_add(&freed_pvec, page)) {
+                       __pagevec_free(&freed_pvec);
+                       pagevec_reinit(&freed_pvec);
+               }
+       }
+
+       pagevec_free(&freed_pvec);
+}
+
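
free_page_list() is the standard pagevec batching idiom: pagevec_add() returns the number of slots still free after the insertion, so a zero return means the vector just filled (PAGEVEC_SIZE is 14 pages) and must be drained. A minimal sketch of the pattern; next_page() is a hypothetical stand-in for any page source:

/* The pagevec batching idiom used by free_page_list() above. */
struct pagevec pvec;
struct page *page;

pagevec_init(&pvec, 1);			/* 1 = treat the pages as cold */
while ((page = next_page()) != NULL) {	/* next_page() is hypothetical */
	if (!pagevec_add(&pvec, page)) {	/* no slots left: batch is full */
		__pagevec_free(&pvec);		/* free the whole batch at once */
		pagevec_reinit(&pvec);
	}
}
pagevec_free(&pvec);		/* flush the remainder; a no-op when empty */

Batching the frees amortises the page allocator's locking cost, which is why shrink_page_list() below now collects pages on a local list and frees them in one pass instead of freeing as it scans.
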
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                        enum pageout_io sync_writeback)
 {
        LIST_HEAD(ret_pages);
-       struct pagevec freed_pvec;
+       LIST_HEAD(free_pages);
        int pgactivate = 0;
        unsigned long nr_reclaimed = 0;
 
        cond_resched();
 
-       pagevec_init(&freed_pvec, 1);
        while (!list_empty(page_list)) {
                enum page_references references;
                struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                __clear_page_locked(page);
 free_it:
                nr_reclaimed++;
-               if (!pagevec_add(&freed_pvec, page)) {
-                       __pagevec_free(&freed_pvec);
-                       pagevec_reinit(&freed_pvec);
-               }
+
+               /*
+                * Is there need to periodically free_page_list? It would
+                * appear not as the counts should be low
+                */
+               list_add(&page->lru, &free_pages);
                continue;
 
 cull_mlocked:
@@ -832,9 +856,10 @@ keep:
                list_add(&page->lru, &ret_pages);
                VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
        }
+
+       free_page_list(&free_pages);
+
        list_splice(&ret_pages, page_list);
-       if (pagevec_count(&freed_pvec))
-               __pagevec_free(&freed_pvec);
        count_vm_events(PGACTIVATE, pgactivate);
        return nr_reclaimed;
 }
@@ -916,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                unsigned long *scanned, int order, int mode, int file)
 {
        unsigned long nr_taken = 0;
+       unsigned long nr_lumpy_taken = 0;
+       unsigned long nr_lumpy_dirty = 0;
+       unsigned long nr_lumpy_failed = 0;
        unsigned long scan;
 
        for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -993,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                                list_move(&cursor_page->lru, dst);
                                mem_cgroup_del_lru(cursor_page);
                                nr_taken++;
+                               nr_lumpy_taken++;
+                               if (PageDirty(cursor_page))
+                                       nr_lumpy_dirty++;
                                scan++;
+                       } else {
+                               if (mode == ISOLATE_BOTH &&
+                                               page_count(cursor_page))
+                                       nr_lumpy_failed++;
                        }
                }
        }
 
        *scanned = scan;
+
+       trace_mm_vmscan_lru_isolate(order,
+                       nr_to_scan, scan,
+                       nr_taken,
+                       nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
+                       mode);
        return nr_taken;
 }
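
The three nr_lumpy_* counters let the tracepoint separate pages taken by the ordinary LRU scan from those isolated around a lumpy-reclaim cursor page, and record how many lumpy candidates could not be isolated. Hedged from the event definition this series adds, a trace line comes out roughly as:

/* Approximate mm_vmscan_lru_isolate output (enable it under
 * /sys/kernel/debug/tracing/events/vmscan/):
 *
 *   isolate_mode=1 order=3 nr_requested=32 nr_scanned=35 nr_taken=35
 *   contig_taken=8 contig_dirty=2 contig_failed=1
 */
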
 
@@ -1035,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
                        ClearPageActive(page);
                        nr_active++;
                }
-               count[lru]++;
+               if (count)
+                       count[lru]++;
        }
 
        return nr_active;
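
The new NULL check lets a caller that only needs the PGDEACTIVATE count skip the per-LRU accounting. Both call sites appear later in this diff:

/* update_isolated_counts() wants the per-LRU breakdown:    */
nr_active = clear_active_flags(isolated_list, count);

/* the sync-writeback retry path only needs nr_active:      */
nr_active = clear_active_flags(&page_list, NULL);
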
@@ -1112,174 +1154,177 @@ static int too_many_isolated(struct zone *zone, int file,
 }
 
 /*
- * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
- * of reclaimed pages
+ * TODO: Try merging with migrations version of putback_lru_pages
  */
-static unsigned long shrink_inactive_list(unsigned long max_scan,
-                       struct zone *zone, struct scan_control *sc,
-                       int priority, int file)
+static noinline_for_stack void
+putback_lru_pages(struct zone *zone, struct scan_control *sc,
+                               unsigned long nr_anon, unsigned long nr_file,
+                               struct list_head *page_list)
 {
-       LIST_HEAD(page_list);
+       struct page *page;
        struct pagevec pvec;
-       unsigned long nr_scanned = 0;
-       unsigned long nr_reclaimed = 0;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-       while (unlikely(too_many_isolated(zone, file, sc))) {
-               congestion_wait(BLK_RW_ASYNC, HZ/10);
+       pagevec_init(&pvec, 1);
 
-               /* We are about to die and free our memory. Return now. */
-               if (fatal_signal_pending(current))
-                       return SWAP_CLUSTER_MAX;
+       /*
+        * Put back any unfreeable pages.
+        */
+       spin_lock(&zone->lru_lock);
+       while (!list_empty(page_list)) {
+               int lru;
+               page = lru_to_page(page_list);
+               VM_BUG_ON(PageLRU(page));
+               list_del(&page->lru);
+               if (unlikely(!page_evictable(page, NULL))) {
+                       spin_unlock_irq(&zone->lru_lock);
+                       putback_lru_page(page);
+                       spin_lock_irq(&zone->lru_lock);
+                       continue;
+               }
+               SetPageLRU(page);
+               lru = page_lru(page);
+               add_page_to_lru_list(zone, page, lru);
+               if (is_active_lru(lru)) {
+                       int file = is_file_lru(lru);
+                       reclaim_stat->recent_rotated[file]++;
+               }
+               if (!pagevec_add(&pvec, page)) {
+                       spin_unlock_irq(&zone->lru_lock);
+                       __pagevec_release(&pvec);
+                       spin_lock_irq(&zone->lru_lock);
+               }
        }
+       __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+       __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
 
+       spin_unlock_irq(&zone->lru_lock);
+       pagevec_release(&pvec);
+}
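
Note the deliberate asymmetry in the locking above: putback_lru_pages() is entered with interrupts already disabled (its caller does local_irq_disable() before bumping the vm_event counters), so it takes lru_lock with plain spin_lock() and drops it with spin_unlock_irq() to re-enable interrupts on the way out. The pairing, as it reads in the rewritten shrink_inactive_list() below:

	local_irq_disable();		/* irqs off for __count_*() */
	if (current_is_kswapd())
		__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
	__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);

	putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
		/* -> spin_lock(&zone->lru_lock);        irqs already off */
		/* -> spin_unlock_irq(&zone->lru_lock);  irqs back on     */
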
 
-       pagevec_init(&pvec, 1);
+static noinline_for_stack void update_isolated_counts(struct zone *zone,
+                                       struct scan_control *sc,
+                                       unsigned long *nr_anon,
+                                       unsigned long *nr_file,
+                                       struct list_head *isolated_list)
+{
+       unsigned long nr_active;
+       unsigned int count[NR_LRU_LISTS] = { 0, };
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-       lru_add_drain();
-       spin_lock_irq(&zone->lru_lock);
-       do {
-               struct page *page;
-               unsigned long nr_taken;
-               unsigned long nr_scan;
-               unsigned long nr_freed;
-               unsigned long nr_active;
-               unsigned int count[NR_LRU_LISTS] = { 0, };
-               int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
-               unsigned long nr_anon;
-               unsigned long nr_file;
+       nr_active = clear_active_flags(isolated_list, count);
+       __count_vm_events(PGDEACTIVATE, nr_active);
 
-               if (scanning_global_lru(sc)) {
-                       nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
-                                                       &page_list, &nr_scan,
-                                                       sc->order, mode,
-                                                       zone, 0, file);
-                       zone->pages_scanned += nr_scan;
-                       if (current_is_kswapd())
-                               __count_zone_vm_events(PGSCAN_KSWAPD, zone,
-                                                      nr_scan);
-                       else
-                               __count_zone_vm_events(PGSCAN_DIRECT, zone,
-                                                      nr_scan);
-               } else {
-                       nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
-                                                       &page_list, &nr_scan,
-                                                       sc->order, mode,
-                                                       zone, sc->mem_cgroup,
-                                                       0, file);
-                       /*
-                        * mem_cgroup_isolate_pages() keeps track of
-                        * scanned pages on its own.
-                        */
-               }
+       __mod_zone_page_state(zone, NR_ACTIVE_FILE,
+                             -count[LRU_ACTIVE_FILE]);
+       __mod_zone_page_state(zone, NR_INACTIVE_FILE,
+                             -count[LRU_INACTIVE_FILE]);
+       __mod_zone_page_state(zone, NR_ACTIVE_ANON,
+                             -count[LRU_ACTIVE_ANON]);
+       __mod_zone_page_state(zone, NR_INACTIVE_ANON,
+                             -count[LRU_INACTIVE_ANON]);
 
-               if (nr_taken == 0)
-                       goto done;
+       *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+       *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+       __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+       __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
 
-               nr_active = clear_active_flags(&page_list, count);
-               __count_vm_events(PGDEACTIVATE, nr_active);
+       reclaim_stat->recent_scanned[0] += *nr_anon;
+       reclaim_stat->recent_scanned[1] += *nr_file;
+}
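
For reference, the counters update_isolated_counts() feeds are the floating averages that get_scan_count() consumes below; the structure (quoted approximately from include/linux/mmzone.h of this era) is:

struct zone_reclaim_stat {
	/*
	 * Tracks how many of the scanned anon/swap-backed and
	 * file-backed pages were recently referenced.  The higher
	 * the rotated/scanned ratio, the more valuable that cache is.
	 *
	 * anon LRU stats live in [0], file LRU stats in [1]
	 */
	unsigned long	recent_rotated[2];
	unsigned long	recent_scanned[2];
};
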
 
-               __mod_zone_page_state(zone, NR_ACTIVE_FILE,
-                                               -count[LRU_ACTIVE_FILE]);
-               __mod_zone_page_state(zone, NR_INACTIVE_FILE,
-                                               -count[LRU_INACTIVE_FILE]);
-               __mod_zone_page_state(zone, NR_ACTIVE_ANON,
-                                               -count[LRU_ACTIVE_ANON]);
-               __mod_zone_page_state(zone, NR_INACTIVE_ANON,
-                                               -count[LRU_INACTIVE_ANON]);
+/*
+ * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * of reclaimed pages
+ */
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
+                       struct scan_control *sc, int priority, int file)
+{
+       LIST_HEAD(page_list);
+       unsigned long nr_scanned;
+       unsigned long nr_reclaimed = 0;
+       unsigned long nr_taken;
+       unsigned long nr_active;
+       unsigned long nr_anon;
+       unsigned long nr_file;
 
-               nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-               nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-               __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
-               __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
+       while (unlikely(too_many_isolated(zone, file, sc))) {
+               congestion_wait(BLK_RW_ASYNC, HZ/10);
 
-               reclaim_stat->recent_scanned[0] += nr_anon;
-               reclaim_stat->recent_scanned[1] += nr_file;
+               /* We are about to die and free our memory. Return now. */
+               if (fatal_signal_pending(current))
+                       return SWAP_CLUSTER_MAX;
+       }
 
-               spin_unlock_irq(&zone->lru_lock);
 
-               nr_scanned += nr_scan;
-               nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+       lru_add_drain();
+       spin_lock_irq(&zone->lru_lock);
 
+       if (scanning_global_lru(sc)) {
+               nr_taken = isolate_pages_global(nr_to_scan,
+                       &page_list, &nr_scanned, sc->order,
+                       sc->lumpy_reclaim_mode ?
+                               ISOLATE_BOTH : ISOLATE_INACTIVE,
+                       zone, 0, file);
+               zone->pages_scanned += nr_scanned;
+               if (current_is_kswapd())
+                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
+                                              nr_scanned);
+               else
+                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
+                                              nr_scanned);
+       } else {
+               nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
+                       &page_list, &nr_scanned, sc->order,
+                       sc->lumpy_reclaim_mode ?
+                               ISOLATE_BOTH : ISOLATE_INACTIVE,
+                       zone, sc->mem_cgroup,
+                       0, file);
                /*
-                * If we are direct reclaiming for contiguous pages and we do
-                * not reclaim everything in the list, try again and wait
-                * for IO to complete. This will stall high-order allocations
-                * but that should be acceptable to the caller
+                * mem_cgroup_isolate_pages() keeps track of
+                * scanned pages on its own.
                 */
-               if (nr_freed < nr_taken && !current_is_kswapd() &&
-                   sc->lumpy_reclaim_mode) {
-                       congestion_wait(BLK_RW_ASYNC, HZ/10);
+       }
 
-                       /*
-                        * The attempt at page out may have made some
-                        * of the pages active, mark them inactive again.
-                        */
-                       nr_active = clear_active_flags(&page_list, count);
-                       count_vm_events(PGDEACTIVATE, nr_active);
+       if (nr_taken == 0) {
+               spin_unlock_irq(&zone->lru_lock);
+               return 0;
+       }
 
-                       nr_freed += shrink_page_list(&page_list, sc,
-                                                       PAGEOUT_IO_SYNC);
-               }
+       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
+
+       spin_unlock_irq(&zone->lru_lock);
 
-               nr_reclaimed += nr_freed;
+       nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
 
-               local_irq_disable();
-               if (current_is_kswapd())
-                       __count_vm_events(KSWAPD_STEAL, nr_freed);
-               __count_zone_vm_events(PGSTEAL, zone, nr_freed);
+       /*
+        * If we are direct reclaiming for contiguous pages and we do
+        * not reclaim everything in the list, try again and wait
+        * for IO to complete. This will stall high-order allocations
+        * but that should be acceptable to the caller
+        */
+       if (nr_reclaimed < nr_taken && !current_is_kswapd() &&
+                       sc->lumpy_reclaim_mode) {
+               congestion_wait(BLK_RW_ASYNC, HZ/10);
 
-               spin_lock(&zone->lru_lock);
                /*
-                * Put back any unfreeable pages.
+                * The attempt at page out may have made some
+                * of the pages active, mark them inactive again.
                 */
-               while (!list_empty(&page_list)) {
-                       int lru;
-                       page = lru_to_page(&page_list);
-                       VM_BUG_ON(PageLRU(page));
-                       list_del(&page->lru);
-                       if (unlikely(!page_evictable(page, NULL))) {
-                               spin_unlock_irq(&zone->lru_lock);
-                               putback_lru_page(page);
-                               spin_lock_irq(&zone->lru_lock);
-                               continue;
-                       }
-                       SetPageLRU(page);
-                       lru = page_lru(page);
-                       add_page_to_lru_list(zone, page, lru);
-                       if (is_active_lru(lru)) {
-                               int file = is_file_lru(lru);
-                               reclaim_stat->recent_rotated[file]++;
-                       }
-                       if (!pagevec_add(&pvec, page)) {
-                               spin_unlock_irq(&zone->lru_lock);
-                               __pagevec_release(&pvec);
-                               spin_lock_irq(&zone->lru_lock);
-                       }
-               }
-               __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
-               __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+               nr_active = clear_active_flags(&page_list, NULL);
+               count_vm_events(PGDEACTIVATE, nr_active);
 
-       } while (nr_scanned < max_scan);
+               nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
+       }
 
-done:
-       spin_unlock_irq(&zone->lru_lock);
-       pagevec_release(&pvec);
-       return nr_reclaimed;
-}
+       local_irq_disable();
+       if (current_is_kswapd())
+               __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
+       __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
 
-/*
- * We are about to scan this zone at a certain priority level.  If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone.  This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
-       if (priority < zone->prev_priority)
-               zone->prev_priority = priority;
+       putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+       return nr_reclaimed;
 }
 
 /*
@@ -1582,6 +1627,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                }
        }
 
+       /*
+        * With swappiness at 100, anonymous and file have the same priority.
+        * This scanning priority is essentially the inverse of IO cost.
+        */
+       anon_prio = sc->swappiness;
+       file_prio = 200 - sc->swappiness;
+
        /*
         * OK, so we have swap space and a fair amount of page cache
         * pages.  We use the recently rotated / recently scanned
@@ -1593,27 +1645,17 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         *
         * anon in [0], file in [1]
         */
+       spin_lock_irq(&zone->lru_lock);
        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
-               spin_lock_irq(&zone->lru_lock);
                reclaim_stat->recent_scanned[0] /= 2;
                reclaim_stat->recent_rotated[0] /= 2;
-               spin_unlock_irq(&zone->lru_lock);
        }
 
        if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
-               spin_lock_irq(&zone->lru_lock);
                reclaim_stat->recent_scanned[1] /= 2;
                reclaim_stat->recent_rotated[1] /= 2;
-               spin_unlock_irq(&zone->lru_lock);
        }
 
-       /*
-        * With swappiness at 100, anonymous and file have the same priority.
-        * This scanning priority is essentially the inverse of IO cost.
-        */
-       anon_prio = sc->swappiness;
-       file_prio = 200 - sc->swappiness;
-
        /*
         * The amount of pressure on anon vs file pages is inversely
         * proportional to the fraction of recently scanned pages on
@@ -1624,6 +1666,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 
        fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
        fp /= reclaim_stat->recent_rotated[1] + 1;
+       spin_unlock_irq(&zone->lru_lock);
 
        fraction[0] = ap;
        fraction[1] = fp;
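
This hunk is the change the commit title refers to: the reads of recent_scanned/recent_rotated and the ap/fp arithmetic now sit inside one lru_lock critical section, so a concurrent updater (or the halving above) can no longer slip in between the numerator and denominator reads. A worked example of the weighting, with assumed counter values:

/* Assume swappiness = 60, so anon_prio = 60 and file_prio = 140.
 *
 *   anon: recent_scanned = 1000, recent_rotated = 500
 *         ap = (60 + 1) * (1000 + 1) / (500 + 1)  ~= 121
 *
 *   file: recent_scanned = 2000, recent_rotated = 100
 *         fp = (140 + 1) * (2000 + 1) / (100 + 1) ~= 2793
 *
 * fraction[] then directs about 2793/121 ~= 23x more scanning
 * pressure at the file LRUs, because far fewer of their scanned
 * pages were re-referenced (rotated) recently.
 */
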
@@ -1729,13 +1772,12 @@ static void shrink_zone(int priority, struct zone *zone,
 static bool shrink_zones(int priority, struct zonelist *zonelist,
                                        struct scan_control *sc)
 {
-       enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
        struct zoneref *z;
        struct zone *zone;
        bool all_unreclaimable = true;
 
-       for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
-                                       sc->nodemask) {
+       for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                                       gfp_zone(sc->gfp_mask), sc->nodemask) {
                if (!populated_zone(zone))
                        continue;
                /*
@@ -1745,17 +1787,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
                if (scanning_global_lru(sc)) {
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                continue;
-                       note_zone_scanning_priority(zone, priority);
-
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;       /* Let kswapd poll it */
-               } else {
-                       /*
-                        * Ignore cpuset limitation here. We just want to reduce
-                        * # of used pages by us regardless of memory shortage.
-                        */
-                       mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
-                                                       priority);
                }
 
                shrink_zone(priority, zone, sc);
@@ -1787,10 +1820,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
        bool all_unreclaimable;
        unsigned long total_scanned = 0;
        struct reclaim_state *reclaim_state = current->reclaim_state;
-       unsigned long lru_pages = 0;
        struct zoneref *z;
        struct zone *zone;
-       enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
        unsigned long writeback_threshold;
 
        get_mems_allowed();
@@ -1798,18 +1829,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
        if (scanning_global_lru(sc))
                count_vm_event(ALLOCSTALL);
-       /*
-        * mem_cgroup will not do shrink_slab.
-        */
-       if (scanning_global_lru(sc)) {
-               for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
-                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                               continue;
-
-                       lru_pages += zone_reclaimable_pages(zone);
-               }
-       }
 
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                sc->nr_scanned = 0;
@@ -1821,6 +1840,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                 * over limit cgroups
                 */
                if (scanning_global_lru(sc)) {
+                       unsigned long lru_pages = 0;
+                       for_each_zone_zonelist(zone, z, zonelist,
+                                       gfp_zone(sc->gfp_mask)) {
+                               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                                       continue;
+
+                               lru_pages += zone_reclaimable_pages(zone);
+                       }
+
                        shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
                        if (reclaim_state) {
                                sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1861,17 +1889,6 @@ out:
        if (priority < 0)
                priority = 0;
 
-       if (scanning_global_lru(sc)) {
-               for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
-                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                               continue;
-
-                       zone->prev_priority = priority;
-               }
-       } else
-               mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
-
        delayacct_freepages_end();
        put_mems_allowed();
 
@@ -1888,6 +1905,7 @@ out:
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                gfp_t gfp_mask, nodemask_t *nodemask)
 {
+       unsigned long nr_reclaimed;
        struct scan_control sc = {
                .gfp_mask = gfp_mask,
                .may_writepage = !laptop_mode,
@@ -1900,7 +1918,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                .nodemask = nodemask,
        };
 
-       return do_try_to_free_pages(zonelist, &sc);
+       trace_mm_vmscan_direct_reclaim_begin(order,
+                               sc.may_writepage,
+                               gfp_mask);
+
+       nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+       trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+
+       return nr_reclaimed;
 }
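
Wrapping do_try_to_free_pages() in a begin/end pair makes the latency of every direct-reclaim stall visible from the trace buffer once the two events are enabled under /sys/kernel/debug/tracing/events/vmscan/. Approximate output, hedged from the definitions this series adds:

/* mm_vmscan_direct_reclaim_begin: order=0 may_writepage=1 gfp_flags=GFP_KERNEL
 * mm_vmscan_direct_reclaim_end:   nr_reclaimed=32
 */
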
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -2028,22 +2054,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
                .order = order,
                .mem_cgroup = NULL,
        };
-       /*
-        * temp_priority is used to remember the scanning priority at which
-        * this zone was successfully refilled to
-        * free_pages == high_wmark_pages(zone).
-        */
-       int temp_priority[MAX_NR_ZONES];
-
 loop_again:
        total_scanned = 0;
        sc.nr_reclaimed = 0;
        sc.may_writepage = !laptop_mode;
        count_vm_event(PAGEOUTRUN);
 
-       for (i = 0; i < pgdat->nr_zones; i++)
-               temp_priority[i] = DEF_PRIORITY;
-
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
                unsigned long lru_pages = 0;
@@ -2111,9 +2127,7 @@ loop_again:
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;
 
-                       temp_priority[i] = priority;
                        sc.nr_scanned = 0;
-                       note_zone_scanning_priority(zone, priority);
 
                        nid = pgdat->node_id;
                        zid = zone_idx(zone);
@@ -2186,16 +2200,6 @@ loop_again:
                        break;
        }
 out:
-       /*
-        * Note within each zone the priority level at which this zone was
-        * brought into a happy state.  So that the next thread which scans this
-        * zone will start out at that priority level.
-        */
-       for (i = 0; i < pgdat->nr_zones; i++) {
-               struct zone *zone = pgdat->node_zones + i;
-
-               zone->prev_priority = temp_priority[i];
-       }
        if (!all_zones_ok) {
                cond_resched();
 
@@ -2299,9 +2303,10 @@ static int kswapd(void *p)
                                 * premature sleep. If not, then go fully
                                 * to sleep until explicitly woken up
                                 */
-                               if (!sleeping_prematurely(pgdat, order, remaining))
+                               if (!sleeping_prematurely(pgdat, order, remaining)) {
+                                       trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
                                        schedule();
-                               else {
+                               } else {
                                        if (remaining)
                                                count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
                                        else
@@ -2321,8 +2326,10 @@ static int kswapd(void *p)
                 * We can speed up thawing tasks if we don't call balance_pgdat
                 * after returning from the refrigerator
                 */
-               if (!ret)
+               if (!ret) {
+                       trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
                        balance_pgdat(pgdat, order);
+               }
        }
        return 0;
 }
@@ -2342,6 +2349,7 @@ void wakeup_kswapd(struct zone *zone, int order)
                return;
        if (pgdat->kswapd_max_order < order)
                pgdat->kswapd_max_order = order;
+       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                return;
        if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2590,9 +2598,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                .swappiness = vm_swappiness,
                .order = order,
        };
-       unsigned long slab_reclaimable;
+       unsigned long nr_slab_pages0, nr_slab_pages1;
 
-       disable_swap_token();
        cond_resched();
        /*
         * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2611,14 +2618,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                 */
                priority = ZONE_RECLAIM_PRIORITY;
                do {
-                       note_zone_scanning_priority(zone, priority);
                        shrink_zone(priority, zone, &sc);
                        priority--;
                } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
        }
 
-       slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
-       if (slab_reclaimable > zone->min_slab_pages) {
+       nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+       if (nr_slab_pages0 > zone->min_slab_pages) {
                /*
                 * shrink_slab() does not currently allow us to determine how
                 * many pages were freed in this zone. So we take the current
@@ -2630,16 +2636,17 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                 * take a long time.
                 */
                while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
-                       zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
-                               slab_reclaimable - nr_pages)
+                      (zone_page_state(zone, NR_SLAB_RECLAIMABLE) + nr_pages >
+                               nr_slab_pages0))
                        ;
 
                /*
                 * Update nr_reclaimed by the number of slab pages we
                 * reclaimed from this zone.
                 */
-               sc.nr_reclaimed += slab_reclaimable -
-                       zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+               nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+               if (nr_slab_pages1 < nr_slab_pages0)
+                       sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
        }
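
The rearranged loop condition avoids an unsigned underflow that could cut slab shrinking short, and the final accounting now only credits a delta when the slab count actually dropped. A worked example with assumed numbers:

/* Assume nr_slab_pages0 = 100 and nr_pages = 128 (unsigned long math).
 *
 *   old:  NR_SLAB_RECLAIMABLE > slab_reclaimable - nr_pages
 *         100 > (100 - 128)   -- underflows to ~ULONG_MAX
 *         -> false: the loop stops after a single shrink_slab() call
 *
 *   new:  NR_SLAB_RECLAIMABLE + nr_pages > nr_slab_pages0
 *         100 + 128 > 100
 *         -> true: keep shaking the slab while progress is made
 *
 * Similarly, if the slab grew while we were reclaiming, the old
 * "slab_reclaimable - zone_page_state(...)" would underflow and add a
 * huge bogus value to nr_reclaimed; the new code adds nothing instead.
 */
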
 
        p->reclaim_state = NULL;