diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5d52d6a24aff1c7fccd292bb8a2455e1eec1d52..ee4eecc7e1c2177041d00d8eb8d62ca64647206a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -351,12 +351,13 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
  */
 unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
 {
-       unsigned long lru_size;
+       unsigned long lru_size = 0;
        int zid;
 
-       if (!mem_cgroup_disabled())
-               lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
-       else
+       if (!mem_cgroup_disabled()) {
+               for (zid = 0; zid < MAX_NR_ZONES; zid++)
+                       lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+       } else
                lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
 
        for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
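Together with the subtraction loop that follows (its body lies outside this hunk), the net effect is that only zones 0..zone_idx contribute to the returned size. Below is a minimal userspace sketch of that arithmetic; zone_lru_size[] and lru_size_upto() are hypothetical stand-ins for mem_cgroup_get_zone_lru_size() and lruvec_lru_size(), and the kernel additionally clamps each subtraction with min() to guard against races:

#include <stdio.h>

#define MAX_NR_ZONES 4  /* e.g. ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE */

/* Hypothetical per-zone LRU page counts for one lruvec. */
static unsigned long zone_lru_size[MAX_NR_ZONES] = { 100, 2000, 50000, 0 };

/* Models the reworked lruvec_lru_size(): sum every zone, then strip
 * the zones above zone_idx again. */
static unsigned long lru_size_upto(int zone_idx)
{
        unsigned long lru_size = 0;
        int zid;

        for (zid = 0; zid < MAX_NR_ZONES; zid++)
                lru_size += zone_lru_size[zid];

        for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++)
                lru_size -= zone_lru_size[zid];

        return lru_size;
}

int main(void)
{
        /* reclaim_idx == 1: only the two lowest zones remain -> prints 2100. */
        printf("%lu\n", lru_size_upto(1));
        return 0;
}
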
@@ -932,10 +933,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
         * Note that if SetPageDirty is always performed via set_page_dirty,
         * and thus under the i_pages lock, then this ordering is not required.
         */
-       if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
-               refcount = 1 + HPAGE_PMD_NR;
-       else
-               refcount = 2;
+       refcount = 1 + compound_nr(page);
        if (!page_ref_freeze(page, refcount))
                goto cannot_free;
        /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
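compound_nr() returns the number of base pages in the (possibly compound) page, 1 for an order-0 page, so the single expression reproduces both of the old branches while also covering compound pages the old code did not. A toy model of the resulting refcounts, assuming HPAGE_PMD_NR of 512 (4 KiB base pages with 2 MiB huge pages); compound_nr_model() is a hypothetical stand-in:

#include <stdio.h>

#define HPAGE_PMD_NR 512        /* 2 MiB / 4 KiB on x86-64; illustrative */

/* Models compound_nr(): base pages in the page, 1 if not compound. */
static unsigned long compound_nr_model(int is_thp)
{
        return is_thp ? HPAGE_PMD_NR : 1;
}

int main(void)
{
        /* Old "else" branch: 2 references held on an order-0 page. */
        printf("base page: %lu\n", 1 + compound_nr_model(0));   /* 2 */
        /* Old THP-in-swapcache branch: 1 + HPAGE_PMD_NR. */
        printf("THP:       %lu\n", 1 + compound_nr_model(1));   /* 513 */
        return 0;
}
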
@@ -2459,17 +2457,70 @@ out:
        *lru_pages = 0;
        for_each_evictable_lru(lru) {
                int file = is_file_lru(lru);
-               unsigned long size;
+               unsigned long lruvec_size;
                unsigned long scan;
+               unsigned long protection;
+
+               lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+               protection = mem_cgroup_protection(memcg,
+                                                  sc->memcg_low_reclaim);
+
+               if (protection) {
+                       /*
+                        * Scale a cgroup's reclaim pressure by proportioning
+                        * its current usage to its memory.low or memory.min
+                        * setting.
+                        *
+                        * This is important, as otherwise scanning aggression
+                        * becomes extremely binary -- from nothing as we
+                        * approach the memory protection threshold, to totally
+                        * nominal as we exceed it. This would require setting
+                        * extremely liberal protection thresholds. It
+                        * also means we simply get no protection at all if we
+                        * set it too low, which is not ideal.
+                        *
+                        * If there is any protection in place, we reduce scan
+                        * pressure by how much of the total memory used is
+                        * within protection thresholds.
+                        *
+                        * There is one special case: in the first reclaim pass,
+                        * we skip over all groups that are within their low
+                        * protection. If that fails to reclaim enough pages to
+                        * satisfy the reclaim goal, we come back and override
+                        * the best-effort low protection. However, we still
+                        * ideally want to honor how well-behaved groups are in
+                        * that case instead of simply punishing them all
+                        * equally. As such, we reclaim them based on how much
+                        * memory they are using, reducing the scan pressure
+                        * again by how much of the total memory used is under
+                        * hard protection.
+                        */
+                       unsigned long cgroup_size = mem_cgroup_size(memcg);
+
+                       /* Avoid TOCTOU with earlier protection check */
+                       cgroup_size = max(cgroup_size, protection);
+
+                       scan = lruvec_size - lruvec_size * protection /
+                               cgroup_size;
+
+                       /*
+                        * Minimally target SWAP_CLUSTER_MAX pages to keep
+                        * reclaim moving forwards, avoiding decrementing
+                        * sc->priority further than desirable.
+                        */
+                       scan = max(scan, SWAP_CLUSTER_MAX);
+               } else {
+                       scan = lruvec_size;
+               }
+
+               scan >>= sc->priority;
 
-               size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
-               scan = size >> sc->priority;
                /*
                 * If the cgroup's already been deleted, make sure to
                 * scrape out the remaining cache.
                 */
                if (!scan && !mem_cgroup_online(memcg))
-                       scan = min(size, SWAP_CLUSTER_MAX);
+                       scan = min(lruvec_size, SWAP_CLUSTER_MAX);
 
                switch (scan_balance) {
                case SCAN_EQUAL:
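The scan target thus shrinks in proportion to the share of the cgroup's usage that sits under its protection threshold. Here is a standalone sketch with hypothetical numbers, all in pages as with the page_counter-backed mem_cgroup_protection() and mem_cgroup_size(); scan_target() is an invented helper mirroring the hunk above:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

/* Models the proportional part of get_scan_count(); all sizes in pages. */
static unsigned long scan_target(unsigned long lruvec_size,
                                 unsigned long protection,
                                 unsigned long cgroup_size,
                                 int priority)
{
        unsigned long scan;

        if (protection) {
                /* Avoid TOCTOU: usage may have dropped below protection. */
                if (cgroup_size < protection)
                        cgroup_size = protection;

                /* Reduce pressure by the protected share of usage. */
                scan = lruvec_size - lruvec_size * protection / cgroup_size;

                /* Keep reclaim moving even when almost fully protected. */
                if (scan < SWAP_CLUSTER_MAX)
                        scan = SWAP_CLUSTER_MAX;
        } else {
                scan = lruvec_size;
        }

        return scan >> priority;
}

int main(void)
{
        /*
         * Hypothetical cgroup using 1 GiB (262144 pages) with memory.low
         * protecting 768 MiB (196608 pages), scanning a 100000-page LRU
         * at DEF_PRIORITY (12).
         */
        printf("protected:   %lu\n", scan_target(100000, 196608, 262144, 12)); /* 6 */
        printf("unprotected: %lu\n", scan_target(100000, 0, 262144, 12));      /* 24 */
        return 0;
}

With 75% of its usage protected, the cgroup is scanned at a quarter of the unprotected rate, so pressure ramps up smoothly as the threshold is exceeded instead of flipping from zero to full at the boundary.
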
@@ -2489,7 +2540,7 @@ out:
                case SCAN_ANON:
                        /* Scan one type exclusively */
                        if ((scan_balance == SCAN_FILE) != file) {
-                               size = 0;
+                               lruvec_size = 0;
                                scan = 0;
                        }
                        break;
@@ -2498,7 +2549,7 @@ out:
                        BUG();
                }
 
-               *lru_pages += size;
+               *lru_pages += lruvec_size;
                nr[lru] = scan;
        }
 }
@@ -2742,6 +2793,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                                memcg_memory_event(memcg, MEMCG_LOW);
                                break;
                        case MEMCG_PROT_NONE:
+                               /*
+                                * All protection thresholds breached. We may
+                                * still choose to vary the scan pressure
+                                * applied based on how much the cgroup in
+                                * question has exceeded its protection
+                                * thresholds (see get_scan_count).
+                                */
                                break;
                        }
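
For context, the switch this comment lands in distinguishes three outcomes of the protection check. Below is a toy model of how shrink_node() treats each one; classify() and skip_memcg() are invented simplifications of mem_cgroup_protected() and the surrounding code not shown in this hunk:

#include <stdbool.h>
#include <stdio.h>

enum mem_cgroup_protection { MEMCG_PROT_NONE, MEMCG_PROT_LOW, MEMCG_PROT_MIN };

/* Toy stand-in for mem_cgroup_protected(): compare usage to the
 * effective min/low thresholds (all in pages). */
static enum mem_cgroup_protection classify(unsigned long usage,
                                           unsigned long emin,
                                           unsigned long elow)
{
        if (usage <= emin)
                return MEMCG_PROT_MIN;
        if (usage <= elow)
                return MEMCG_PROT_LOW;
        return MEMCG_PROT_NONE;
}

/* Simplified model of the shrink_node() switch: true means the memcg
 * is skipped entirely on this pass. */
static bool skip_memcg(enum mem_cgroup_protection prot, bool memcg_low_reclaim)
{
        switch (prot) {
        case MEMCG_PROT_MIN:
                return true;               /* hard protection, never reclaim */
        case MEMCG_PROT_LOW:
                return !memcg_low_reclaim; /* best effort, skip on first pass */
        case MEMCG_PROT_NONE:
        default:
                return false;              /* reclaim, with scaled pressure */
        }
}

int main(void)
{
        /* Within memory.low: skipped first, reclaimed on the override pass. */
        printf("%d %d\n", skip_memcg(classify(500, 100, 600), false),
                          skip_memcg(classify(500, 100, 600), true));
        return 0;
}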