mm: memcontrol: use per-cpu stocks for socket memory uncharging
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e09741af816f8a6d5343546ebd23eb7f25f8ab54..ca83f3854e4fcf2272474d9200cebd30c75eba82 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
  * value, and reading all cpu value can be performance bottleneck in some
  * common workload, threshold and synchronization as vmstat[] should be
  * implemented.
+ *
+ * The parameter event can be of type enum memcg_event_item or vm_event_item.
  */
 
 static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
-                                     enum memcg_event_item event)
+                                     int event)
 {
        unsigned long val = 0;
        int cpu;
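
The hunk above widens the event index from enum memcg_event_item to a plain int so the same accessor can sum per-cpu counters indexed by either enum. As a minimal userspace sketch of that idea (the enum names, the flat per-CPU array, and demo_sum_events() are all invented here and are not the memcontrol.c implementation):

#include <stdio.h>

#define NR_CPUS 4

/* Two hypothetical event namespaces sharing one counter array. */
enum memcg_demo_item { DEMO_RSS, DEMO_CACHE, NR_DEMO_ITEMS };
enum vm_demo_event { DEMO_PGFAULT = NR_DEMO_ITEMS, DEMO_PGMAJFAULT, NR_DEMO_EVENTS };

static unsigned long percpu_events[NR_CPUS][NR_DEMO_EVENTS];

/* Takes a plain int so callers may pass a value from either enum. */
static unsigned long demo_sum_events(int idx)
{
	unsigned long val = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		val += percpu_events[cpu][idx];
	return val;
}

int main(void)
{
	percpu_events[0][DEMO_RSS] = 2;
	percpu_events[3][DEMO_PGFAULT] = 5;
	printf("rss=%lu pgfault=%lu\n",
	       demo_sum_events(DEMO_RSS), demo_sum_events(DEMO_PGFAULT));
	return 0;
}
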
@@ -917,7 +919,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                struct css_task_iter it;
                struct task_struct *task;
 
-               css_task_iter_start(&iter->css, &it);
+               css_task_iter_start(&iter->css, 0, &it);
                while (!ret && (task = css_task_iter_next(&it)))
                        ret = fn(task, arg);
                css_task_iter_end(&it);
@@ -1790,6 +1792,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
        }
        stock->nr_pages += nr_pages;
 
+       if (stock->nr_pages > CHARGE_BATCH)
+               drain_stock(stock);
+
        local_irq_restore(flags);
 }
 
@@ -1915,7 +1920,7 @@ retry:
         * bypass the last charges so that they can exit quickly and
         * free their memory.
         */
-       if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+       if (unlikely(tsk_is_oom_victim(current) ||
                     fatal_signal_pending(current) ||
                     current->flags & PF_EXITING))
                goto force;
@@ -4319,6 +4324,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        }
        spin_unlock(&memcg->event_list_lock);
 
+       memcg->low = 0;
+
        memcg_offline_kmem(memcg);
        wb_memcg_offline(memcg);
 
@@ -4410,12 +4417,13 @@ enum mc_target_type {
        MC_TARGET_NONE = 0,
        MC_TARGET_PAGE,
        MC_TARGET_SWAP,
+       MC_TARGET_DEVICE,
 };
 
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t ptent)
 {
-       struct page *page = vm_normal_page(vma, addr, ptent);
+       struct page *page = _vm_normal_page(vma, addr, ptent, true);
 
        if (!page || !page_mapped(page))
                return NULL;
@@ -4432,7 +4440,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
        return page;
 }
 
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
                        pte_t ptent, swp_entry_t *entry)
 {
@@ -4441,6 +4449,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 
        if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
                return NULL;
+
+       /*
+        * Handle MEMORY_DEVICE_PRIVATE, which are ZONE_DEVICE pages belonging
+        * to a device; because they are not accessible by the CPU they are
+        * stored as special swap entries in the CPU page table.
+        */
+       if (is_device_private_entry(ent)) {
+               page = device_private_entry_to_page(ent);
+               /*
+                * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
+                * a refcount of 1 when free (unlike a normal page).
+                */
+               if (!page_ref_add_unless(page, 1, 1))
+                       return NULL;
+               return page;
+       }
+
        /*
         * Because lookup_swap_cache() updates some statistics counter,
         * we call find_get_page() with swapper_space directly.
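
In the hunk above, a device-private swap entry is resolved to its ZONE_DEVICE page and a reference is taken with page_ref_add_unless(page, 1, 1), which succeeds only when the refcount is not 1, i.e. when the page is not in the free state that device-private pages sit in. The following is a minimal C11 sketch of that add-unless refcount pattern; struct fake_page and ref_add_unless() are made-up stand-ins, not kernel interfaces:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for struct page; only the refcount matters here. */
struct fake_page {
	atomic_int refcount;
};

/*
 * Model of page_ref_add_unless(page, nr, u): atomically add nr to the
 * refcount unless it currently equals u.  Returns true if the add was
 * performed, false otherwise.
 */
static bool ref_add_unless(struct fake_page *page, int nr, int unless)
{
	int val = atomic_load(&page->refcount);

	while (val != unless) {
		/* On CAS failure, val is reloaded with the current count. */
		if (atomic_compare_exchange_weak(&page->refcount, &val, val + nr))
			return true;
	}
	return false;
}

int main(void)
{
	struct fake_page free_page = { .refcount = 1 };	/* "free" device page */
	struct fake_page used_page = { .refcount = 2 };

	printf("free page pinned: %d\n", ref_add_unless(&free_page, 1, 1)); /* 0 */
	printf("used page pinned: %d\n", ref_add_unless(&used_page, 1, 1)); /* 1 */
	return 0;
}
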
@@ -4601,6 +4626,13 @@ out:
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is
+ *     MEMORY_DEVICE_PUBLIC or MEMORY_DEVICE_PRIVATE (so a ZONE_DEVICE page
+ *     and thus not on the lru). For now such a page is charged like a regular
+ *     page would be, as for all intents and purposes it is just special
+ *     memory taking the place of a regular page.
+ *
+ *     See Documentation/vm/hmm.txt and include/linux/hmm.h
  *
  * Called with pte lock held.
  */
@@ -4629,14 +4661,20 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
                 */
                if (page->mem_cgroup == mc.from) {
                        ret = MC_TARGET_PAGE;
+                       if (is_device_private_page(page) ||
+                           is_device_public_page(page))
+                               ret = MC_TARGET_DEVICE;
                        if (target)
                                target->page = page;
                }
                if (!ret || !target)
                        put_page(page);
        }
-       /* There is a swap entry and a page doesn't exist or isn't charged */
-       if (ent.val && !ret &&
+       /*
+        * There is a swap entry and a page doesn't exist or isn't charged.
+        * But we cannot move a tail-page in a THP.
+        */
+       if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
            mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
                ret = MC_TARGET_SWAP;
                if (target)
@@ -4647,8 +4685,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
- * We don't consider swapping or file mapped pages because THP does not
- * support them for now.
+ * We don't consider PMD mapped swapping or file mapped pages because THP does
+ * not support them for now.
  * Caller should make sure that pmd_trans_huge(pmd) is true.
  */
 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@ -4657,6 +4695,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
        struct page *page = NULL;
        enum mc_target_type ret = MC_TARGET_NONE;
 
+       if (unlikely(is_swap_pmd(pmd))) {
+               VM_BUG_ON(thp_migration_supported() &&
+                                 !is_pmd_migration_entry(pmd));
+               return ret;
+       }
        page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
        if (!(mc.flags & MOVE_ANON))
@@ -4688,6 +4731,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
+               /*
+                * Note there can not be MC_TARGET_DEVICE for now as we do not
+                * support transparent huge pages with MEMORY_DEVICE_PUBLIC or
+                * MEMORY_DEVICE_PRIVATE, but this might change.
+                */
                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                        mc.precharge += HPAGE_PMD_NR;
                spin_unlock(ptl);
@@ -4903,6 +4951,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                                putback_lru_page(page);
                        }
                        put_page(page);
+               } else if (target_type == MC_TARGET_DEVICE) {
+                       page = target.page;
+                       if (!mem_cgroup_move_account(page, true,
+                                                    mc.from, mc.to)) {
+                               mc.precharge -= HPAGE_PMD_NR;
+                               mc.moved_charge += HPAGE_PMD_NR;
+                       }
+                       put_page(page);
                }
                spin_unlock(ptl);
                return 0;
@@ -4914,12 +4970,16 @@ retry:
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; addr += PAGE_SIZE) {
                pte_t ptent = *(pte++);
+               bool device = false;
                swp_entry_t ent;
 
                if (!mc.precharge)
                        break;
 
                switch (get_mctgt_type(vma, addr, ptent, &target)) {
+               case MC_TARGET_DEVICE:
+                       device = true;
+                       /* fall through */
                case MC_TARGET_PAGE:
                        page = target.page;
                        /*
@@ -4930,7 +4990,7 @@ retry:
                         */
                        if (PageTransCompound(page))
                                goto put;
-                       if (isolate_lru_page(page))
+                       if (!device && isolate_lru_page(page))
                                goto put;
                        if (!mem_cgroup_move_account(page, false,
                                                mc.from, mc.to)) {
@@ -4938,7 +4998,8 @@ retry:
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
                        }
-                       putback_lru_page(page);
+                       if (!device)
+                               putback_lru_page(page);
 put:                   /* get_mctgt_type() gets the page */
                        put_page(page);
                        break;
@@ -5423,7 +5484,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                 * in turn serializes uncharging.
                 */
                VM_BUG_ON_PAGE(!PageLocked(page), page);
-               if (page->mem_cgroup)
+               if (compound_head(page)->mem_cgroup)
                        goto out;
 
                if (do_swap_account) {
@@ -5528,48 +5589,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
        cancel_charge(memcg, nr_pages);
 }
 
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
-                          unsigned long nr_anon, unsigned long nr_file,
-                          unsigned long nr_kmem, unsigned long nr_huge,
-                          unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+       struct mem_cgroup *memcg;
+       unsigned long pgpgout;
+       unsigned long nr_anon;
+       unsigned long nr_file;
+       unsigned long nr_kmem;
+       unsigned long nr_huge;
+       unsigned long nr_shmem;
+       struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
 {
-       unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+       memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
+{
+       unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
        unsigned long flags;
 
-       if (!mem_cgroup_is_root(memcg)) {
-               page_counter_uncharge(&memcg->memory, nr_pages);
+       if (!mem_cgroup_is_root(ug->memcg)) {
+               page_counter_uncharge(&ug->memcg->memory, nr_pages);
                if (do_memsw_account())
-                       page_counter_uncharge(&memcg->memsw, nr_pages);
-               if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
-                       page_counter_uncharge(&memcg->kmem, nr_kmem);
-               memcg_oom_recover(memcg);
+                       page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+               if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+                       page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+               memcg_oom_recover(ug->memcg);
        }
 
        local_irq_save(flags);
-       __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
-       __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
-       __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
-       __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
-       __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
-       __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-       memcg_check_events(memcg, dummy_page);
+       __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+       __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+       __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+       __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+       __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+       __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+       memcg_check_events(ug->memcg, ug->dummy_page);
        local_irq_restore(flags);
 
-       if (!mem_cgroup_is_root(memcg))
-               css_put_many(&memcg->css, nr_pages);
+       if (!mem_cgroup_is_root(ug->memcg))
+               css_put_many(&ug->memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+       if (!page->mem_cgroup)
+               return;
+
+       /*
+        * Nobody should be changing or seriously looking at
+        * page->mem_cgroup at this point, we have fully
+        * exclusive access to the page.
+        */
+
+       if (ug->memcg != page->mem_cgroup) {
+               if (ug->memcg) {
+                       uncharge_batch(ug);
+                       uncharge_gather_clear(ug);
+               }
+               ug->memcg = page->mem_cgroup;
+       }
+
+       if (!PageKmemcg(page)) {
+               unsigned int nr_pages = 1;
+
+               if (PageTransHuge(page)) {
+                       nr_pages <<= compound_order(page);
+                       ug->nr_huge += nr_pages;
+               }
+               if (PageAnon(page))
+                       ug->nr_anon += nr_pages;
+               else {
+                       ug->nr_file += nr_pages;
+                       if (PageSwapBacked(page))
+                               ug->nr_shmem += nr_pages;
+               }
+               ug->pgpgout++;
+       } else {
+               ug->nr_kmem += 1 << compound_order(page);
+               __ClearPageKmemcg(page);
+       }
+
+       ug->dummy_page = page;
+       page->mem_cgroup = NULL;
 }
 
 static void uncharge_list(struct list_head *page_list)
 {
-       struct mem_cgroup *memcg = NULL;
-       unsigned long nr_shmem = 0;
-       unsigned long nr_anon = 0;
-       unsigned long nr_file = 0;
-       unsigned long nr_huge = 0;
-       unsigned long nr_kmem = 0;
-       unsigned long pgpgout = 0;
+       struct uncharge_gather ug;
        struct list_head *next;
-       struct page *page;
+
+       uncharge_gather_clear(&ug);
 
        /*
         * Note that the list can be a single page->lru; hence the
@@ -5577,57 +5692,16 @@ static void uncharge_list(struct list_head *page_list)
         */
        next = page_list->next;
        do {
+               struct page *page;
+
                page = list_entry(next, struct page, lru);
                next = page->lru.next;
 
-               VM_BUG_ON_PAGE(PageLRU(page), page);
-               VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
-
-               if (!page->mem_cgroup)
-                       continue;
-
-               /*
-                * Nobody should be changing or seriously looking at
-                * page->mem_cgroup at this point, we have fully
-                * exclusive access to the page.
-                */
-
-               if (memcg != page->mem_cgroup) {
-                       if (memcg) {
-                               uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-                                              nr_kmem, nr_huge, nr_shmem, page);
-                               pgpgout = nr_anon = nr_file = nr_kmem = 0;
-                               nr_huge = nr_shmem = 0;
-                       }
-                       memcg = page->mem_cgroup;
-               }
-
-               if (!PageKmemcg(page)) {
-                       unsigned int nr_pages = 1;
-
-                       if (PageTransHuge(page)) {
-                               nr_pages <<= compound_order(page);
-                               nr_huge += nr_pages;
-                       }
-                       if (PageAnon(page))
-                               nr_anon += nr_pages;
-                       else {
-                               nr_file += nr_pages;
-                               if (PageSwapBacked(page))
-                                       nr_shmem += nr_pages;
-                       }
-                       pgpgout++;
-               } else {
-                       nr_kmem += 1 << compound_order(page);
-                       __ClearPageKmemcg(page);
-               }
-
-               page->mem_cgroup = NULL;
+               uncharge_page(page, &ug);
        } while (next != page_list);
 
-       if (memcg)
-               uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-                              nr_kmem, nr_huge, nr_shmem, page);
+       if (ug.memcg)
+               uncharge_batch(&ug);
 }
 
 /**
@@ -5639,6 +5713,8 @@ static void uncharge_list(struct list_head *page_list)
  */
 void mem_cgroup_uncharge(struct page *page)
 {
+       struct uncharge_gather ug;
+
        if (mem_cgroup_disabled())
                return;
 
@@ -5646,8 +5722,9 @@ void mem_cgroup_uncharge(struct page *page)
        if (!page->mem_cgroup)
                return;
 
-       INIT_LIST_HEAD(&page->lru);
-       uncharge_list(&page->lru);
+       uncharge_gather_clear(&ug);
+       uncharge_page(page, &ug);
+       uncharge_batch(&ug);
 }
 
 /**
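
The refactor above replaces the seven accumulator variables of uncharge_list() with a struct uncharge_gather that is cleared, filled per page, and flushed whenever the owning memcg changes or the walk ends; mem_cgroup_uncharge() then reuses the same helpers for a single page instead of faking a one-entry list. A minimal userspace sketch of that gather-and-flush pattern follows, with invented group/item types (nothing here is the memcg API):

#include <stdio.h>
#include <string.h>

struct group { const char *name; };

struct item {
	struct group *grp;
	unsigned long pages;
};

/* Accumulator for consecutive items that belong to the same group. */
struct gather {
	struct group *grp;
	unsigned long pages;
};

static void gather_clear(struct gather *g)
{
	memset(g, 0, sizeof(*g));
}

/* Flush the accumulated total in one operation per group. */
static void flush_batch(const struct gather *g)
{
	printf("uncharge %lu pages from %s\n", g->pages, g->grp->name);
}

/* Fold one item into the gather, flushing first if the group changed. */
static void gather_item(const struct item *it, struct gather *g)
{
	if (g->grp != it->grp) {
		if (g->grp) {
			flush_batch(g);
			gather_clear(g);
		}
		g->grp = it->grp;
	}
	g->pages += it->pages;
}

int main(void)
{
	struct group a = { "A" }, b = { "B" };
	struct item items[] = { { &a, 1 }, { &a, 4 }, { &b, 2 } };
	struct gather g;

	gather_clear(&g);
	for (size_t i = 0; i < sizeof(items) / sizeof(items[0]); i++)
		gather_item(&items[i], &g);
	if (g.grp)
		flush_batch(&g);
	return 0;
}
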
@@ -5812,8 +5889,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 
        this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
 
-       page_counter_uncharge(&memcg->memory, nr_pages);
-       css_put_many(&memcg->css, nr_pages);
+       refill_stock(memcg, nr_pages);
 }
 
 static int __init cgroup_memory(char *s)
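
This is the hunk the subject line refers to: instead of returning socket pages straight to the page counter and dropping css references one call at a time, the uncharge now parks the pages in the current CPU's charge stock via refill_stock(), and the refill_stock() hunk earlier caps that stock by draining it once it exceeds CHARGE_BATCH. Below is a minimal single-threaded sketch of a capped per-CPU stock; the names, the batch size of 32, and the global counter are stand-ins chosen for illustration, not the kernel's definitions:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_BATCH 32UL	/* stand-in batch size, not the kernel's CHARGE_BATCH */

/* Shared page counter that every CPU charges against. */
static unsigned long global_charged;

/* Per-CPU cache of pages that are charged but currently unused. */
struct stock { unsigned long nr_pages; };

/* Uncharge the cached pages from the shared counter. */
static void drain_stock(struct stock *stock)
{
	global_charged -= stock->nr_pages;
	stock->nr_pages = 0;
}

/* Satisfy a charge from the local stock when possible. */
static bool consume_stock(struct stock *stock, unsigned long nr_pages)
{
	if (stock->nr_pages < nr_pages)
		return false;
	stock->nr_pages -= nr_pages;
	return true;
}

/*
 * Uncharging no longer touches the shared counter directly: the pages
 * stay charged and are parked in the local stock, capped at DEMO_BATCH.
 */
static void uncharge_skmem(struct stock *stock, unsigned long nr_pages)
{
	stock->nr_pages += nr_pages;
	if (stock->nr_pages > DEMO_BATCH)
		drain_stock(stock);
}

int main(void)
{
	struct stock cpu0 = { 0 };

	global_charged += 8;			/* charge 8 pages the slow way */
	uncharge_skmem(&cpu0, 8);		/* park them instead of uncharging */
	printf("stock hit: %d\n", consume_stock(&cpu0, 4));
	printf("charged=%lu stocked=%lu\n", global_charged, cpu0.nr_pages);
	return 0;
}
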
@@ -5906,6 +5982,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 {
        struct mem_cgroup *memcg, *swap_memcg;
+       unsigned int nr_entries;
        unsigned short oldid;
 
        VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5926,19 +6003,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * ancestor for the swap instead and transfer the memory+swap charge.
         */
        swap_memcg = mem_cgroup_id_get_online(memcg);
-       oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
+       nr_entries = hpage_nr_pages(page);
+       /* Get references for the tail pages, too */
+       if (nr_entries > 1)
+               mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+       oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
+                                  nr_entries);
        VM_BUG_ON_PAGE(oldid, page);
-       mem_cgroup_swap_statistics(swap_memcg, 1);
+       mem_cgroup_swap_statistics(swap_memcg, nr_entries);
 
        page->mem_cgroup = NULL;
 
        if (!mem_cgroup_is_root(memcg))
-               page_counter_uncharge(&memcg->memory, 1);
+               page_counter_uncharge(&memcg->memory, nr_entries);
 
        if (memcg != swap_memcg) {
                if (!mem_cgroup_is_root(swap_memcg))
-                       page_counter_charge(&swap_memcg->memsw, 1);
-               page_counter_uncharge(&memcg->memsw, 1);
+                       page_counter_charge(&swap_memcg->memsw, nr_entries);
+               page_counter_uncharge(&memcg->memsw, nr_entries);
        }
 
        /*
@@ -5948,7 +6030,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * only synchronisation we have for updating the per-CPU variables.
         */
        VM_BUG_ON(!irqs_disabled());
-       mem_cgroup_charge_statistics(memcg, page, false, -1);
+       mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
+                                    -nr_entries);
        memcg_check_events(memcg, page);
 
        if (!mem_cgroup_is_root(memcg))