mm: memcontrol: use per-cpu stocks for socket memory uncharging
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3df3c04d73ab08e3bbb663f25b2e195b396d2149..ca83f3854e4fcf2272474d9200cebd30c75eba82 100644
@@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
  * value, and reading all cpu values can be a performance bottleneck in some
  * common workloads, a threshold and synchronization as in vmstat[] should be
  * implemented.
+ *
+ * The parameter @event can be of type enum memcg_event_item or vm_event_item.
  */
 
 static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
-                                     enum memcg_event_item event)
+                                     int event)
 {
        unsigned long val = 0;
        int cpu;
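
Widening the parameter to a plain int lets the same per-cpu summing helper be indexed by either enumeration (in that era's headers the memcg_event_item values begin at NR_VM_EVENT_ITEMS, so both enums address the same events[] array -- an assumption about code not shown in this hunk). A hedged illustration of two such calls, not the call sites touched by this patch:

	/* both enums index the same per-cpu events[] array */
	unsigned long faults = memcg_sum_events(memcg, PGFAULT);   /* vm_event_item */
	unsigned long ooms   = memcg_sum_events(memcg, MEMCG_OOM); /* memcg_event_item */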
@@ -917,7 +919,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                struct css_task_iter it;
                struct task_struct *task;
 
-               css_task_iter_start(&iter->css, &it);
+               css_task_iter_start(&iter->css, 0, &it);
                while (!ret && (task = css_task_iter_next(&it)))
                        ret = fn(task, arg);
                css_task_iter_end(&it);
@@ -1611,9 +1613,13 @@ cleanup:
  * @page: the page
  *
  * This function protects unlocked LRU pages from being moved to
- * another cgroup and stabilizes their page->mem_cgroup binding.
+ * another cgroup.
+ *
+ * It ensures the lifetime of the returned memcg. The caller is responsible
+ * for the lifetime of the page; __unlock_page_memcg() is available
+ * when @page might get freed inside the locked section.
  */
-void lock_page_memcg(struct page *page)
+struct mem_cgroup *lock_page_memcg(struct page *page)
 {
        struct mem_cgroup *memcg;
        unsigned long flags;
@@ -1622,18 +1628,24 @@ void lock_page_memcg(struct page *page)
         * The RCU lock is held throughout the transaction.  The fast
         * path can get away without acquiring the memcg->move_lock
         * because page moving starts with an RCU grace period.
-        */
+        *
+        * The RCU lock also protects the memcg from being freed when
+        * the page state that is going to change is the only thing
+        * preventing the page itself from being freed. E.g. writeback
+        * doesn't hold a page reference and relies on PG_writeback to
+        * keep off truncation, migration and so forth.
+        */
        rcu_read_lock();
 
        if (mem_cgroup_disabled())
-               return;
+               return NULL;
 again:
        memcg = page->mem_cgroup;
        if (unlikely(!memcg))
-               return;
+               return NULL;
 
        if (atomic_read(&memcg->moving_account) <= 0)
-               return;
+               return memcg;
 
        spin_lock_irqsave(&memcg->move_lock, flags);
        if (memcg != page->mem_cgroup) {
@@ -1649,18 +1661,18 @@ again:
        memcg->move_lock_task = current;
        memcg->move_lock_flags = flags;
 
-       return;
+       return memcg;
 }
 EXPORT_SYMBOL(lock_page_memcg);
 
 /**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
- * @page: the page
+ * __unlock_page_memcg - unlock and unpin a memcg
+ * @memcg: the memcg
+ *
+ * Unlock and unpin a memcg returned by lock_page_memcg().
  */
-void unlock_page_memcg(struct page *page)
+void __unlock_page_memcg(struct mem_cgroup *memcg)
 {
-       struct mem_cgroup *memcg = page->mem_cgroup;
-
        if (memcg && memcg->move_lock_task == current) {
                unsigned long flags = memcg->move_lock_flags;
 
@@ -1672,6 +1684,15 @@ void unlock_page_memcg(struct page *page)
 
        rcu_read_unlock();
 }
+
+/**
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
+ */
+void unlock_page_memcg(struct page *page)
+{
+       __unlock_page_memcg(page->mem_cgroup);
+}
 EXPORT_SYMBOL(unlock_page_memcg);
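
The returned memcg and __unlock_page_memcg() exist for callers that may render the page freeable while still holding the lock, with writeback completion as the motivating case named in the comment above. A minimal caller sketch under that assumption, not code from this patch:

	struct mem_cgroup *memcg;

	memcg = lock_page_memcg(page);
	/* update memcg state, then clear the flag (e.g. PG_writeback) that was
	 * keeping the page alive; @page must not be dereferenced after that */
	__unlock_page_memcg(memcg);	/* unlock via the pinned memcg, not the page */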
 
 /*
@@ -1771,6 +1792,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
        }
        stock->nr_pages += nr_pages;
 
+       if (stock->nr_pages > CHARGE_BATCH)
+               drain_stock(stock);
+
        local_irq_restore(flags);
 }
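
The new cap matters because socket-memory uncharges now feed the stock as well (see mem_cgroup_uncharge_skmem() further down); draining once the stock exceeds CHARGE_BATCH keeps the cached charges bounded. A rough worst-case estimate, assuming CHARGE_BATCH is 32 pages, 4 KiB pages and one cached memcg per CPU:

	/*
	 * charges parked in per-cpu stocks for a single memcg:
	 *	num_possible_cpus() * CHARGE_BATCH * PAGE_SIZE
	 *	e.g. 64 * 32 * 4 KiB = 8 MiB at worst on a 64-CPU machine
	 */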
 
@@ -1896,7 +1920,7 @@ retry:
         * bypass the last charges so that they can exit quickly and
         * free their memory.
         */
-       if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+       if (unlikely(tsk_is_oom_victim(current) ||
                     fatal_signal_pending(current) ||
                     current->flags & PF_EXITING))
                goto force;
@@ -4300,6 +4324,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        }
        spin_unlock(&memcg->event_list_lock);
 
+       memcg->low = 0;
+
        memcg_offline_kmem(memcg);
        wb_memcg_offline(memcg);
 
@@ -4391,12 +4417,13 @@ enum mc_target_type {
        MC_TARGET_NONE = 0,
        MC_TARGET_PAGE,
        MC_TARGET_SWAP,
+       MC_TARGET_DEVICE,
 };
 
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t ptent)
 {
-       struct page *page = vm_normal_page(vma, addr, ptent);
+       struct page *page = _vm_normal_page(vma, addr, ptent, true);
 
        if (!page || !page_mapped(page))
                return NULL;
@@ -4413,7 +4440,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
        return page;
 }
 
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
                        pte_t ptent, swp_entry_t *entry)
 {
@@ -4422,6 +4449,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 
        if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
                return NULL;
+
+       /*
+        * Handle MEMORY_DEVICE_PRIVATE, i.e. ZONE_DEVICE pages belonging to
+        * a device; because they are not accessible by the CPU they are stored
+        * as special swap entries in the CPU page table.
+        */
+       if (is_device_private_entry(ent)) {
+               page = device_private_entry_to_page(ent);
+               /*
+                * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
+                * a refcount of 1 when free (unlike a normal page).
+                */
+               if (!page_ref_add_unless(page, 1, 1))
+                       return NULL;
+               return page;
+       }
+
        /*
         * Because lookup_swap_cache() updates some statistics counter,
         * we call find_get_page() with swapper_space directly.
@@ -4582,6 +4626,13 @@ out:
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PUBLIC
+ *     or MEMORY_DEVICE_PRIVATE (i.e. a ZONE_DEVICE page and thus not on the lru).
+ *     For now such a page is charged like a regular page would be, as for all
+ *     intents and purposes it is just special memory taking the place of a
+ *     regular page.
+ *
+ *     See Documentation/vm/hmm.txt and include/linux/hmm.h
  *
  * Called with pte lock held.
  */
@@ -4610,14 +4661,20 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
                 */
                if (page->mem_cgroup == mc.from) {
                        ret = MC_TARGET_PAGE;
+                       if (is_device_private_page(page) ||
+                           is_device_public_page(page))
+                               ret = MC_TARGET_DEVICE;
                        if (target)
                                target->page = page;
                }
                if (!ret || !target)
                        put_page(page);
        }
-       /* There is a swap entry and a page doesn't exist or isn't charged */
-       if (ent.val && !ret &&
+       /*
+        * There is a swap entry and a page doesn't exist or isn't charged.
+        * But we cannot move a tail-page in a THP.
+        */
+       if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
            mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
                ret = MC_TARGET_SWAP;
                if (target)
@@ -4628,8 +4685,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
- * We don't consider swapping or file mapped pages because THP does not
- * support them for now.
+ * We don't consider PMD-mapped swapping or file-mapped pages because THP does
+ * not support them for now.
  * Caller should make sure that pmd_trans_huge(pmd) is true.
  */
 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@ -4638,6 +4695,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
        struct page *page = NULL;
        enum mc_target_type ret = MC_TARGET_NONE;
 
+       if (unlikely(is_swap_pmd(pmd))) {
+               VM_BUG_ON(thp_migration_supported() &&
+                                 !is_pmd_migration_entry(pmd));
+               return ret;
+       }
        page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
        if (!(mc.flags & MOVE_ANON))
@@ -4669,6 +4731,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
+               /*
+                * Note there cannot be MC_TARGET_DEVICE for now, as we do not
+                * support transparent huge pages with MEMORY_DEVICE_PUBLIC or
+                * MEMORY_DEVICE_PRIVATE, but this might change.
+                */
                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                        mc.precharge += HPAGE_PMD_NR;
                spin_unlock(ptl);
@@ -4884,6 +4951,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                                putback_lru_page(page);
                        }
                        put_page(page);
+               } else if (target_type == MC_TARGET_DEVICE) {
+                       page = target.page;
+                       if (!mem_cgroup_move_account(page, true,
+                                                    mc.from, mc.to)) {
+                               mc.precharge -= HPAGE_PMD_NR;
+                               mc.moved_charge += HPAGE_PMD_NR;
+                       }
+                       put_page(page);
                }
                spin_unlock(ptl);
                return 0;
@@ -4895,12 +4970,16 @@ retry:
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; addr += PAGE_SIZE) {
                pte_t ptent = *(pte++);
+               bool device = false;
                swp_entry_t ent;
 
                if (!mc.precharge)
                        break;
 
                switch (get_mctgt_type(vma, addr, ptent, &target)) {
+               case MC_TARGET_DEVICE:
+                       device = true;
+                       /* fall through */
                case MC_TARGET_PAGE:
                        page = target.page;
                        /*
@@ -4911,7 +4990,7 @@ retry:
                         */
                        if (PageTransCompound(page))
                                goto put;
-                       if (isolate_lru_page(page))
+                       if (!device && isolate_lru_page(page))
                                goto put;
                        if (!mem_cgroup_move_account(page, false,
                                                mc.from, mc.to)) {
@@ -4919,7 +4998,8 @@ retry:
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
                        }
-                       putback_lru_page(page);
+                       if (!device)
+                               putback_lru_page(page);
 put:                   /* get_mctgt_type() gets the page */
                        put_page(page);
                        break;
@@ -5404,7 +5484,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                 * in turn serializes uncharging.
                 */
                VM_BUG_ON_PAGE(!PageLocked(page), page);
-               if (page->mem_cgroup)
+               if (compound_head(page)->mem_cgroup)
                        goto out;
 
                if (do_swap_account) {
@@ -5509,48 +5589,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
        cancel_charge(memcg, nr_pages);
 }
 
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
-                          unsigned long nr_anon, unsigned long nr_file,
-                          unsigned long nr_kmem, unsigned long nr_huge,
-                          unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+       struct mem_cgroup *memcg;
+       unsigned long pgpgout;
+       unsigned long nr_anon;
+       unsigned long nr_file;
+       unsigned long nr_kmem;
+       unsigned long nr_huge;
+       unsigned long nr_shmem;
+       struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
+{
+       memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
 {
-       unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+       unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
        unsigned long flags;
 
-       if (!mem_cgroup_is_root(memcg)) {
-               page_counter_uncharge(&memcg->memory, nr_pages);
+       if (!mem_cgroup_is_root(ug->memcg)) {
+               page_counter_uncharge(&ug->memcg->memory, nr_pages);
                if (do_memsw_account())
-                       page_counter_uncharge(&memcg->memsw, nr_pages);
-               if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
-                       page_counter_uncharge(&memcg->kmem, nr_kmem);
-               memcg_oom_recover(memcg);
+                       page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+               if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+                       page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+               memcg_oom_recover(ug->memcg);
        }
 
        local_irq_save(flags);
-       __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
-       __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
-       __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
-       __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
-       __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
-       __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-       memcg_check_events(memcg, dummy_page);
+       __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+       __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+       __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+       __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+       __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+       __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+       memcg_check_events(ug->memcg, ug->dummy_page);
        local_irq_restore(flags);
 
-       if (!mem_cgroup_is_root(memcg))
-               css_put_many(&memcg->css, nr_pages);
+       if (!mem_cgroup_is_root(ug->memcg))
+               css_put_many(&ug->memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+       if (!page->mem_cgroup)
+               return;
+
+       /*
+        * Nobody should be changing or seriously looking at
+        * page->mem_cgroup at this point, we have fully
+        * exclusive access to the page.
+        */
+
+       if (ug->memcg != page->mem_cgroup) {
+               if (ug->memcg) {
+                       uncharge_batch(ug);
+                       uncharge_gather_clear(ug);
+               }
+               ug->memcg = page->mem_cgroup;
+       }
+
+       if (!PageKmemcg(page)) {
+               unsigned int nr_pages = 1;
+
+               if (PageTransHuge(page)) {
+                       nr_pages <<= compound_order(page);
+                       ug->nr_huge += nr_pages;
+               }
+               if (PageAnon(page))
+                       ug->nr_anon += nr_pages;
+               else {
+                       ug->nr_file += nr_pages;
+                       if (PageSwapBacked(page))
+                               ug->nr_shmem += nr_pages;
+               }
+               ug->pgpgout++;
+       } else {
+               ug->nr_kmem += 1 << compound_order(page);
+               __ClearPageKmemcg(page);
+       }
+
+       ug->dummy_page = page;
+       page->mem_cgroup = NULL;
 }
 
 static void uncharge_list(struct list_head *page_list)
 {
-       struct mem_cgroup *memcg = NULL;
-       unsigned long nr_shmem = 0;
-       unsigned long nr_anon = 0;
-       unsigned long nr_file = 0;
-       unsigned long nr_huge = 0;
-       unsigned long nr_kmem = 0;
-       unsigned long pgpgout = 0;
+       struct uncharge_gather ug;
        struct list_head *next;
-       struct page *page;
+
+       uncharge_gather_clear(&ug);
 
        /*
         * Note that the list can be a single page->lru; hence the
@@ -5558,57 +5692,16 @@ static void uncharge_list(struct list_head *page_list)
         */
        next = page_list->next;
        do {
+               struct page *page;
+
                page = list_entry(next, struct page, lru);
                next = page->lru.next;
 
-               VM_BUG_ON_PAGE(PageLRU(page), page);
-               VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
-
-               if (!page->mem_cgroup)
-                       continue;
-
-               /*
-                * Nobody should be changing or seriously looking at
-                * page->mem_cgroup at this point, we have fully
-                * exclusive access to the page.
-                */
-
-               if (memcg != page->mem_cgroup) {
-                       if (memcg) {
-                               uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-                                              nr_kmem, nr_huge, nr_shmem, page);
-                               pgpgout = nr_anon = nr_file = nr_kmem = 0;
-                               nr_huge = nr_shmem = 0;
-                       }
-                       memcg = page->mem_cgroup;
-               }
-
-               if (!PageKmemcg(page)) {
-                       unsigned int nr_pages = 1;
-
-                       if (PageTransHuge(page)) {
-                               nr_pages <<= compound_order(page);
-                               nr_huge += nr_pages;
-                       }
-                       if (PageAnon(page))
-                               nr_anon += nr_pages;
-                       else {
-                               nr_file += nr_pages;
-                               if (PageSwapBacked(page))
-                                       nr_shmem += nr_pages;
-                       }
-                       pgpgout++;
-               } else {
-                       nr_kmem += 1 << compound_order(page);
-                       __ClearPageKmemcg(page);
-               }
-
-               page->mem_cgroup = NULL;
+               uncharge_page(page, &ug);
        } while (next != page_list);
 
-       if (memcg)
-               uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-                              nr_kmem, nr_huge, nr_shmem, page);
+       if (ug.memcg)
+               uncharge_batch(&ug);
 }
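
For illustration, how the gather behaves when consecutive pages on the list belong to different cgroups (A and B are placeholder memcgs): uncharge_page() flushes and restarts the batch on every memcg change, and the caller flushes the final partial batch.

	/*
	 * list:  A, A, B, A
	 *   page 1: ug.memcg = A, accumulate
	 *   page 2: accumulate into the A batch
	 *   page 3: memcg changed -> uncharge_batch(A), clear, ug.memcg = B
	 *   page 4: memcg changed -> uncharge_batch(B), clear, ug.memcg = A
	 *   done:   ug.memcg != NULL -> final uncharge_batch(A)
	 */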
 
 /**
@@ -5620,6 +5713,8 @@ static void uncharge_list(struct list_head *page_list)
  */
 void mem_cgroup_uncharge(struct page *page)
 {
+       struct uncharge_gather ug;
+
        if (mem_cgroup_disabled())
                return;
 
@@ -5627,8 +5722,9 @@ void mem_cgroup_uncharge(struct page *page)
        if (!page->mem_cgroup)
                return;
 
-       INIT_LIST_HEAD(&page->lru);
-       uncharge_list(&page->lru);
+       uncharge_gather_clear(&ug);
+       uncharge_page(page, &ug);
+       uncharge_batch(&ug);
 }
 
 /**
@@ -5793,8 +5889,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 
        this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
 
-       page_counter_uncharge(&memcg->memory, nr_pages);
-       css_put_many(&memcg->css, nr_pages);
+       refill_stock(memcg, nr_pages);
 }
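
This is the change the subject line refers to: instead of handing socket-memory pages straight back to the page counter and dropping css references on every uncharge, the pages are parked in the current CPU's stock, where the next charge on that CPU can consume them without touching the shared counters. A minimal, self-contained sketch of that caching pattern; the names and the single-counter simplification are illustrative, not memcontrol.c's own code:

	#include <linux/atomic.h>
	#include <linux/types.h>

	#define STOCK_BATCH	32UL		/* stands in for CHARGE_BATCH */

	struct pcpu_stock {			/* one instance per CPU in the real code */
		unsigned long cached;		/* pages charged but held back locally */
	};

	/* uncharge path: park pages locally, drain only when the cache overflows */
	static void stock_refill(struct pcpu_stock *stock, atomic_long_t *counter,
				 unsigned long nr_pages)
	{
		stock->cached += nr_pages;
		if (stock->cached > STOCK_BATCH) {
			atomic_long_sub(stock->cached, counter);	/* one batched uncharge */
			stock->cached = 0;
		}
	}

	/* charge fast path: satisfy the request from the local cache when possible */
	static bool stock_consume(struct pcpu_stock *stock, unsigned long nr_pages)
	{
		if (stock->cached < nr_pages)
			return false;		/* caller falls back to the shared counter */
		stock->cached -= nr_pages;
		return true;
	}

In memcontrol.c the corresponding pieces are, as far as this patch shows, refill_stock()/drain_stock() on the uncharge side and consume_stock() inside try_charge() on the charge side, all operating on the local CPU with interrupts disabled.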
 
 static int __init cgroup_memory(char *s)
@@ -5887,6 +5982,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 {
        struct mem_cgroup *memcg, *swap_memcg;
+       unsigned int nr_entries;
        unsigned short oldid;
 
        VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5907,19 +6003,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * ancestor for the swap instead and transfer the memory+swap charge.
         */
        swap_memcg = mem_cgroup_id_get_online(memcg);
-       oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
+       nr_entries = hpage_nr_pages(page);
+       /* Get references for the tail pages, too */
+       if (nr_entries > 1)
+               mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+       oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
+                                  nr_entries);
        VM_BUG_ON_PAGE(oldid, page);
-       mem_cgroup_swap_statistics(swap_memcg, 1);
+       mem_cgroup_swap_statistics(swap_memcg, nr_entries);
 
        page->mem_cgroup = NULL;
 
        if (!mem_cgroup_is_root(memcg))
-               page_counter_uncharge(&memcg->memory, 1);
+               page_counter_uncharge(&memcg->memory, nr_entries);
 
        if (memcg != swap_memcg) {
                if (!mem_cgroup_is_root(swap_memcg))
-                       page_counter_charge(&swap_memcg->memsw, 1);
-               page_counter_uncharge(&memcg->memsw, 1);
+                       page_counter_charge(&swap_memcg->memsw, nr_entries);
+               page_counter_uncharge(&memcg->memsw, nr_entries);
        }
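	/*
	 * Illustrative numbers for the batching above, assuming a 2 MiB THP
	 * with 4 KiB base pages (configuration assumed, not shown here):
	 *   nr_entries = hpage_nr_pages(page) = 512
	 *   -> mem_cgroup_id_get_many() takes 511 extra id refs for the tail entries
	 *   -> swap_cgroup_record() marks all 512 swap entries in one call
	 *   -> memory is uncharged and memsw transferred in one 512-page step
	 */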
 
        /*
@@ -5929,7 +6030,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * only synchronisation we have for updating the per-CPU variables.
         */
        VM_BUG_ON(!irqs_disabled());
-       mem_cgroup_charge_statistics(memcg, page, false, -1);
+       mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
+                                    -nr_entries);
        memcg_check_events(memcg, page);
 
        if (!mem_cgroup_is_root(memcg))