mm: memcontrol: use per-cpu stocks for socket memory uncharging

[sfrench/cifs-2.6.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 6532b219b22239a268783d399a7ffe0385ee4ccf..ca83f3854e4fcf2272474d9200cebd30c75eba82 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1792,6 +1792,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
         }
         stock->nr_pages += nr_pages;
  
+       if (stock->nr_pages > CHARGE_BATCH)
+               drain_stock(stock);
+
         local_irq_restore(flags);
  }
  
@@ -4414,12 +4417,13 @@ enum mc_target_type {
         MC_TARGET_NONE = 0,
         MC_TARGET_PAGE,
         MC_TARGET_SWAP,
+       MC_TARGET_DEVICE,
  };
  
  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
                                                 unsigned long addr, pte_t ptent)
  {
-       struct page *page = vm_normal_page(vma, addr, ptent);
+       struct page *page = _vm_normal_page(vma, addr, ptent, true);
  
         if (!page || !page_mapped(page))
                 return NULL;
@@ -4436,7 +4440,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
         return page;
  }
  
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
                         pte_t ptent, swp_entry_t *entry)
  {
@@ -4445,6 +4449,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  
         if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
                 return NULL;
+
+       /*
+        * Handle MEMORY_DEVICE_PRIVATE pages, which are ZONE_DEVICE pages
+        * belonging to a device. Because they are not accessible by the CPU,
+        * they are stored as special swap entries in the CPU page table.
+        */
+       if (is_device_private_entry(ent)) {
+               page = device_private_entry_to_page(ent);
+               /*
+                * MEMORY_DEVICE_PRIVATE pages are ZONE_DEVICE pages, which have
+                * a refcount of 1 when free (unlike normal pages).
+                */
+               if (!page_ref_add_unless(page, 1, 1))
+                       return NULL;
+               return page;
+       }
+
         /*
          * Because lookup_swap_cache() updates some statistics counter,
          * we call find_get_page() with swapper_space directly.
@@ -4605,6 +4626,13 @@ out:
   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
   *     target for charge migration. if @target is not NULL, the entry is stored
   *     in target->ent.
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE, but the page is
+ *     MEMORY_DEVICE_PUBLIC or MEMORY_DEVICE_PRIVATE (so a ZONE_DEVICE page and
+ *     thus not on the LRU). For now such a page is charged like a regular page
+ *     would be, as for all intents and purposes it is just special memory
+ *     taking the place of a regular page.
+ *
+ *     See Documentation/vm/hmm.txt and include/linux/hmm.h
   *
   * Called with pte lock held.
   */
@@ -4633,6 +4661,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
                  */
                 if (page->mem_cgroup == mc.from) {
                         ret = MC_TARGET_PAGE;
+                       if (is_device_private_page(page) ||
+                           is_device_public_page(page))
+                               ret = MC_TARGET_DEVICE;
                         if (target)
                                 target->page = page;
                 }
@@ -4664,6 +4695,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
         struct page *page = NULL;
         enum mc_target_type ret = MC_TARGET_NONE;
  
+       if (unlikely(is_swap_pmd(pmd))) {
+               VM_BUG_ON(thp_migration_supported() &&
+                                 !is_pmd_migration_entry(pmd));
+               return ret;
+       }
         page = pmd_page(pmd);
         VM_BUG_ON_PAGE(!page || !PageHead(page), page);
         if (!(mc.flags & MOVE_ANON))
@@ -4695,6 +4731,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
  
         ptl = pmd_trans_huge_lock(pmd, vma);
         if (ptl) {
+               /*
+                * Note there cannot be MC_TARGET_DEVICE for now as we do not
+                * support transparent huge pages with MEMORY_DEVICE_PUBLIC or
+                * MEMORY_DEVICE_PRIVATE, but this might change.
+                */
                 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                         mc.precharge += HPAGE_PMD_NR;
                 spin_unlock(ptl);
@@ -4910,6 +4951,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                                 putback_lru_page(page);
                         }
                         put_page(page);
+               } else if (target_type == MC_TARGET_DEVICE) {
+                       page = target.page;
+                       if (!mem_cgroup_move_account(page, true,
+                                                    mc.from, mc.to)) {
+                               mc.precharge -= HPAGE_PMD_NR;
+                               mc.moved_charge += HPAGE_PMD_NR;
+                       }
+                       put_page(page);
                 }
                 spin_unlock(ptl);
                 return 0;
@@ -4921,12 +4970,16 @@ retry:
         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
         for (; addr != end; addr += PAGE_SIZE) {
                 pte_t ptent = *(pte++);
+               bool device = false;
                 swp_entry_t ent;
  
                 if (!mc.precharge)
                         break;
  
                 switch (get_mctgt_type(vma, addr, ptent, &target)) {
+               case MC_TARGET_DEVICE:
+                       device = true;
+                       /* fall through */
                 case MC_TARGET_PAGE:
                         page = target.page;
                         /*
@@ -4937,7 +4990,7 @@ retry:
                          */
                         if (PageTransCompound(page))
                                 goto put;
-                       if (isolate_lru_page(page))
+                       if (!device && isolate_lru_page(page))
                                 goto put;
                         if (!mem_cgroup_move_account(page, false,
                                                 mc.from, mc.to)) {
@@ -4945,7 +4998,8 @@ retry:
                                 /* we uncharge from mc.from later. */
                                 mc.moved_charge++;
                         }
-                       putback_lru_page(page);
+                       if (!device)
+                               putback_lru_page(page);
  put:                   /* get_mctgt_type() gets the page */
                         put_page(page);
                         break;
@@ -5535,48 +5589,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
         cancel_charge(memcg, nr_pages);
  }
  
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
-                          unsigned long nr_anon, unsigned long nr_file,
-                          unsigned long nr_kmem, unsigned long nr_huge,
-                          unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+       struct mem_cgroup *memcg;
+       unsigned long pgpgout;
+       unsigned long nr_anon;
+       unsigned long nr_file;
+       unsigned long nr_kmem;
+       unsigned long nr_huge;
+       unsigned long nr_shmem;
+       struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
+{
+       memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
  {
-       unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+       unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
         unsigned long flags;
  
-       if (!mem_cgroup_is_root(memcg)) {
-               page_counter_uncharge(&memcg->memory, nr_pages);
+       if (!mem_cgroup_is_root(ug->memcg)) {
+               page_counter_uncharge(&ug->memcg->memory, nr_pages);
                 if (do_memsw_account())
-                       page_counter_uncharge(&memcg->memsw, nr_pages);
-               if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
-                       page_counter_uncharge(&memcg->kmem, nr_kmem);
-               memcg_oom_recover(memcg);
+                       page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+               if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+                       page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+               memcg_oom_recover(ug->memcg);
         }
  
         local_irq_save(flags);
-       __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
-       __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
-       __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
-       __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
-       __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
-       __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-       memcg_check_events(memcg, dummy_page);
+       __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+       __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+       __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+       __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+       __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+       __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+       memcg_check_events(ug->memcg, ug->dummy_page);
         local_irq_restore(flags);
  
-       if (!mem_cgroup_is_root(memcg))
-               css_put_many(&memcg->css, nr_pages);
+       if (!mem_cgroup_is_root(ug->memcg))
+               css_put_many(&ug->memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+       if (!page->mem_cgroup)
+               return;
+
+       /*
+        * Nobody should be changing or seriously looking at
+        * page->mem_cgroup at this point, we have fully
+        * exclusive access to the page.
+        */
+
+       if (ug->memcg != page->mem_cgroup) {
+               if (ug->memcg) {
+                       uncharge_batch(ug);
+                       uncharge_gather_clear(ug);
+               }
+               ug->memcg = page->mem_cgroup;
+       }
+
+       if (!PageKmemcg(page)) {
+               unsigned int nr_pages = 1;
+
+               if (PageTransHuge(page)) {
+                       nr_pages <<= compound_order(page);
+                       ug->nr_huge += nr_pages;
+               }
+               if (PageAnon(page))
+                       ug->nr_anon += nr_pages;
+               else {
+                       ug->nr_file += nr_pages;
+                       if (PageSwapBacked(page))
+                               ug->nr_shmem += nr_pages;
+               }
+               ug->pgpgout++;
+       } else {
+               ug->nr_kmem += 1 << compound_order(page);
+               __ClearPageKmemcg(page);
+       }
+
+       ug->dummy_page = page;
+       page->mem_cgroup = NULL;
  }
  
  static void uncharge_list(struct list_head *page_list)
  {
-       struct mem_cgroup *memcg = NULL;
-       unsigned long nr_shmem = 0;
-       unsigned long nr_anon = 0;
-       unsigned long nr_file = 0;
-       unsigned long nr_huge = 0;
-       unsigned long nr_kmem = 0;
-       unsigned long pgpgout = 0;
+       struct uncharge_gather ug;
         struct list_head *next;
-       struct page *page;
+
+       uncharge_gather_clear(&ug);
  
         /*
          * Note that the list can be a single page->lru; hence the
@@ -5584,57 +5692,16 @@ static void uncharge_list(struct list_head *page_list)
          */
         next = page_list->next;
         do {
+               struct page *page;
+
                 page = list_entry(next, struct page, lru);
                 next = page->lru.next;
  
-               VM_BUG_ON_PAGE(PageLRU(page), page);
-               VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
-
-               if (!page->mem_cgroup)
-                       continue;
-
-               /*
-                * Nobody should be changing or seriously looking at
-                * page->mem_cgroup at this point, we have fully
-                * exclusive access to the page.
-                */
-
-               if (memcg != page->mem_cgroup) {
-                       if (memcg) {
-                               uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-                                              nr_kmem, nr_huge, nr_shmem, page);
-                               pgpgout = nr_anon = nr_file = nr_kmem = 0;
-                               nr_huge = nr_shmem = 0;
-                       }
-                       memcg = page->mem_cgroup;
-               }
-
-               if (!PageKmemcg(page)) {
-                       unsigned int nr_pages = 1;
-
-                       if (PageTransHuge(page)) {
-                               nr_pages <<= compound_order(page);
-                               nr_huge += nr_pages;
-                       }
-                       if (PageAnon(page))
-                               nr_anon += nr_pages;
-                       else {
-                               nr_file += nr_pages;
-                               if (PageSwapBacked(page))
-                                       nr_shmem += nr_pages;
-                       }
-                       pgpgout++;
-               } else {
-                       nr_kmem += 1 << compound_order(page);
-                       __ClearPageKmemcg(page);
-               }
-
-               page->mem_cgroup = NULL;
+               uncharge_page(page, &ug);
         } while (next != page_list);
  
-       if (memcg)
-               uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-                              nr_kmem, nr_huge, nr_shmem, page);
+       if (ug.memcg)
+               uncharge_batch(&ug);
  }
  
  /**
@@ -5646,6 +5713,8 @@ static void uncharge_list(struct list_head *page_list)
   */
  void mem_cgroup_uncharge(struct page *page)
  {
+       struct uncharge_gather ug;
+
         if (mem_cgroup_disabled())
                 return;
  
@@ -5653,8 +5722,9 @@ void mem_cgroup_uncharge(struct page *page)
         if (!page->mem_cgroup)
                 return;
  
-       INIT_LIST_HEAD(&page->lru);
-       uncharge_list(&page->lru);
+       uncharge_gather_clear(&ug);
+       uncharge_page(page, &ug);
+       uncharge_batch(&ug);
  }
  
  /**
@@ -5819,8 +5889,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
  
         this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
  
-       page_counter_uncharge(&memcg->memory, nr_pages);
-       css_put_many(&memcg->css, nr_pages);
+       refill_stock(memcg, nr_pages);
  }
  
  static int __init cgroup_memory(char *s)