Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu

author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Jul 2021 00:17:24 +0000 (17:17 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Jul 2021 00:17:24 +0000 (17:17 -0700)
diff --combined include/linux/memcontrol.h

index 6d66037be6466881156af7efa185b9f86426d837,8ef51c58f470350863bc0a7c07214d4ec387e6af..bfe5c486f4add865bcef9ed13f087dab3eb0eba8
--- 1/include/linux/memcontrol.h
--- 2/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@@ -76,27 -76,10 +76,27 @@@ enum mem_cgroup_events_target 
   };
   
   struct memcg_vmstats_percpu {
- -      long stat[MEMCG_NR_STAT];
- -      unsigned long events[NR_VM_EVENT_ITEMS];
- -      unsigned long nr_page_events;
- -      unsigned long targets[MEM_CGROUP_NTARGETS];
+ +      /* Local (CPU and cgroup) page state & events */
+ +      long                    state[MEMCG_NR_STAT];
+ +      unsigned long           events[NR_VM_EVENT_ITEMS];
+ +
+ +      /* Delta calculation for lockless upward propagation */
+ +      long                    state_prev[MEMCG_NR_STAT];
+ +      unsigned long           events_prev[NR_VM_EVENT_ITEMS];
+ +
+ +      /* Cgroup1: threshold notifications & softlimit tree updates */
+ +      unsigned long           nr_page_events;
+ +      unsigned long           targets[MEM_CGROUP_NTARGETS];
+ +};
+ +
+ +struct memcg_vmstats {
+ +      /* Aggregated (CPU and subtree) page state & events */
+ +      long                    state[MEMCG_NR_STAT];
+ +      unsigned long           events[NR_VM_EVENT_ITEMS];
+ +
+ +      /* Pending child counts during tree propagation */
+ +      long                    state_pending[MEMCG_NR_STAT];
+ +      unsigned long           events_pending[NR_VM_EVENT_ITEMS];
   };
   
   struct mem_cgroup_reclaim_iter {
@@@ -114,13 -97,12 +114,13 @@@ struct batched_lruvec_stat 
   };
   
   /*
- - * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
- - * which have elements charged to this memcg.
+ + * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
+ + * shrinkers, which have elements charged to this memcg.
    */
- -struct memcg_shrinker_map {
+ +struct shrinker_info {
         struct rcu_head rcu;
- -      unsigned long map[];
+ +      atomic_long_t *nr_deferred;
+ +      unsigned long *map;
   };
   
   /*
@@@ -146,7 -128,7 +146,7 @@@ struct mem_cgroup_per_node 
   
         struct mem_cgroup_reclaim_iter  iter;
   
- -      struct memcg_shrinker_map __rcu *shrinker_map;
+ +      struct shrinker_info __rcu      *shrinker_info;
   
         struct rb_node          tree_node;      /* RB tree node */
         unsigned long           usage_in_excess;/* Set to the value by which */
@@@ -192,7 -174,7 +192,7 @@@ enum memcg_kmem_state 
   struct memcg_padding {
         char x[0];
   } ____cacheline_internodealigned_in_smp;
- -#define MEMCG_PADDING(name)      struct memcg_padding name;
+ +#define MEMCG_PADDING(name)      struct memcg_padding name
   #else
   #define MEMCG_PADDING(name)
   #endif
@@@ -305,8 -287,8 +305,8 @@@ struct mem_cgroup 
   
         MEMCG_PADDING(_pad1_);
   
- -      atomic_long_t           vmstats[MEMCG_NR_STAT];
- -      atomic_long_t           vmevents[NR_VM_EVENT_ITEMS];
+ +      /* memory.stat */
+ +      struct memcg_vmstats    vmstats;
   
         /* memory.events */
         atomic_long_t           memory_events[MEMCG_NR_MEMORY_EVENTS];
@@@ -333,6 -315,10 +333,6 @@@
         atomic_t                moving_account;
         struct task_struct      *move_lock_task;
   
- -      /* Legacy local VM stats and events */
- -      struct memcg_vmstats_percpu __percpu *vmstats_local;
- -
- -      /* Subtree VM stats and events (batched updates) */
         struct memcg_vmstats_percpu __percpu *vmstats_percpu;
   
   #ifdef CONFIG_CGROUP_WRITEBACK
@@@ -349,7 -335,8 +349,7 @@@
         struct deferred_split deferred_split_queue;
   #endif
   
- -      struct mem_cgroup_per_node *nodeinfo[0];
- -      /* WARNING: nodeinfo must be the last member here */
+ +      struct mem_cgroup_per_node *nodeinfo[];
   };
   
   /*
@@@ -371,62 -358,6 +371,62 @@@ enum page_memcg_data_flags 
   
   #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
   
+ +static inline bool PageMemcgKmem(struct page *page);
+ +
+ +/*
+ + * After the initialization objcg->memcg is always pointing at
+ + * a valid memcg, but can be atomically swapped to the parent memcg.
+ + *
+ + * The caller must ensure that the returned memcg won't be released:
+ + * e.g. acquire the rcu_read_lock or css_set_lock.
+ + */
+ +static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+ +{
+ +      return READ_ONCE(objcg->memcg);
+ +}
+ +
+ +/*
+ + * __page_memcg - get the memory cgroup associated with a non-kmem page
+ + * @page: a pointer to the page struct
+ + *
+ + * Returns a pointer to the memory cgroup associated with the page,
+ + * or NULL. This function assumes that the page is known to have a
+ + * proper memory cgroup pointer. It's not safe to call this function
+ + * against some type of pages, e.g. slab pages or ex-slab pages or
+ + * kmem pages.
+ + */
+ +static inline struct mem_cgroup *__page_memcg(struct page *page)
+ +{
+ +      unsigned long memcg_data = page->memcg_data;
+ +
+ +      VM_BUG_ON_PAGE(PageSlab(page), page);
+ +      VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+ +      VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
+ +
+ +      return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ +}
+ +
+ +/*
+ + * __page_objcg - get the object cgroup associated with a kmem page
+ + * @page: a pointer to the page struct
+ + *
+ + * Returns a pointer to the object cgroup associated with the page,
+ + * or NULL. This function assumes that the page is known to have a
+ + * proper object cgroup pointer. It's not safe to call this function
+ + * against some type of pages, e.g. slab pages or ex-slab pages or
+ + * LRU pages.
+ + */
+ +static inline struct obj_cgroup *__page_objcg(struct page *page)
+ +{
+ +      unsigned long memcg_data = page->memcg_data;
+ +
+ +      VM_BUG_ON_PAGE(PageSlab(page), page);
+ +      VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+ +      VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
+ +
+ +      return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ +}
+ +
   /*
    * page_memcg - get the memory cgroup associated with a page
    * @page: a pointer to the page struct
@@@ -436,23 -367,20 +436,23 @@@
    * proper memory cgroup pointer. It's not safe to call this function
    * against some type of pages, e.g. slab pages or ex-slab pages.
    *
- - * Any of the following ensures page and memcg binding stability:
+ + * For a non-kmem page any of the following ensures page and memcg binding
+ + * stability:
+ + *
    * - the page lock
    * - LRU isolation
    * - lock_page_memcg()
    * - exclusive reference
+ + *
+ + * For a kmem page a caller should hold an rcu read lock to protect memcg
+ + * associated with a kmem page from being released.
    */
   static inline struct mem_cgroup *page_memcg(struct page *page)
   {
- -      unsigned long memcg_data = page->memcg_data;
- -
- -      VM_BUG_ON_PAGE(PageSlab(page), page);
- -      VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
- -
- -      return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ +      if (PageMemcgKmem(page))
+ +              return obj_cgroup_memcg(__page_objcg(page));
+ +      else
+ +              return __page_memcg(page);
   }
   
   /*
@@@ -466,19 -394,11 +466,19 @@@
    */
   static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
   {
+ +      unsigned long memcg_data = READ_ONCE(page->memcg_data);
+ +
         VM_BUG_ON_PAGE(PageSlab(page), page);
         WARN_ON_ONCE(!rcu_read_lock_held());
   
- -      return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) &
- -                                   ~MEMCG_DATA_FLAGS_MASK);
+ +      if (memcg_data & MEMCG_DATA_KMEM) {
+ +              struct obj_cgroup *objcg;
+ +
+ +              objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ +              return obj_cgroup_memcg(objcg);
+ +      }
+ +
+ +      return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
   }
   
   /*
@@@ -486,21 -406,15 +486,21 @@@
    * @page: a pointer to the page struct
    *
    * Returns a pointer to the memory cgroup associated with the page,
- - * or NULL. This function unlike page_memcg() can take any  page
+ + * or NULL. This function unlike page_memcg() can take any page
    * as an argument. It has to be used in cases when it's not known if a page
- - * has an associated memory cgroup pointer or an object cgroups vector.
+ + * has an associated memory cgroup pointer or an object cgroups vector or
+ + * an object cgroup.
+ + *
+ + * For a non-kmem page any of the following ensures page and memcg binding
+ + * stability:
    *
- - * Any of the following ensures page and memcg binding stability:
    * - the page lock
    * - LRU isolation
    * - lock_page_memcg()
    * - exclusive reference
+ + *
+ + * For a kmem page a caller should hold an rcu read lock to protect memcg
+ + * associated with a kmem page from being released.
    */
   static inline struct mem_cgroup *page_memcg_check(struct page *page)
   {
@@@ -513,17 -427,9 +513,17 @@@
         if (memcg_data & MEMCG_DATA_OBJCGS)
                 return NULL;
   
+ +      if (memcg_data & MEMCG_DATA_KMEM) {
+ +              struct obj_cgroup *objcg;
+ +
+ +              objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ +              return obj_cgroup_memcg(objcg);
+ +      }
+ +
         return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
   }
   
+ +#ifdef CONFIG_MEMCG_KMEM
   /*
    * PageMemcgKmem - check if the page has MemcgKmem flag set
    * @page: a pointer to the page struct
@@@ -538,6 -444,7 +538,6 @@@ static inline bool PageMemcgKmem(struc
         return page->memcg_data & MEMCG_DATA_KMEM;
   }
   
- -#ifdef CONFIG_MEMCG_KMEM
   /*
    * page_objcgs - get the object cgroups vector associated with a page
    * @page: a pointer to the page struct
@@@ -579,11 -486,6 +579,11 @@@ static inline struct obj_cgroup **page_
   }
   
   #else
+ +static inline bool PageMemcgKmem(struct page *page)
+ +{
+ +      return false;
+ +}
+ +
   static inline struct obj_cgroup **page_objcgs(struct page *page)
   {
         return NULL;
@@@ -694,15 -596,18 +694,15 @@@ static inline bool mem_cgroup_below_min
   }
   
   int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
+ +int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+ +                                gfp_t gfp, swp_entry_t entry);
+ +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
   
   void mem_cgroup_uncharge(struct page *page);
   void mem_cgroup_uncharge_list(struct list_head *page_list);
   
   void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
   
- -static struct mem_cgroup_per_node *
- -mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
- -{
- -      return memcg->nodeinfo[nid];
- -}
- -
   /**
    * mem_cgroup_lruvec - get the lru list vector for a memcg & node
    * @memcg: memcg of the wanted lruvec
@@@ -726,7 -631,7 +726,7 @@@ static inline struct lruvec *mem_cgroup
         if (!memcg)
                 memcg = root_mem_cgroup;
   
- -      mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ +      mz = memcg->nodeinfo[pgdat->node_id];
         lruvec = &mz->lruvec;
   out:
         /*
@@@ -742,18 -647,35 +742,18 @@@
   /**
    * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
    * @page: the page
- - * @pgdat: pgdat of the page
    *
    * This function relies on page->mem_cgroup being stable.
    */
- -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
- -                                              struct pglist_data *pgdat)
+ +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
   {
+ +      pg_data_t *pgdat = page_pgdat(page);
         struct mem_cgroup *memcg = page_memcg(page);
   
         VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
         return mem_cgroup_lruvec(memcg, pgdat);
   }
   
- -static inline bool lruvec_holds_page_lru_lock(struct page *page,
- -                                            struct lruvec *lruvec)
- -{
- -      pg_data_t *pgdat = page_pgdat(page);
- -      const struct mem_cgroup *memcg;
- -      struct mem_cgroup_per_node *mz;
- -
- -      if (mem_cgroup_disabled())
- -              return lruvec == &pgdat->__lruvec;
- -
- -      mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- -      memcg = page_memcg(page) ? : root_mem_cgroup;
- -
- -      return lruvec->pgdat == pgdat && mz->memcg == memcg;
- -}
- -
   struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
   
   struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
@@@ -786,15 -708,21 +786,15 @@@ static inline void obj_cgroup_get(struc
         percpu_ref_get(&objcg->refcnt);
   }
   
- -static inline void obj_cgroup_put(struct obj_cgroup *objcg)
+ +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
+ +                                     unsigned long nr)
   {
- -      percpu_ref_put(&objcg->refcnt);
+ +      percpu_ref_get_many(&objcg->refcnt, nr);
   }
   
- -/*
- - * After the initialization objcg->memcg is always pointing at
- - * a valid memcg, but can be atomically swapped to the parent memcg.
- - *
- - * The caller must ensure that the returned memcg won't be released:
- - * e.g. acquire the rcu_read_lock or css_set_lock.
- - */
- -static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+ +static inline void obj_cgroup_put(struct obj_cgroup *objcg)
   {
- -      return READ_ONCE(objcg->memcg);
+ +      percpu_ref_put(&objcg->refcnt);
   }
   
   static inline void mem_cgroup_put(struct mem_cgroup *memcg)
@@@ -939,9 -867,43 +939,9 @@@ void mem_cgroup_print_oom_group(struct 
   extern bool cgroup_memory_noswap;
   #endif
   
- -struct mem_cgroup *lock_page_memcg(struct page *page);
- -void __unlock_page_memcg(struct mem_cgroup *memcg);
+ +void lock_page_memcg(struct page *page);
   void unlock_page_memcg(struct page *page);
   
- -/*
- - * idx can be of type enum memcg_stat_item or node_stat_item.
- - * Keep in sync with memcg_exact_page_state().
- - */
- -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
- -{
- -      long x = atomic_long_read(&memcg->vmstats[idx]);
- -#ifdef CONFIG_SMP
- -      if (x < 0)
- -              x = 0;
- -#endif
- -      return x;
- -}
- -
- -/*
- - * idx can be of type enum memcg_stat_item or node_stat_item.
- - * Keep in sync with memcg_exact_page_state().
- - */
- -static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
- -                                                 int idx)
- -{
- -      long x = 0;
- -      int cpu;
- -
- -      for_each_possible_cpu(cpu)
- -              x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
- -#ifdef CONFIG_SMP
- -      if (x < 0)
- -              x = 0;
- -#endif
- -      return x;
- -}
- -
   void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
   
   /* idx can be of type enum memcg_stat_item or node_stat_item */
@@@ -1017,6 -979,10 +1017,6 @@@ static inline void mod_memcg_lruvec_sta
         local_irq_restore(flags);
   }
   
- -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
- -                                              gfp_t gfp_mask,
- -                                              unsigned long *total_scanned);
- -
   void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                           unsigned long count);
   
@@@ -1097,15 -1063,13 +1097,15 @@@ static inline void memcg_memory_event_m
   
   void split_page_memcg(struct page *head, unsigned int nr);
   
+ +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+ +                                              gfp_t gfp_mask,
+ +                                              unsigned long *total_scanned);
+ +
   #else /* CONFIG_MEMCG */
   
   #define MEM_CGROUP_ID_SHIFT   0
   #define MEM_CGROUP_ID_MAX     0
   
- -struct mem_cgroup;
- -
   static inline struct mem_cgroup *page_memcg(struct page *page)
   {
         return NULL;
@@@ -1175,16 -1139,6 +1175,16 @@@ static inline int mem_cgroup_charge(str
         return 0;
   }
   
+ +static inline int mem_cgroup_swapin_charge_page(struct page *page,
+ +                      struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
+ +{
+ +      return 0;
+ +}
+ +
+ +static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+ +{
+ +}
+ +
   static inline void mem_cgroup_uncharge(struct page *page)
   {
   }
@@@ -1203,15 -1157,18 +1203,15 @@@ static inline struct lruvec *mem_cgroup
         return &pgdat->__lruvec;
   }
   
- -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
- -                                                  struct pglist_data *pgdat)
+ +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
   {
+ +      pg_data_t *pgdat = page_pgdat(page);
+ +
         return &pgdat->__lruvec;
   }
   
- -static inline bool lruvec_holds_page_lru_lock(struct page *page,
- -                                            struct lruvec *lruvec)
+ +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
   {
- -      pg_data_t *pgdat = page_pgdat(page);
- -
- -      return lruvec == &pgdat->__lruvec;
   }
   
   static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
@@@ -1230,12 -1187,6 +1230,12 @@@ static inline struct mem_cgroup *get_me
         return NULL;
   }
   
+ +static inline
+ +struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
+ +{
+ +      return NULL;
+ +}
+ +
   static inline void mem_cgroup_put(struct mem_cgroup *memcg)
   {
   }
@@@ -1338,7 -1289,12 +1338,7 @@@ mem_cgroup_print_oom_meminfo(struct mem
   {
   }
   
- -static inline struct mem_cgroup *lock_page_memcg(struct page *page)
- -{
- -      return NULL;
- -}
- -
- -static inline void __unlock_page_memcg(struct mem_cgroup *memcg)
+ +static inline void lock_page_memcg(struct page *page)
   {
   }
   
@@@ -1378,6 -1334,17 +1378,6 @@@ static inline void mem_cgroup_print_oom
   {
   }
   
- -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
- -{
- -      return 0;
- -}
- -
- -static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
- -                                                 int idx)
- -{
- -      return 0;
- -}
- -
   static inline void __mod_memcg_state(struct mem_cgroup *memcg,
                                      int idx,
                                      int nr)
@@@ -1423,6 -1390,18 +1423,6 @@@ static inline void mod_lruvec_kmem_stat
         mod_node_page_state(page_pgdat(page), idx, val);
   }
   
- -static inline
- -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
- -                                          gfp_t gfp_mask,
- -                                          unsigned long *total_scanned)
- -{
- -      return 0;
- -}
- -
- -static inline void split_page_memcg(struct page *head, unsigned int nr)
- -{
- -}
- -
   static inline void count_memcg_events(struct mem_cgroup *memcg,
                                       enum vm_event_item idx,
                                       unsigned long count)
@@@ -1445,17 -1424,9 +1445,17 @@@ void count_memcg_event_mm(struct mm_str
   {
   }
   
- -static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+ +static inline void split_page_memcg(struct page *head, unsigned int nr)
   {
   }
+ +
+ +static inline
+ +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+ +                                          gfp_t gfp_mask,
+ +                                          unsigned long *total_scanned)
+ +{
+ +      return 0;
+ +}
   #endif /* CONFIG_MEMCG */
   
   static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
@@@ -1497,19 -1468,12 +1497,19 @@@ static inline void unlock_page_lruvec_i
         spin_unlock_irqrestore(&lruvec->lru_lock, flags);
   }
   
+ +/* Test requires a stable page->memcg binding, see page_memcg() */
+ +static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec)
+ +{
+ +      return lruvec_pgdat(lruvec) == page_pgdat(page) &&
+ +             lruvec_memcg(lruvec) == page_memcg(page);
+ +}
+ +
   /* Don't lock again iff page's lruvec locked */
   static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
                 struct lruvec *locked_lruvec)
   {
         if (locked_lruvec) {
- -              if (lruvec_holds_page_lru_lock(page, locked_lruvec))
+ +              if (page_matches_lruvec(page, locked_lruvec))
                         return locked_lruvec;
   
                 unlock_page_lruvec_irq(locked_lruvec);
@@@ -1523,7 -1487,7 +1523,7 @@@ static inline struct lruvec *relock_pag
                 struct lruvec *locked_lruvec, unsigned long *flags)
   {
         if (locked_lruvec) {
- -              if (lruvec_holds_page_lru_lock(page, locked_lruvec))
+ +              if (page_matches_lruvec(page, locked_lruvec))
                         return locked_lruvec;
   
                 unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
@@@ -1599,10 -1563,10 +1599,10 @@@ static inline bool mem_cgroup_under_soc
         return false;
   }
   
- -extern int memcg_expand_shrinker_maps(int new_id);
- -
- -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
- -                                 int nid, int shrinker_id);
+ +int alloc_shrinker_info(struct mem_cgroup *memcg);
+ +void free_shrinker_info(struct mem_cgroup *memcg);
+ +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
+ +void reparent_shrinker_deferred(struct mem_cgroup *memcg);
   #else
   #define mem_cgroup_sockets_enabled 0
   static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
@@@ -1612,13 -1576,14 +1612,14 @@@ static inline bool mem_cgroup_under_soc
         return false;
   }
   
- -static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
- -                                        int nid, int shrinker_id)
+ +static inline void set_shrinker_bit(struct mem_cgroup *memcg,
+ +                                  int nid, int shrinker_id)
   {
   }
   #endif
   
   #ifdef CONFIG_MEMCG_KMEM
+ bool mem_cgroup_kmem_disabled(void);
   int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
   void __memcg_kmem_uncharge_page(struct page *page, int order);
   
@@@ -1672,6 -1637,10 +1673,10 @@@ static inline int memcg_cache_id(struc
   struct mem_cgroup *mem_cgroup_from_obj(void *p);
   
   #else
+ static inline bool mem_cgroup_kmem_disabled(void)
+ {
+       return true;
+ }
   
   static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                          int order)
diff --combined mm/memcontrol.c

index 4ee243ce613595c663df6403dedc1c374d90c0d5,1fa9b00ec71d90c56e12901f169f4c13bd9c1016..b80aae448a49da5bb71ca17822861decae362c4e
--- 1/mm/memcontrol.c
--- 2/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -78,17 -78,16 +78,17 @@@ struct mem_cgroup *root_mem_cgroup __re
   
   /* Active memory cgroup to use from an interrupt context */
   DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+ +EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
   
   /* Socket memory accounting disabled? */
- static bool cgroup_memory_nosocket;
+ static bool cgroup_memory_nosocket __ro_after_init;
   
   /* Kernel memory accounting disabled? */
- bool cgroup_memory_nokmem;
- -static bool cgroup_memory_nokmem __ro_after_init;
++bool cgroup_memory_nokmem __ro_after_init;
   
   /* Whether the swap controller is active */
   #ifdef CONFIG_MEMCG_SWAP
- bool cgroup_memory_noswap __read_mostly;
+ bool cgroup_memory_noswap __ro_after_init;
   #else
   #define cgroup_memory_noswap          1
   #endif
@@@ -216,7 -215,7 +216,7 @@@ enum res_type 
   #define MEMFILE_PRIVATE(x, val)       ((x) << 16 | (val))
   #define MEMFILE_TYPE(val)     ((val) >> 16 & 0xffff)
   #define MEMFILE_ATTR(val)     ((val) & 0xffff)
- -/* Used for OOM nofiier */
+ +/* Used for OOM notifier */
   #define OOM_CONTROL           (0)
   
   /*
@@@ -256,12 -255,20 +256,17 @@@ struct cgroup_subsys_state *vmpressure_
   #ifdef CONFIG_MEMCG_KMEM
   extern spinlock_t css_set_lock;
   
- -static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- -                             unsigned int nr_pages);
- -static void __memcg_kmem_uncharge(struct mem_cgroup *memcg,
- -                                unsigned int nr_pages);
+ bool mem_cgroup_kmem_disabled(void)
+ {
+       return cgroup_memory_nokmem;
+ }
+ 
+ +static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+ +                                    unsigned int nr_pages);
   
   static void obj_cgroup_release(struct percpu_ref *ref)
   {
         struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
- -      struct mem_cgroup *memcg;
         unsigned int nr_bytes;
         unsigned int nr_pages;
         unsigned long flags;
@@@ -290,11 -297,12 +295,11 @@@
         WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
         nr_pages = nr_bytes >> PAGE_SHIFT;
   
- -      spin_lock_irqsave(&css_set_lock, flags);
- -      memcg = obj_cgroup_memcg(objcg);
         if (nr_pages)
- -              __memcg_kmem_uncharge(memcg, nr_pages);
+ +              obj_cgroup_uncharge_pages(objcg, nr_pages);
+ +
+ +      spin_lock_irqsave(&css_set_lock, flags);
         list_del(&objcg->list);
- -      mem_cgroup_put(memcg);
         spin_unlock_irqrestore(&css_set_lock, flags);
   
         percpu_ref_exit(ref);
@@@ -329,12 -337,17 +334,12 @@@ static void memcg_reparent_objcgs(struc
   
         spin_lock_irq(&css_set_lock);
   
- -      /* Move active objcg to the parent's list */
- -      xchg(&objcg->memcg, parent);
- -      css_get(&parent->css);
- -      list_add(&objcg->list, &parent->objcg_list);
- -
- -      /* Move already reparented objcgs to the parent's list */
- -      list_for_each_entry(iter, &memcg->objcg_list, list) {
- -              css_get(&parent->css);
- -              xchg(&iter->memcg, parent);
- -              css_put(&memcg->css);
- -      }
+ +      /* 1) Ready to reparent active objcg. */
+ +      list_add(&objcg->list, &memcg->objcg_list);
+ +      /* 2) Reparent active objcg and already reparented objcgs to parent. */
+ +      list_for_each_entry(iter, &memcg->objcg_list, list)
+ +              WRITE_ONCE(iter->memcg, parent);
+ +      /* 3) Move already reparented objcgs to the parent's list */
         list_splice(&memcg->objcg_list, &parent->objcg_list);
   
         spin_unlock_irq(&css_set_lock);
@@@ -394,6 -407,129 +399,6 @@@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enab
   EXPORT_SYMBOL(memcg_kmem_enabled_key);
   #endif
   
- -static int memcg_shrinker_map_size;
- -static DEFINE_MUTEX(memcg_shrinker_map_mutex);
- -
- -static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
- -{
- -      kvfree(container_of(head, struct memcg_shrinker_map, rcu));
- -}
- -
- -static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
- -                                       int size, int old_size)
- -{
- -      struct memcg_shrinker_map *new, *old;
- -      int nid;
- -
- -      lockdep_assert_held(&memcg_shrinker_map_mutex);
- -
- -      for_each_node(nid) {
- -              old = rcu_dereference_protected(
- -                      mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
- -              /* Not yet online memcg */
- -              if (!old)
- -                      return 0;
- -
- -              new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
- -              if (!new)
- -                      return -ENOMEM;
- -
- -              /* Set all old bits, clear all new bits */
- -              memset(new->map, (int)0xff, old_size);
- -              memset((void *)new->map + old_size, 0, size - old_size);
- -
- -              rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
- -              call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
- -      }
- -
- -      return 0;
- -}
- -
- -static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
- -{
- -      struct mem_cgroup_per_node *pn;
- -      struct memcg_shrinker_map *map;
- -      int nid;
- -
- -      if (mem_cgroup_is_root(memcg))
- -              return;
- -
- -      for_each_node(nid) {
- -              pn = mem_cgroup_nodeinfo(memcg, nid);
- -              map = rcu_dereference_protected(pn->shrinker_map, true);
- -              kvfree(map);
- -              rcu_assign_pointer(pn->shrinker_map, NULL);
- -      }
- -}
- -
- -static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
- -{
- -      struct memcg_shrinker_map *map;
- -      int nid, size, ret = 0;
- -
- -      if (mem_cgroup_is_root(memcg))
- -              return 0;
- -
- -      mutex_lock(&memcg_shrinker_map_mutex);
- -      size = memcg_shrinker_map_size;
- -      for_each_node(nid) {
- -              map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
- -              if (!map) {
- -                      memcg_free_shrinker_maps(memcg);
- -                      ret = -ENOMEM;
- -                      break;
- -              }
- -              rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
- -      }
- -      mutex_unlock(&memcg_shrinker_map_mutex);
- -
- -      return ret;
- -}
- -
- -int memcg_expand_shrinker_maps(int new_id)
- -{
- -      int size, old_size, ret = 0;
- -      struct mem_cgroup *memcg;
- -
- -      size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
- -      old_size = memcg_shrinker_map_size;
- -      if (size <= old_size)
- -              return 0;
- -
- -      mutex_lock(&memcg_shrinker_map_mutex);
- -      if (!root_mem_cgroup)
- -              goto unlock;
- -
- -      for_each_mem_cgroup(memcg) {
- -              if (mem_cgroup_is_root(memcg))
- -                      continue;
- -              ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
- -              if (ret) {
- -                      mem_cgroup_iter_break(NULL, memcg);
- -                      goto unlock;
- -              }
- -      }
- -unlock:
- -      if (!ret)
- -              memcg_shrinker_map_size = size;
- -      mutex_unlock(&memcg_shrinker_map_mutex);
- -      return ret;
- -}
- -
- -void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
- -{
- -      if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
- -              struct memcg_shrinker_map *map;
- -
- -              rcu_read_lock();
- -              map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
- -              /* Pairs with smp mb in shrink_slab() */
- -              smp_mb__before_atomic();
- -              set_bit(shrinker_id, map->map);
- -              rcu_read_unlock();
- -      }
- -}
- -
   /**
    * mem_cgroup_css_from_page - css of the memcg associated with a page
    * @page: page of interest
@@@ -582,7 -718,7 +587,7 @@@ static void mem_cgroup_remove_from_tree
         int nid;
   
         for_each_node(nid) {
- -              mz = mem_cgroup_nodeinfo(memcg, nid);
+ +              mz = memcg->nodeinfo[nid];
                 mctz = soft_limit_tree_node(nid);
                 if (mctz)
                         mem_cgroup_remove_exceeded(mz, mctz);
@@@ -633,37 -769,28 +638,37 @@@ mem_cgroup_largest_soft_limit_node(stru
    */
   void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
   {
- -      long x, threshold = MEMCG_CHARGE_BATCH;
- -
         if (mem_cgroup_disabled())
                 return;
   
- -      if (memcg_stat_item_in_bytes(idx))
- -              threshold <<= PAGE_SHIFT;
+ +      __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+ +      cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+ +}
   
- -      x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
- -      if (unlikely(abs(x) > threshold)) {
- -              struct mem_cgroup *mi;
+ +/* idx can be of type enum memcg_stat_item or node_stat_item. */
+ +static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+ +{
+ +      long x = READ_ONCE(memcg->vmstats.state[idx]);
+ +#ifdef CONFIG_SMP
+ +      if (x < 0)
+ +              x = 0;
+ +#endif
+ +      return x;
+ +}
   
- -              /*
- -               * Batch local counters to keep them in sync with
- -               * the hierarchical ones.
- -               */
- -              __this_cpu_add(memcg->vmstats_local->stat[idx], x);
- -              for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- -                      atomic_long_add(x, &mi->vmstats[idx]);
+ +/* idx can be of type enum memcg_stat_item or node_stat_item. */
+ +static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
+ +{
+ +      long x = 0;
+ +      int cpu;
+ +
+ +      for_each_possible_cpu(cpu)
+ +              x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
+ +#ifdef CONFIG_SMP
+ +      if (x < 0)
                 x = 0;
- -      }
- -      __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+ +#endif
+ +      return x;
   }
   
   static struct mem_cgroup_per_node *
@@@ -674,7 -801,7 +679,7 @@@ parent_nodeinfo(struct mem_cgroup_per_n
         parent = parent_mem_cgroup(pn->memcg);
         if (!parent)
                 return NULL;
- -      return mem_cgroup_nodeinfo(parent, nid);
+ +      return parent->nodeinfo[nid];
   }
   
   void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
@@@ -733,22 -860,18 +738,22 @@@ void __mod_lruvec_page_state(struct pag
                              int val)
   {
         struct page *head = compound_head(page); /* rmap on tail pages */
- -      struct mem_cgroup *memcg = page_memcg(head);
+ +      struct mem_cgroup *memcg;
         pg_data_t *pgdat = page_pgdat(page);
         struct lruvec *lruvec;
   
+ +      rcu_read_lock();
+ +      memcg = page_memcg(head);
         /* Untracked pages have no memcg, no lruvec. Update only the node */
         if (!memcg) {
+ +              rcu_read_unlock();
                 __mod_node_page_state(pgdat, idx, val);
                 return;
         }
   
         lruvec = mem_cgroup_lruvec(memcg, pgdat);
         __mod_lruvec_state(lruvec, idx, val);
+ +      rcu_read_unlock();
   }
   EXPORT_SYMBOL(__mod_lruvec_page_state);
   
@@@ -776,43 -899,39 +781,43 @@@ void __mod_lruvec_kmem_state(void *p, e
         rcu_read_unlock();
   }
   
+ +/*
+ + * mod_objcg_mlstate() may be called with irq enabled, so
+ + * mod_memcg_lruvec_state() should be used.
+ + */
+ +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
+ +                                   struct pglist_data *pgdat,
+ +                                   enum node_stat_item idx, int nr)
+ +{
+ +      struct mem_cgroup *memcg;
+ +      struct lruvec *lruvec;
+ +
+ +      rcu_read_lock();
+ +      memcg = obj_cgroup_memcg(objcg);
+ +      lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ +      mod_memcg_lruvec_state(lruvec, idx, nr);
+ +      rcu_read_unlock();
+ +}
+ +
   /**
    * __count_memcg_events - account VM events in a cgroup
    * @memcg: the memory cgroup
    * @idx: the event item
- - * @count: the number of events that occured
+ + * @count: the number of events that occurred
    */
   void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                           unsigned long count)
   {
- -      unsigned long x;
- -
         if (mem_cgroup_disabled())
                 return;
   
- -      x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
- -      if (unlikely(x > MEMCG_CHARGE_BATCH)) {
- -              struct mem_cgroup *mi;
- -
- -              /*
- -               * Batch local counters to keep them in sync with
- -               * the hierarchical ones.
- -               */
- -              __this_cpu_add(memcg->vmstats_local->events[idx], x);
- -              for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- -                      atomic_long_add(x, &mi->vmevents[idx]);
- -              x = 0;
- -      }
- -      __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+ +      __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+ +      cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
   }
   
   static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
   {
- -      return atomic_long_read(&memcg->vmevents[event]);
+ +      return READ_ONCE(memcg->vmstats.events[event]);
   }
   
   static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
@@@ -821,7 -940,7 +826,7 @@@
         int cpu;
   
         for_each_possible_cpu(cpu)
- -              x += per_cpu(memcg->vmstats_local->events[event], cpu);
+ +              x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
         return x;
   }
   
@@@ -898,24 -1017,13 +903,24 @@@ struct mem_cgroup *mem_cgroup_from_task
   }
   EXPORT_SYMBOL(mem_cgroup_from_task);
   
+ +static __always_inline struct mem_cgroup *active_memcg(void)
+ +{
+ +      if (in_interrupt())
+ +              return this_cpu_read(int_active_memcg);
+ +      else
+ +              return current->active_memcg;
+ +}
+ +
   /**
    * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
    * @mm: mm from which memcg should be extracted. It can be NULL.
    *
- - * Obtain a reference on mm->memcg and returns it if successful. Otherwise
- - * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
- - * returned.
+ + * Obtains a reference on mm->memcg and returns it if successful. If mm
+ + * is NULL, then the memcg is chosen as follows:
+ + * 1) The active memcg, if set.
+ + * 2) current->mm->memcg, if available
+ + * 3) root memcg
+ + * If mem_cgroup is disabled, NULL is returned.
    */
   struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
   {
@@@ -924,38 -1032,48 +929,38 @@@
         if (mem_cgroup_disabled())
                 return NULL;
   
+ +      /*
+ +       * Page cache insertions can happen without an
+ +       * actual mm context, e.g. during disk probing
+ +       * on boot, loopback IO, acct() writes etc.
+ +       *
+ +       * No need to css_get on root memcg as the reference
+ +       * counting is disabled on the root level in the
+ +       * cgroup core. See CSS_NO_REF.
+ +       */
+ +      if (unlikely(!mm)) {
+ +              memcg = active_memcg();
+ +              if (unlikely(memcg)) {
+ +                      /* remote memcg must hold a ref */
+ +                      css_get(&memcg->css);
+ +                      return memcg;
+ +              }
+ +              mm = current->mm;
+ +              if (unlikely(!mm))
+ +                      return root_mem_cgroup;
+ +      }
+ +
         rcu_read_lock();
         do {
- -              /*
- -               * Page cache insertions can happen withou an
- -               * actual mm context, e.g. during disk probing
- -               * on boot, loopback IO, acct() writes etc.
- -               */
- -              if (unlikely(!mm))
+ +              memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ +              if (unlikely(!memcg))
                         memcg = root_mem_cgroup;
- -              else {
- -                      memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- -                      if (unlikely(!memcg))
- -                              memcg = root_mem_cgroup;
- -              }
         } while (!css_tryget(&memcg->css));
         rcu_read_unlock();
         return memcg;
   }
   EXPORT_SYMBOL(get_mem_cgroup_from_mm);
   
- -static __always_inline struct mem_cgroup *active_memcg(void)
- -{
- -      if (in_interrupt())
- -              return this_cpu_read(int_active_memcg);
- -      else
- -              return current->active_memcg;
- -}
- -
- -static __always_inline struct mem_cgroup *get_active_memcg(void)
- -{
- -      struct mem_cgroup *memcg;
- -
- -      rcu_read_lock();
- -      memcg = active_memcg();
- -      /* remote memcg must hold a ref. */
- -      if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
- -              memcg = root_mem_cgroup;
- -      rcu_read_unlock();
- -
- -      return memcg;
- -}
- -
   static __always_inline bool memcg_kmem_bypass(void)
   {
         /* Allow remote memcg charging from any context. */
@@@ -969,6 -1087,20 +974,6 @@@
         return false;
   }
   
- -/**
- - * If active memcg is set, do not fallback to current->mm->memcg.
- - */
- -static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
- -{
- -      if (memcg_kmem_bypass())
- -              return NULL;
- -
- -      if (unlikely(active_memcg()))
- -              return get_active_memcg();
- -
- -      return get_mem_cgroup_from_mm(current->mm);
- -}
- -
   /**
    * mem_cgroup_iter - iterate over memory cgroup hierarchy
    * @root: hierarchy root
@@@ -1009,7 -1141,7 +1014,7 @@@ struct mem_cgroup *mem_cgroup_iter(stru
         if (reclaim) {
                 struct mem_cgroup_per_node *mz;
   
- -              mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
+ +              mz = root->nodeinfo[reclaim->pgdat->node_id];
                 iter = &mz->iter;
   
                 if (prev && reclaim->generation != iter->generation)
@@@ -1111,7 -1243,7 +1116,7 @@@ static void __invalidate_reclaim_iterat
         int nid;
   
         for_each_node(nid) {
- -              mz = mem_cgroup_nodeinfo(from, nid);
+ +              mz = from->nodeinfo[nid];
                 iter = &mz->iter;
                 cmpxchg(&iter->position, dead_memcg, NULL);
         }
@@@ -1205,8 -1337,9 +1210,8 @@@ void lruvec_memcg_debug(struct lruvec *
   struct lruvec *lock_page_lruvec(struct page *page)
   {
         struct lruvec *lruvec;
- -      struct pglist_data *pgdat = page_pgdat(page);
   
- -      lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ +      lruvec = mem_cgroup_page_lruvec(page);
         spin_lock(&lruvec->lru_lock);
   
         lruvec_memcg_debug(lruvec, page);
@@@ -1217,8 -1350,9 +1222,8 @@@
   struct lruvec *lock_page_lruvec_irq(struct page *page)
   {
         struct lruvec *lruvec;
- -      struct pglist_data *pgdat = page_pgdat(page);
   
- -      lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ +      lruvec = mem_cgroup_page_lruvec(page);
         spin_lock_irq(&lruvec->lru_lock);
   
         lruvec_memcg_debug(lruvec, page);
@@@ -1229,8 -1363,9 +1234,8 @@@
   struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
   {
         struct lruvec *lruvec;
- -      struct pglist_data *pgdat = page_pgdat(page);
   
- -      lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ +      lruvec = mem_cgroup_page_lruvec(page);
         spin_lock_irqsave(&lruvec->lru_lock, *flags);
   
         lruvec_memcg_debug(lruvec, page);
@@@ -1441,7 -1576,6 +1446,7 @@@ static char *memory_stat_format(struct 
          *
          * Current memory state:
          */
+ +      cgroup_rstat_flush(memcg->css.cgroup);
   
         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
                 u64 size;
@@@ -1736,7 -1870,7 +1741,7 @@@ static void mem_cgroup_unmark_under_oom
         struct mem_cgroup *iter;
   
         /*
- -       * Be careful about under_oom underflows becase a child memcg
+ +       * Be careful about under_oom underflows because a child memcg
          * could have been added after mem_cgroup_mark_under_oom.
          */
         spin_lock(&memcg_oom_lock);
@@@ -1908,7 -2042,7 +1913,7 @@@ bool mem_cgroup_oom_synchronize(bool ha
                 /*
                  * There is no guarantee that an OOM-lock contender
                  * sees the wakeups triggered by the OOM kill
- -               * uncharges.  Wake any sleepers explicitely.
+ +               * uncharges.  Wake any sleepers explicitly.
                  */
                 memcg_oom_recover(memcg);
         }
@@@ -1989,10 -2123,11 +1994,10 @@@ void mem_cgroup_print_oom_group(struct 
    * This function protects unlocked LRU pages from being moved to
    * another cgroup.
    *
- - * It ensures lifetime of the returned memcg. Caller is responsible
- - * for the lifetime of the page; __unlock_page_memcg() is available
- - * when @page might get freed inside the locked section.
+ + * It ensures lifetime of the locked memcg. Caller is responsible
+ + * for the lifetime of the page.
    */
- -struct mem_cgroup *lock_page_memcg(struct page *page)
+ +void lock_page_memcg(struct page *page)
   {
         struct page *head = compound_head(page); /* rmap on tail pages */
         struct mem_cgroup *memcg;
@@@ -2002,15 -2137,21 +2007,15 @@@
          * The RCU lock is held throughout the transaction.  The fast
          * path can get away without acquiring the memcg->move_lock
          * because page moving starts with an RCU grace period.
- -       *
- -       * The RCU lock also protects the memcg from being freed when
- -       * the page state that is going to change is the only thing
- -       * preventing the page itself from being freed. E.g. writeback
- -       * doesn't hold a page reference and relies on PG_writeback to
- -       * keep off truncation, migration and so forth.
            */
         rcu_read_lock();
   
         if (mem_cgroup_disabled())
- -              return NULL;
+ +              return;
   again:
         memcg = page_memcg(head);
         if (unlikely(!memcg))
- -              return NULL;
+ +              return;
   
   #ifdef CONFIG_PROVE_LOCKING
         local_irq_save(flags);
@@@ -2019,7 -2160,7 +2024,7 @@@
   #endif
   
         if (atomic_read(&memcg->moving_account) <= 0)
- -              return memcg;
+ +              return;
   
         spin_lock_irqsave(&memcg->move_lock, flags);
         if (memcg != page_memcg(head)) {
@@@ -2028,17 -2169,24 +2033,17 @@@
         }
   
         /*
- -       * When charge migration first begins, we can have locked and
- -       * unlocked page stat updates happening concurrently.  Track
- -       * the task who has the lock for unlock_page_memcg().
+ +       * When charge migration first begins, we can have multiple
+ +       * critical sections holding the fast-path RCU lock and one
+ +       * holding the slowpath move_lock. Track the task who has the
+ +       * move_lock for unlock_page_memcg().
          */
         memcg->move_lock_task = current;
         memcg->move_lock_flags = flags;
- -
- -      return memcg;
   }
   EXPORT_SYMBOL(lock_page_memcg);
   
- -/**
- - * __unlock_page_memcg - unlock and unpin a memcg
- - * @memcg: the memcg
- - *
- - * Unlock and unpin a memcg returned by lock_page_memcg().
- - */
- -void __unlock_page_memcg(struct mem_cgroup *memcg)
+ +static void __unlock_page_memcg(struct mem_cgroup *memcg)
   {
         if (memcg && memcg->move_lock_task == current) {
                 unsigned long flags = memcg->move_lock_flags;
@@@ -2064,23 -2212,14 +2069,23 @@@ void unlock_page_memcg(struct page *pag
   }
   EXPORT_SYMBOL(unlock_page_memcg);
   
- -struct memcg_stock_pcp {
- -      struct mem_cgroup *cached; /* this never be root cgroup */
- -      unsigned int nr_pages;
- -
+ +struct obj_stock {
   #ifdef CONFIG_MEMCG_KMEM
         struct obj_cgroup *cached_objcg;
+ +      struct pglist_data *cached_pgdat;
         unsigned int nr_bytes;
+ +      int nr_slab_reclaimable_b;
+ +      int nr_slab_unreclaimable_b;
+ +#else
+ +      int dummy[0];
   #endif
+ +};
+ +
+ +struct memcg_stock_pcp {
+ +      struct mem_cgroup *cached; /* this never be root cgroup */
+ +      unsigned int nr_pages;
+ +      struct obj_stock task_obj;
+ +      struct obj_stock irq_obj;
   
         struct work_struct work;
         unsigned long flags;
@@@ -2090,12 -2229,12 +2095,12 @@@ static DEFINE_PER_CPU(struct memcg_stoc
   static DEFINE_MUTEX(percpu_charge_mutex);
   
   #ifdef CONFIG_MEMCG_KMEM
- -static void drain_obj_stock(struct memcg_stock_pcp *stock);
+ +static void drain_obj_stock(struct obj_stock *stock);
   static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                      struct mem_cgroup *root_memcg);
   
   #else
- -static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+ +static inline void drain_obj_stock(struct obj_stock *stock)
   {
   }
   static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
@@@ -2105,41 -2244,6 +2110,41 @@@
   }
   #endif
   
+ +/*
+ + * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
+ + * sequence used in this case to access content from object stock is slow.
+ + * To optimize for user context access, there are now two object stocks for
+ + * task context and interrupt context access respectively.
+ + *
+ + * The task context object stock can be accessed by disabling preemption only,
+ + * which is cheap in a non-preempt kernel. The interrupt context object stock
+ + * can only be accessed after disabling interrupts. User context code can
+ + * access the interrupt object stock, but not vice versa.
+ + */
+ +static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
+ +{
+ +      struct memcg_stock_pcp *stock;
+ +
+ +      if (likely(in_task())) {
+ +              *pflags = 0UL;
+ +              preempt_disable();
+ +              stock = this_cpu_ptr(&memcg_stock);
+ +              return &stock->task_obj;
+ +      }
+ +
+ +      local_irq_save(*pflags);
+ +      stock = this_cpu_ptr(&memcg_stock);
+ +      return &stock->irq_obj;
+ +}
+ +
+ +static inline void put_obj_stock(unsigned long flags)
+ +{
+ +      if (likely(in_task()))
+ +              preempt_enable();
+ +      else
+ +              local_irq_restore(flags);
+ +}
+ +
   /**
    * consume_stock: Try to consume stocked charge on this cpu.
    * @memcg: memcg to consume from.
@@@ -2206,9 -2310,7 +2211,9 @@@ static void drain_local_stock(struct wo
         local_irq_save(flags);
   
         stock = this_cpu_ptr(&memcg_stock);
- -      drain_obj_stock(stock);
+ +      drain_obj_stock(&stock->irq_obj);
+ +      if (in_task())
+ +              drain_obj_stock(&stock->task_obj);
         drain_stock(stock);
         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
   
@@@ -2284,39 -2386,50 +2289,39 @@@ static void drain_all_stock(struct mem_
         mutex_unlock(&percpu_charge_mutex);
   }
   
- -static int memcg_hotplug_cpu_dead(unsigned int cpu)
+ +static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
   {
- -      struct memcg_stock_pcp *stock;
- -      struct mem_cgroup *memcg, *mi;
- -
- -      stock = &per_cpu(memcg_stock, cpu);
- -      drain_stock(stock);
+ +      int nid;
   
- -      for_each_mem_cgroup(memcg) {
+ +      for_each_node(nid) {
+ +              struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ +              unsigned long stat[NR_VM_NODE_STAT_ITEMS];
+ +              struct batched_lruvec_stat *lstatc;
                 int i;
   
- -              for (i = 0; i < MEMCG_NR_STAT; i++) {
- -                      int nid;
- -                      long x;
- -
- -                      x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
- -                      if (x)
- -                              for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- -                                      atomic_long_add(x, &memcg->vmstats[i]);
- -
- -                      if (i >= NR_VM_NODE_STAT_ITEMS)
- -                              continue;
+ +              lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
+ +              for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ +                      stat[i] = lstatc->count[i];
+ +                      lstatc->count[i] = 0;
+ +              }
   
- -                      for_each_node(nid) {
- -                              struct mem_cgroup_per_node *pn;
+ +              do {
+ +                      for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ +                              atomic_long_add(stat[i], &pn->lruvec_stat[i]);
+ +              } while ((pn = parent_nodeinfo(pn, nid)));
+ +      }
+ +}
   
- -                              pn = mem_cgroup_nodeinfo(memcg, nid);
- -                              x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
- -                              if (x)
- -                                      do {
- -                                              atomic_long_add(x, &pn->lruvec_stat[i]);
- -                                      } while ((pn = parent_nodeinfo(pn, nid)));
- -                      }
- -              }
+ +static int memcg_hotplug_cpu_dead(unsigned int cpu)
+ +{
+ +      struct memcg_stock_pcp *stock;
+ +      struct mem_cgroup *memcg;
   
- -              for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
- -                      long x;
+ +      stock = &per_cpu(memcg_stock, cpu);
+ +      drain_stock(stock);
   
- -                      x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
- -                      if (x)
- -                              for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- -                                      atomic_long_add(x, &memcg->vmevents[i]);
- -              }
- -      }
+ +      for_each_mem_cgroup(memcg)
+ +              memcg_flush_lruvec_page_state(memcg, cpu);
   
         return 0;
   }
@@@ -2574,8 -2687,8 +2579,8 @@@ out
         css_put(&memcg->css);
   }
   
- -static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- -                    unsigned int nr_pages)
+ +static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ +                      unsigned int nr_pages)
   {
         unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
         int nr_retries = MAX_RECLAIM_RETRIES;
@@@ -2587,6 -2700,8 +2592,6 @@@
         bool drained = false;
         unsigned long pflags;
   
- -      if (mem_cgroup_is_root(memcg))
- -              return 0;
   retry:
         if (consume_stock(memcg, nr_pages))
                 return 0;
@@@ -2683,6 -2798,9 +2688,6 @@@
         if (gfp_mask & __GFP_RETRY_MAYFAIL)
                 goto nomem;
   
- -      if (gfp_mask & __GFP_NOFAIL)
- -              goto force;
- -
         if (fatal_signal_pending(current))
                 goto force;
   
@@@ -2766,15 -2884,6 +2771,15 @@@ done_restock
         return 0;
   }
   
+ +static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ +                           unsigned int nr_pages)
+ +{
+ +      if (mem_cgroup_is_root(memcg))
+ +              return 0;
+ +
+ +      return try_charge_memcg(memcg, gfp_mask, nr_pages);
+ +}
+ +
   #if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
   static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
   {
@@@ -2801,28 -2910,7 +2806,28 @@@ static void commit_charge(struct page *
         page->memcg_data = (unsigned long)memcg;
   }
   
+ +static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+ +{
+ +      struct mem_cgroup *memcg;
+ +
+ +      rcu_read_lock();
+ +retry:
+ +      memcg = obj_cgroup_memcg(objcg);
+ +      if (unlikely(!css_tryget(&memcg->css)))
+ +              goto retry;
+ +      rcu_read_unlock();
+ +
+ +      return memcg;
+ +}
+ +
   #ifdef CONFIG_MEMCG_KMEM
+ +/*
+ + * The allocated objcg pointers array is not accounted directly.
+ + * Moreover, it should not come from a DMA buffer and is not readily
+ + * reclaimable. So those GFP bits should be masked off.
+ + */
+ +#define OBJCGS_CLEAR_MASK     (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
+ +
   int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
                                  gfp_t gfp, bool new_page)
   {
@@@ -2830,7 -2918,6 +2835,7 @@@
         unsigned long memcg_data;
         void *vec;
   
+ +      gfp &= ~OBJCGS_CLEAR_MASK;
         vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
                            page_to_nid(page));
         if (!vec)
@@@ -2969,50 -3056,28 +2974,50 @@@ static int memcg_alloc_cache_id(void
         return id;
   }
   
- -static void memcg_free_cache_id(int id)
- -{
- -      ida_simple_remove(&memcg_cache_ida, id);
+ +static void memcg_free_cache_id(int id)
+ +{
+ +      ida_simple_remove(&memcg_cache_ida, id);
+ +}
+ +
+ +/*
+ + * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg
+ + * @objcg: object cgroup to uncharge
+ + * @nr_pages: number of pages to uncharge
+ + */
+ +static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+ +                                    unsigned int nr_pages)
+ +{
+ +      struct mem_cgroup *memcg;
+ +
+ +      memcg = get_mem_cgroup_from_objcg(objcg);
+ +
+ +      if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ +              page_counter_uncharge(&memcg->kmem, nr_pages);
+ +      refill_stock(memcg, nr_pages);
+ +
+ +      css_put(&memcg->css);
   }
   
- -/**
- - * __memcg_kmem_charge: charge a number of kernel pages to a memcg
- - * @memcg: memory cgroup to charge
+ +/*
+ + * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg
+ + * @objcg: object cgroup to charge
    * @gfp: reclaim mode
    * @nr_pages: number of pages to charge
    *
    * Returns 0 on success, an error code on failure.
    */
- -static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- -                             unsigned int nr_pages)
+ +static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
+ +                                 unsigned int nr_pages)
   {
         struct page_counter *counter;
+ +      struct mem_cgroup *memcg;
         int ret;
   
- -      ret = try_charge(memcg, gfp, nr_pages);
+ +      memcg = get_mem_cgroup_from_objcg(objcg);
+ +
+ +      ret = try_charge_memcg(memcg, gfp, nr_pages);
         if (ret)
- -              return ret;
+ +              goto out;
   
         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
             !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
@@@ -3024,15 -3089,25 +3029,15 @@@
                  */
                 if (gfp & __GFP_NOFAIL) {
                         page_counter_charge(&memcg->kmem, nr_pages);
- -                      return 0;
+ +                      goto out;
                 }
                 cancel_charge(memcg, nr_pages);
- -              return -ENOMEM;
+ +              ret = -ENOMEM;
         }
- -      return 0;
- -}
- -
- -/**
- - * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
- - * @memcg: memcg to uncharge
- - * @nr_pages: number of pages to uncharge
- - */
- -static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
- -{
- -      if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- -              page_counter_uncharge(&memcg->kmem, nr_pages);
+ +out:
+ +      css_put(&memcg->css);
   
- -      refill_stock(memcg, nr_pages);
+ +      return ret;
   }
   
   /**
@@@ -3045,18 -3120,18 +3050,18 @@@
    */
   int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
   {
- -      struct mem_cgroup *memcg;
+ +      struct obj_cgroup *objcg;
         int ret = 0;
   
- -      memcg = get_mem_cgroup_from_current();
- -      if (memcg && !mem_cgroup_is_root(memcg)) {
- -              ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+ +      objcg = get_obj_cgroup_from_current();
+ +      if (objcg) {
+ +              ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
                 if (!ret) {
- -                      page->memcg_data = (unsigned long)memcg |
+ +                      page->memcg_data = (unsigned long)objcg |
                                 MEMCG_DATA_KMEM;
                         return 0;
                 }
- -              css_put(&memcg->css);
+ +              obj_cgroup_put(objcg);
         }
         return ret;
   }
@@@ -3068,93 -3143,38 +3073,93 @@@
    */
   void __memcg_kmem_uncharge_page(struct page *page, int order)
   {
- -      struct mem_cgroup *memcg = page_memcg(page);
+ +      struct obj_cgroup *objcg;
         unsigned int nr_pages = 1 << order;
   
- -      if (!memcg)
+ +      if (!PageMemcgKmem(page))
                 return;
   
- -      VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- -      __memcg_kmem_uncharge(memcg, nr_pages);
+ +      objcg = __page_objcg(page);
+ +      obj_cgroup_uncharge_pages(objcg, nr_pages);
         page->memcg_data = 0;
- -      css_put(&memcg->css);
+ +      obj_cgroup_put(objcg);
+ +}
+ +
+ +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ +                   enum node_stat_item idx, int nr)
+ +{
+ +      unsigned long flags;
+ +      struct obj_stock *stock = get_obj_stock(&flags);
+ +      int *bytes;
+ +
+ +      /*
+ +       * Save vmstat data in stock and skip vmstat array update unless
+ +       * accumulating over a page of vmstat data or when pgdat or idx
+ +       * changes.
+ +       */
+ +      if (stock->cached_objcg != objcg) {
+ +              drain_obj_stock(stock);
+ +              obj_cgroup_get(objcg);
+ +              stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ +                              ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ +              stock->cached_objcg = objcg;
+ +              stock->cached_pgdat = pgdat;
+ +      } else if (stock->cached_pgdat != pgdat) {
+ +              /* Flush the existing cached vmstat data */
+ +              if (stock->nr_slab_reclaimable_b) {
+ +                      mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B,
+ +                                        stock->nr_slab_reclaimable_b);
+ +                      stock->nr_slab_reclaimable_b = 0;
+ +              }
+ +              if (stock->nr_slab_unreclaimable_b) {
+ +                      mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B,
+ +                                        stock->nr_slab_unreclaimable_b);
+ +                      stock->nr_slab_unreclaimable_b = 0;
+ +              }
+ +              stock->cached_pgdat = pgdat;
+ +      }
+ +
+ +      bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
+ +                                             : &stock->nr_slab_unreclaimable_b;
+ +      /*
+ +       * Even for large object >= PAGE_SIZE, the vmstat data will still be
+ +       * cached locally at least once before pushing it out.
+ +       */
+ +      if (!*bytes) {
+ +              *bytes = nr;
+ +              nr = 0;
+ +      } else {
+ +              *bytes += nr;
+ +              if (abs(*bytes) > PAGE_SIZE) {
+ +                      nr = *bytes;
+ +                      *bytes = 0;
+ +              } else {
+ +                      nr = 0;
+ +              }
+ +      }
+ +      if (nr)
+ +              mod_objcg_mlstate(objcg, pgdat, idx, nr);
+ +
+ +      put_obj_stock(flags);
   }
   
   static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
   {
- -      struct memcg_stock_pcp *stock;
         unsigned long flags;
+ +      struct obj_stock *stock = get_obj_stock(&flags);
         bool ret = false;
   
- -      local_irq_save(flags);
- -
- -      stock = this_cpu_ptr(&memcg_stock);
         if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
                 stock->nr_bytes -= nr_bytes;
                 ret = true;
         }
   
- -      local_irq_restore(flags);
+ +      put_obj_stock(flags);
   
         return ret;
   }
   
- -static void drain_obj_stock(struct memcg_stock_pcp *stock)
+ +static void drain_obj_stock(struct obj_stock *stock)
   {
         struct obj_cgroup *old = stock->cached_objcg;
   
@@@ -3165,8 -3185,11 +3170,8 @@@
                 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
                 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
   
- -              if (nr_pages) {
- -                      rcu_read_lock();
- -                      __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
- -                      rcu_read_unlock();
- -              }
+ +              if (nr_pages)
+ +                      obj_cgroup_uncharge_pages(old, nr_pages);
   
                 /*
                  * The leftover is flushed to the centralized per-memcg value.
@@@ -3182,25 -3205,6 +3187,25 @@@
                 stock->nr_bytes = 0;
         }
   
+ +      /*
+ +       * Flush the vmstat data in current stock
+ +       */
+ +      if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
+ +              if (stock->nr_slab_reclaimable_b) {
+ +                      mod_objcg_mlstate(old, stock->cached_pgdat,
+ +                                        NR_SLAB_RECLAIMABLE_B,
+ +                                        stock->nr_slab_reclaimable_b);
+ +                      stock->nr_slab_reclaimable_b = 0;
+ +              }
+ +              if (stock->nr_slab_unreclaimable_b) {
+ +                      mod_objcg_mlstate(old, stock->cached_pgdat,
+ +                                        NR_SLAB_UNRECLAIMABLE_B,
+ +                                        stock->nr_slab_unreclaimable_b);
+ +                      stock->nr_slab_unreclaimable_b = 0;
+ +              }
+ +              stock->cached_pgdat = NULL;
+ +      }
+ +
         obj_cgroup_put(old);
         stock->cached_objcg = NULL;
   }
@@@ -3210,13 -3214,8 +3215,13 @@@ static bool obj_stock_flush_required(st
   {
         struct mem_cgroup *memcg;
   
- -      if (stock->cached_objcg) {
- -              memcg = obj_cgroup_memcg(stock->cached_objcg);
+ +      if (in_task() && stock->task_obj.cached_objcg) {
+ +              memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
+ +              if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ +                      return true;
+ +      }
+ +      if (stock->irq_obj.cached_objcg) {
+ +              memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
                 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
                         return true;
         }
@@@ -3224,36 -3223,31 +3229,36 @@@
         return false;
   }
   
- -static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+ +static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ +                           bool allow_uncharge)
   {
- -      struct memcg_stock_pcp *stock;
         unsigned long flags;
+ +      struct obj_stock *stock = get_obj_stock(&flags);
+ +      unsigned int nr_pages = 0;
   
- -      local_irq_save(flags);
- -
- -      stock = this_cpu_ptr(&memcg_stock);
         if (stock->cached_objcg != objcg) { /* reset if necessary */
                 drain_obj_stock(stock);
                 obj_cgroup_get(objcg);
                 stock->cached_objcg = objcg;
- -              stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+ +              stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ +                              ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ +              allow_uncharge = true;  /* Allow uncharge when objcg changes */
         }
         stock->nr_bytes += nr_bytes;
   
- -      if (stock->nr_bytes > PAGE_SIZE)
- -              drain_obj_stock(stock);
+ +      if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
+ +              nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ +              stock->nr_bytes &= (PAGE_SIZE - 1);
+ +      }
   
- -      local_irq_restore(flags);
+ +      put_obj_stock(flags);
+ +
+ +      if (nr_pages)
+ +              obj_cgroup_uncharge_pages(objcg, nr_pages);
   }
   
   int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
   {
- -      struct mem_cgroup *memcg;
         unsigned int nr_pages, nr_bytes;
         int ret;
   
@@@ -3261,44 -3255,39 +3266,44 @@@
                 return 0;
   
         /*
- -       * In theory, memcg->nr_charged_bytes can have enough
+ +       * In theory, objcg->nr_charged_bytes can have enough
          * pre-charged bytes to satisfy the allocation. However,
- -       * flushing memcg->nr_charged_bytes requires two atomic
- -       * operations, and memcg->nr_charged_bytes can't be big,
- -       * so it's better to ignore it and try grab some new pages.
- -       * memcg->nr_charged_bytes will be flushed in
- -       * refill_obj_stock(), called from this function or
- -       * independently later.
+ +       * flushing objcg->nr_charged_bytes requires two atomic
+ +       * operations, and objcg->nr_charged_bytes can't be big.
+ +       * The shared objcg->nr_charged_bytes can also become a
+ +       * performance bottleneck if all tasks of the same memcg are
+ +       * trying to update it. So it's better to ignore it and try
+ +       * grab some new pages. The stock's nr_bytes will be flushed to
+ +       * objcg->nr_charged_bytes later on when objcg changes.
+ +       *
+ +       * The stock's nr_bytes may contain enough pre-charged bytes
+ +       * to allow one less page to be charged, but we can't rely
+ +       * on the pre-charged bytes not being changed outside of
+ +       * consume_obj_stock() or refill_obj_stock(). So ignore those
+ +       * pre-charged bytes as well when charging pages. To avoid a
+ +       * page uncharge right after a page charge, we set the
+ +       * allow_uncharge flag to false when calling refill_obj_stock()
+ +       * to temporarily allow the pre-charged bytes to exceed the page
+ +       * size limit. The maximum reachable value of the pre-charged
+ +       * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
+ +       * race.
          */
- -      rcu_read_lock();
- -retry:
- -      memcg = obj_cgroup_memcg(objcg);
- -      if (unlikely(!css_tryget(&memcg->css)))
- -              goto retry;
- -      rcu_read_unlock();
- -
         nr_pages = size >> PAGE_SHIFT;
         nr_bytes = size & (PAGE_SIZE - 1);
   
         if (nr_bytes)
                 nr_pages += 1;
   
- -      ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+ +      ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
         if (!ret && nr_bytes)
- -              refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+ +              refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
   
- -      css_put(&memcg->css);
         return ret;
   }
   
   void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
   {
- -      refill_obj_stock(objcg, size);
+ +      refill_obj_stock(objcg, size, true);
   }
   
   #endif /* CONFIG_MEMCG_KMEM */
@@@ -3316,11 -3305,7 +3321,11 @@@ void split_page_memcg(struct page *head
   
         for (i = 1; i < nr; i++)
                 head[i].memcg_data = head->memcg_data;
- -      css_get_many(&memcg->css, nr - 1);
+ +
+ +      if (PageMemcgKmem(head))
+ +              obj_cgroup_get_many(__page_objcg(head), nr - 1);
+ +      else
+ +              css_get_many(&memcg->css, nr - 1);
   }
   
   #ifdef CONFIG_MEMCG_SWAP
@@@ -3569,7 -3554,6 +3574,7 @@@ static unsigned long mem_cgroup_usage(s
         unsigned long val;
   
         if (mem_cgroup_is_root(memcg)) {
+ +              cgroup_rstat_flush(memcg->css.cgroup);
                 val = memcg_page_state(memcg, NR_FILE_PAGES) +
                         memcg_page_state(memcg, NR_ANON_MAPPED);
                 if (swap)
@@@ -3634,6 -3618,57 +3639,6 @@@ static u64 mem_cgroup_read_u64(struct c
         }
   }
   
- -static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
- -{
- -      unsigned long stat[MEMCG_NR_STAT] = {0};
- -      struct mem_cgroup *mi;
- -      int node, cpu, i;
- -
- -      for_each_online_cpu(cpu)
- -              for (i = 0; i < MEMCG_NR_STAT; i++)
- -                      stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
- -
- -      for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- -              for (i = 0; i < MEMCG_NR_STAT; i++)
- -                      atomic_long_add(stat[i], &mi->vmstats[i]);
- -
- -      for_each_node(node) {
- -              struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
- -              struct mem_cgroup_per_node *pi;
- -
- -              for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- -                      stat[i] = 0;
- -
- -              for_each_online_cpu(cpu)
- -                      for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- -                              stat[i] += per_cpu(
- -                                      pn->lruvec_stat_cpu->count[i], cpu);
- -
- -              for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
- -                      for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- -                              atomic_long_add(stat[i], &pi->lruvec_stat[i]);
- -      }
- -}
- -
- -static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
- -{
- -      unsigned long events[NR_VM_EVENT_ITEMS];
- -      struct mem_cgroup *mi;
- -      int cpu, i;
- -
- -      for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- -              events[i] = 0;
- -
- -      for_each_online_cpu(cpu)
- -              for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- -                      events[i] += per_cpu(memcg->vmstats_percpu->events[i],
- -                                           cpu);
- -
- -      for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- -              for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- -                      atomic_long_add(events[i], &mi->vmevents[i]);
- -}
- -
   #ifdef CONFIG_MEMCG_KMEM
   static int memcg_online_kmem(struct mem_cgroup *memcg)
   {
@@@ -3950,8 -3985,6 +3955,8 @@@ static int memcg_numa_stat_show(struct 
         int nid;
         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
   
+ +      cgroup_rstat_flush(memcg->css.cgroup);
+ +
         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
                 seq_printf(m, "%s=%lu", stat->name,
                            mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
@@@ -4022,8 -4055,6 +4027,8 @@@ static int memcg_stat_show(struct seq_f
   
         BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
   
+ +      cgroup_rstat_flush(memcg->css.cgroup);
+ +
         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                 unsigned long nr;
   
@@@ -4082,7 -4113,7 +4087,7 @@@
                 unsigned long file_cost = 0;
   
                 for_each_online_pgdat(pgdat) {
- -                      mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ +                      mz = memcg->nodeinfo[pgdat->node_id];
   
                         anon_cost += mz->lruvec.anon_cost;
                         file_cost += mz->lruvec.file_cost;
@@@ -4111,7 -4142,7 +4116,7 @@@ static int mem_cgroup_swappiness_write(
         if (val > 100)
                 return -EINVAL;
   
- -      if (css->parent)
+ +      if (!mem_cgroup_is_root(memcg))
                 memcg->swappiness = val;
         else
                 vm_swappiness = val;
@@@ -4461,7 -4492,7 +4466,7 @@@ static int mem_cgroup_oom_control_write
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   
         /* cannot set to root cgroup and only 0 and 1 are allowed */
- -      if (!css->parent || !((val == 0) || (val == 1)))
+ +      if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
                 return -EINVAL;
   
         memcg->oom_kill_disable = val;
@@@ -4500,6 -4531,22 +4505,6 @@@ struct wb_domain *mem_cgroup_wb_domain(
         return &memcg->cgwb_domain;
   }
   
- -/*
- - * idx can be of type enum memcg_stat_item or node_stat_item.
- - * Keep in sync with memcg_exact_page().
- - */
- -static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
- -{
- -      long x = atomic_long_read(&memcg->vmstats[idx]);
- -      int cpu;
- -
- -      for_each_online_cpu(cpu)
- -              x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
- -      if (x < 0)
- -              x = 0;
- -      return x;
- -}
- -
   /**
    * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
    * @wb: bdi_writeback in question
@@@ -4525,14 -4572,13 +4530,14 @@@ void mem_cgroup_wb_stats(struct bdi_wri
         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
         struct mem_cgroup *parent;
   
- -      *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+ +      cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
   
- -      *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
- -      *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
- -                      memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
- -      *pheadroom = PAGE_COUNTER_MAX;
+ +      *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ +      *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+ +      *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
+ +                      memcg_page_state(memcg, NR_ACTIVE_FILE);
   
+ +      *pheadroom = PAGE_COUNTER_MAX;
         while ((parent = parent_mem_cgroup(memcg))) {
                 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
                                             READ_ONCE(memcg->memory.high));
@@@ -4547,7 -4593,7 +4552,7 @@@
    * Foreign dirty flushing
    *
    * There's an inherent mismatch between memcg and writeback.  The former
- - * trackes ownership per-page while the latter per-inode.  This was a
+ + * tracks ownership per-page while the latter per-inode.  This was a
    * deliberate design decision because honoring per-page ownership in the
    * writeback path is complicated, may lead to higher CPU and IO overheads
    * and deemed unnecessary given that write-sharing an inode across
@@@ -4562,9 -4608,9 +4567,9 @@@
    * triggering background writeback.  A will be slowed down without a way to
    * make writeback of the dirty pages happen.
    *
- - * Conditions like the above can lead to a cgroup getting repatedly and
+ + * Conditions like the above can lead to a cgroup getting repeatedly and
    * severely throttled after making some progress after each
- - * dirty_expire_interval while the underyling IO device is almost
+ + * dirty_expire_interval while the underlying IO device is almost
    * completely idle.
    *
    * Solving this problem completely requires matching the ownership tracking
@@@ -5164,20 -5210,19 +5169,20 @@@ static void __mem_cgroup_free(struct me
         for_each_node(node)
                 free_mem_cgroup_per_node_info(memcg, node);
         free_percpu(memcg->vmstats_percpu);
- -      free_percpu(memcg->vmstats_local);
         kfree(memcg);
   }
   
   static void mem_cgroup_free(struct mem_cgroup *memcg)
   {
+ +      int cpu;
+ +
         memcg_wb_domain_exit(memcg);
         /*
- -       * Flush percpu vmstats and vmevents to guarantee the value correctness
- -       * on parent's and all ancestor levels.
+ +       * Flush percpu lruvec stats to guarantee the value
+ +       * correctness on parent's and all ancestor levels.
          */
- -      memcg_flush_percpu_vmstats(memcg);
- -      memcg_flush_percpu_vmevents(memcg);
+ +      for_each_online_cpu(cpu)
+ +              memcg_flush_lruvec_page_state(memcg, cpu);
         __mem_cgroup_free(memcg);
   }
   
@@@ -5204,6 -5249,11 +5209,6 @@@ static struct mem_cgroup *mem_cgroup_al
                 goto fail;
         }
   
- -      memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
- -                                              GFP_KERNEL_ACCOUNT);
- -      if (!memcg->vmstats_local)
- -              goto fail;
- -
         memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
                                                  GFP_KERNEL_ACCOUNT);
         if (!memcg->vmstats_percpu)
@@@ -5301,11 -5351,11 +5306,11 @@@ static int mem_cgroup_css_online(struc
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   
         /*
- -       * A memcg must be visible for memcg_expand_shrinker_maps()
+ +       * A memcg must be visible for expand_shrinker_info()
          * by the time the maps are allocated. So, we allocate maps
          * here, when for_each_mem_cgroup() can't skip it.
          */
- -      if (memcg_alloc_shrinker_maps(memcg)) {
+ +      if (alloc_shrinker_info(memcg)) {
                 mem_cgroup_id_remove(memcg);
                 return -ENOMEM;
         }
@@@ -5337,7 -5387,6 +5342,7 @@@ static void mem_cgroup_css_offline(stru
         page_counter_set_low(&memcg->memory, 0);
   
         memcg_offline_kmem(memcg);
+ +      reparent_shrinker_deferred(memcg);
         wb_memcg_offline(memcg);
   
         drain_all_stock(memcg);
@@@ -5370,7 -5419,7 +5375,7 @@@ static void mem_cgroup_css_free(struct 
         vmpressure_cleanup(&memcg->vmpressure);
         cancel_work_sync(&memcg->high_work);
         mem_cgroup_remove_from_trees(memcg);
- -      memcg_free_shrinker_maps(memcg);
+ +      free_shrinker_info(memcg);
         memcg_free_kmem(memcg);
         mem_cgroup_free(memcg);
   }
@@@ -5404,62 -5453,6 +5409,62 @@@ static void mem_cgroup_css_reset(struc
         memcg_wb_domain_size_changed(memcg);
   }
   
+ +static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+ +{
+ +      struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ +      struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ +      struct memcg_vmstats_percpu *statc;
+ +      long delta, v;
+ +      int i;
+ +
+ +      statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
+ +
+ +      for (i = 0; i < MEMCG_NR_STAT; i++) {
+ +              /*
+ +               * Collect the aggregated propagation counts of groups
+ +               * below us. We're in a per-cpu loop here and this is
+ +               * a global counter, so the first cycle will get them.
+ +               */
+ +              delta = memcg->vmstats.state_pending[i];
+ +              if (delta)
+ +                      memcg->vmstats.state_pending[i] = 0;
+ +
+ +              /* Add CPU changes on this level since the last flush */
+ +              v = READ_ONCE(statc->state[i]);
+ +              if (v != statc->state_prev[i]) {
+ +                      delta += v - statc->state_prev[i];
+ +                      statc->state_prev[i] = v;
+ +              }
+ +
+ +              if (!delta)
+ +                      continue;
+ +
+ +              /* Aggregate counts on this level and propagate upwards */
+ +              memcg->vmstats.state[i] += delta;
+ +              if (parent)
+ +                      parent->vmstats.state_pending[i] += delta;
+ +      }
+ +
+ +      for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+ +              delta = memcg->vmstats.events_pending[i];
+ +              if (delta)
+ +                      memcg->vmstats.events_pending[i] = 0;
+ +
+ +              v = READ_ONCE(statc->events[i]);
+ +              if (v != statc->events_prev[i]) {
+ +                      delta += v - statc->events_prev[i];
+ +                      statc->events_prev[i] = v;
+ +              }
+ +
+ +              if (!delta)
+ +                      continue;
+ +
+ +              memcg->vmstats.events[i] += delta;
+ +              if (parent)
+ +                      parent->vmstats.events_pending[i] += delta;
+ +      }
+ +}
+ +
   #ifdef CONFIG_MMU
   /* Handlers for move charge at task migration. */
   static int mem_cgroup_do_precharge(unsigned long count)
@@@ -5957,7 -5950,7 +5962,7 @@@ static int mem_cgroup_can_attach(struc
                 return 0;
   
         /*
- -       * We are now commited to this value whatever it is. Changes in this
+ +       * We are now committed to this value whatever it is. Changes in this
          * tunable will only affect upcoming migrations, not the current one.
          * So we need to save it, and keep it going.
          */
@@@ -6513,7 -6506,6 +6518,7 @@@ struct cgroup_subsys memory_cgrp_subsy
         .css_released = mem_cgroup_css_released,
         .css_free = mem_cgroup_css_free,
         .css_reset = mem_cgroup_css_reset,
+ +      .css_rstat_flush = mem_cgroup_css_rstat_flush,
         .can_attach = mem_cgroup_can_attach,
         .cancel_attach = mem_cgroup_cancel_attach,
         .post_attach = mem_cgroup_move_task,
@@@ -6696,27 -6688,6 +6701,27 @@@ void mem_cgroup_calculate_protection(st
                         atomic_long_read(&parent->memory.children_low_usage)));
   }
   
+ +static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
+ +                             gfp_t gfp)
+ +{
+ +      unsigned int nr_pages = thp_nr_pages(page);
+ +      int ret;
+ +
+ +      ret = try_charge(memcg, gfp, nr_pages);
+ +      if (ret)
+ +              goto out;
+ +
+ +      css_get(&memcg->css);
+ +      commit_charge(page, memcg);
+ +
+ +      local_irq_disable();
+ +      mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ +      memcg_check_events(memcg, page);
+ +      local_irq_enable();
+ +out:
+ +      return ret;
+ +}
+ +
   /**
    * mem_cgroup_charge - charge a newly allocated page to a cgroup
    * @page: page to charge
@@@ -6724,74 -6695,57 +6729,74 @@@
    * @gfp_mask: reclaim mode
    *
    * Try to charge @page to the memcg that @mm belongs to, reclaiming
- - * pages according to @gfp_mask if necessary.
+ + * pages according to @gfp_mask if necessary. If @mm is NULL, try to
+ + * charge to the active memcg.
+ + *
+ + * Do not use this for pages allocated for swapin.
    *
    * Returns 0 on success. Otherwise, an error code is returned.
    */
   int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
   {
- -      unsigned int nr_pages = thp_nr_pages(page);
- -      struct mem_cgroup *memcg = NULL;
- -      int ret = 0;
+ +      struct mem_cgroup *memcg;
+ +      int ret;
   
         if (mem_cgroup_disabled())
- -              goto out;
+ +              return 0;
   
- -      if (PageSwapCache(page)) {
- -              swp_entry_t ent = { .val = page_private(page), };
- -              unsigned short id;
+ +      memcg = get_mem_cgroup_from_mm(mm);
+ +      ret = __mem_cgroup_charge(page, memcg, gfp_mask);
+ +      css_put(&memcg->css);
   
- -              /*
- -               * Every swap fault against a single page tries to charge the
- -               * page, bail as early as possible.  shmem_unuse() encounters
- -               * already charged pages, too.  page and memcg binding is
- -               * protected by the page lock, which serializes swap cache
- -               * removal, which in turn serializes uncharging.
- -               */
- -              VM_BUG_ON_PAGE(!PageLocked(page), page);
- -              if (page_memcg(compound_head(page)))
- -                      goto out;
+ +      return ret;
+ +}
   
- -              id = lookup_swap_cgroup_id(ent);
- -              rcu_read_lock();
- -              memcg = mem_cgroup_from_id(id);
- -              if (memcg && !css_tryget_online(&memcg->css))
- -                      memcg = NULL;
- -              rcu_read_unlock();
- -      }
+ +/**
+ + * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
+ + * @page: page to charge
+ + * @mm: mm context of the victim
+ + * @gfp: reclaim mode
+ + * @entry: swap entry for which the page is allocated
+ + *
+ + * This function charges a page allocated for swapin. Please call this before
+ + * adding the page to the swapcache.
+ + *
+ + * Returns 0 on success. Otherwise, an error code is returned.
+ + */
+ +int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+ +                                gfp_t gfp, swp_entry_t entry)
+ +{
+ +      struct mem_cgroup *memcg;
+ +      unsigned short id;
+ +      int ret;
   
- -      if (!memcg)
- -              memcg = get_mem_cgroup_from_mm(mm);
+ +      if (mem_cgroup_disabled())
+ +              return 0;
   
- -      ret = try_charge(memcg, gfp_mask, nr_pages);
- -      if (ret)
- -              goto out_put;
+ +      id = lookup_swap_cgroup_id(entry);
+ +      rcu_read_lock();
+ +      memcg = mem_cgroup_from_id(id);
+ +      if (!memcg || !css_tryget_online(&memcg->css))
+ +              memcg = get_mem_cgroup_from_mm(mm);
+ +      rcu_read_unlock();
   
- -      css_get(&memcg->css);
- -      commit_charge(page, memcg);
+ +      ret = __mem_cgroup_charge(page, memcg, gfp);
   
- -      local_irq_disable();
- -      mem_cgroup_charge_statistics(memcg, page, nr_pages);
- -      memcg_check_events(memcg, page);
- -      local_irq_enable();
+ +      css_put(&memcg->css);
+ +      return ret;
+ +}
   
+ +/*
+ + * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
+ + * @entry: swap entry for which the page is charged
+ + *
+ + * Call this function after successfully adding the charged page to swapcache.
+ + *
+ + * Note: This function assumes the page for which swap slot is being uncharged
+ + * is an order-0 page.
+ + */
+ +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+ +{
         /*
          * Cgroup1's unified memory+swap counter has been charged with the
          * new swapcache page, finish the transfer by uncharging the swap
@@@ -6804,19 -6758,25 +6809,19 @@@
          * correspond 1:1 to page and swap slot lifetimes: we charge the
          * page to memory here, and uncharge swap when the slot is freed.
          */
- -      if (do_memsw_account() && PageSwapCache(page)) {
- -              swp_entry_t entry = { .val = page_private(page) };
+ +      if (!mem_cgroup_disabled() && do_memsw_account()) {
                 /*
                  * The swap entry might not get freed for a long time,
                  * let's not wait for it.  The page already received a
                  * memory+swap charge, drop the swap entry duplicate.
                  */
- -              mem_cgroup_uncharge_swap(entry, nr_pages);
+ +              mem_cgroup_uncharge_swap(entry, 1);
         }
- -
- -out_put:
- -      css_put(&memcg->css);
- -out:
- -      return ret;
   }
   
   struct uncharge_gather {
         struct mem_cgroup *memcg;
- -      unsigned long nr_pages;
+ +      unsigned long nr_memory;
         unsigned long pgpgout;
         unsigned long nr_kmem;
         struct page *dummy_page;
@@@ -6831,10 -6791,10 +6836,10 @@@ static void uncharge_batch(const struc
   {
         unsigned long flags;
   
- -      if (!mem_cgroup_is_root(ug->memcg)) {
- -              page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
+ +      if (ug->nr_memory) {
+ +              page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
                 if (do_memsw_account())
- -                      page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
+ +                      page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
                 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
                         page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
                 memcg_oom_recover(ug->memcg);
@@@ -6842,7 -6802,7 +6847,7 @@@
   
         local_irq_save(flags);
         __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- -      __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
+ +      __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
         memcg_check_events(ug->memcg, ug->dummy_page);
         local_irq_restore(flags);
   
@@@ -6853,61 -6813,40 +6858,61 @@@
   static void uncharge_page(struct page *page, struct uncharge_gather *ug)
   {
         unsigned long nr_pages;
+ +      struct mem_cgroup *memcg;
+ +      struct obj_cgroup *objcg;
+ +      bool use_objcg = PageMemcgKmem(page);
   
         VM_BUG_ON_PAGE(PageLRU(page), page);
   
- -      if (!page_memcg(page))
- -              return;
- -
         /*
          * Nobody should be changing or seriously looking at
- -       * page_memcg(page) at this point, we have fully
+ +       * page memcg or objcg at this point, we have fully
          * exclusive access to the page.
          */
+ +      if (use_objcg) {
+ +              objcg = __page_objcg(page);
+ +              /*
+ +               * This get matches the put at the end of the function and
+ +               * kmem pages do not hold memcg references anymore.
+ +               */
+ +              memcg = get_mem_cgroup_from_objcg(objcg);
+ +      } else {
+ +              memcg = __page_memcg(page);
+ +      }
+ +
+ +      if (!memcg)
+ +              return;
   
- -      if (ug->memcg != page_memcg(page)) {
+ +      if (ug->memcg != memcg) {
                 if (ug->memcg) {
                         uncharge_batch(ug);
                         uncharge_gather_clear(ug);
                 }
- -              ug->memcg = page_memcg(page);
+ +              ug->memcg = memcg;
+ +              ug->dummy_page = page;
   
                 /* pairs with css_put in uncharge_batch */
- -              css_get(&ug->memcg->css);
+ +              css_get(&memcg->css);
         }
   
         nr_pages = compound_nr(page);
- -      ug->nr_pages += nr_pages;
   
- -      if (PageMemcgKmem(page))
+ +      if (use_objcg) {
+ +              ug->nr_memory += nr_pages;
                 ug->nr_kmem += nr_pages;
- -      else
+ +
+ +              page->memcg_data = 0;
+ +              obj_cgroup_put(objcg);
+ +      } else {
+ +              /* LRU pages aren't accounted at the root level */
+ +              if (!mem_cgroup_is_root(memcg))
+ +                      ug->nr_memory += nr_pages;
                 ug->pgpgout++;
   
- -      ug->dummy_page = page;
- -      page->memcg_data = 0;
- -      css_put(&ug->memcg->css);
+ +              page->memcg_data = 0;
+ +      }
+ +
+ +      css_put(&memcg->css);
   }
   
   /**
@@@ -6991,11 -6930,9 +6996,11 @@@ void mem_cgroup_migrate(struct page *ol
         /* Force-charge the new page. The old one will be freed soon */
         nr_pages = thp_nr_pages(newpage);
   
- -      page_counter_charge(&memcg->memory, nr_pages);
- -      if (do_memsw_account())
- -              page_counter_charge(&memcg->memsw, nr_pages);
+ +      if (!mem_cgroup_is_root(memcg)) {
+ +              page_counter_charge(&memcg->memory, nr_pages);
+ +              if (do_memsw_account())
+ +                      page_counter_charge(&memcg->memsw, nr_pages);
+ +      }
   
         css_get(&memcg->css);
         commit_charge(newpage, memcg);
diff --combined mm/percpu-internal.h

index ae26b118e246bf85eeb43ba78b9f29a1ea674fe9,b6dc2290408840daeca44eb60f03013d62f670d1..639662c20c821be42a50a66f0a2ed1df013c93b3
--- 1/mm/percpu-internal.h
--- 2/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@@ -5,25 -5,6 +5,6 @@@
   #include <linux/types.h>
   #include <linux/percpu.h>
   
- /*
-  * There are two chunk types: root and memcg-aware.
-  * Chunks of each type have separate slots list.
-  *
-  * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
-  * used to store memcg membership data of a percpu object.  Obj_cgroups are
-  * ref-counted pointers to a memory cgroup with an ability to switch dynamically
-  * to the parent memory cgroup.  This allows to reclaim a deleted memory cgroup
-  * without reclaiming of all outstanding objects, which hold a reference at it.
-  */
- enum pcpu_chunk_type {
-       PCPU_CHUNK_ROOT,
- #ifdef CONFIG_MEMCG_KMEM
-       PCPU_CHUNK_MEMCG,
- #endif
-       PCPU_NR_CHUNK_TYPES,
-       PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
- };
- 
   /*
    * pcpu_block_md is the metadata block struct.
    * Each chunk's bitmap is split into a number of full blocks.
@@@ -67,6 -48,8 +48,8 @@@ struct pcpu_chunk 
   
         void                    *data;          /* chunk data */
         bool                    immutable;      /* no [de]population allowed */
+       bool                    isolated;       /* isolated from active chunk
+                                                  slots */
         int                     start_offset;   /* the overlap with the previous
                                                    region to have a page aligned
                                                    base_addr */
@@@ -87,7 -70,9 +70,9 @@@ extern spinlock_t pcpu_lock
   
   extern struct list_head *pcpu_chunk_lists;
   extern int pcpu_nr_slots;
- extern int pcpu_nr_empty_pop_pages[];
+ extern int pcpu_sidelined_slot;
+ extern int pcpu_to_depopulate_slot;
+ extern int pcpu_nr_empty_pop_pages;
   
   extern struct pcpu_chunk *pcpu_first_chunk;
   extern struct pcpu_chunk *pcpu_reserved_chunk;
@@@ -128,37 -113,6 +113,6 @@@ static inline int pcpu_chunk_map_bits(s
         return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
   }
   
- #ifdef CONFIG_MEMCG_KMEM
- static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
- {
-       if (chunk->obj_cgroups)
-               return PCPU_CHUNK_MEMCG;
-       return PCPU_CHUNK_ROOT;
- }
- 
- static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
- {
-       return chunk_type == PCPU_CHUNK_MEMCG;
- }
- 
- #else
- static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
- {
-       return PCPU_CHUNK_ROOT;
- }
- 
- static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
- {
-       return false;
- }
- #endif
- 
- static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
- {
-       return &pcpu_chunk_lists[pcpu_nr_slots *
-                                pcpu_is_memcg_chunk(chunk_type)];
- }
- 
   #ifdef CONFIG_PERCPU_STATS
   
   #include <linux/spinlock.h>
@@@ -170,7 -124,7 +124,7 @@@ struct percpu_stats 
         u64 nr_max_alloc;       /* max # of live allocations */
         u32 nr_chunks;          /* current # of live chunks */
         u32 nr_max_chunks;      /* max # of live chunks */
- -      size_t min_alloc_size;  /* min allocaiton size */
+ +      size_t min_alloc_size;  /* min allocation size */
         size_t max_alloc_size;  /* max allocation size */
   };
   
diff --combined mm/percpu-vm.c

index 8d3844bc0c7cf8a5e0a7b9b49573ee50576d58a4,057546f5555e6dd005cbdc3880d8f5f776493621..ee5d89fcd66f2f9f6ab305806b64588f2456c8e0
--- 1/mm/percpu-vm.c
--- 2/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@@ -8,7 -8,6 +8,7 @@@
    * Chunks are mapped into vmalloc areas and populated page by page.
    * This is the default chunk allocator.
    */
+ +#include "internal.h"
   
   static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
                                     unsigned int cpu, int page_idx)
@@@ -134,7 -133,7 +134,7 @@@ static void pcpu_pre_unmap_flush(struc
   
   static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
   {
- -      unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+ +      vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT));
   }
   
   /**
@@@ -193,8 -192,8 +193,8 @@@ static void pcpu_post_unmap_tlb_flush(s
   static int __pcpu_map_pages(unsigned long addr, struct page **pages,
                             int nr_pages)
   {
- -      return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
- -                                      PAGE_KERNEL, pages);
+ +      return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
+ +                                      PAGE_KERNEL, pages, PAGE_SHIFT);
   }
   
   /**
@@@ -329,13 -328,12 +329,12 @@@ static void pcpu_depopulate_chunk(struc
         pcpu_free_pages(chunk, pages, page_start, page_end);
   }
   
- static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
-                                           gfp_t gfp)
+ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
   {
         struct pcpu_chunk *chunk;
         struct vm_struct **vms;
   
-       chunk = pcpu_alloc_chunk(type, gfp);
+       chunk = pcpu_alloc_chunk(gfp);
         if (!chunk)
                 return NULL;
   
@@@ -378,3 -376,33 +377,33 @@@ static int __init pcpu_verify_alloc_inf
         /* no extra restriction */
         return 0;
   }
+ 
+ /**
+  * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim
+  * @chunk: chunk of interest
+  *
+  * This is the entry point for percpu reclaim.  If a chunk qualifies, it is then
+  * isolated and managed in separate lists at the back of pcpu_slot: sidelined
+  * and to_depopulate respectively.  The to_depopulate list holds chunks slated
+  * for depopulation.  They no longer contribute to pcpu_nr_empty_pop_pages once
+  * they are on this list.  Once depopulated, they are moved onto the sidelined
+  * list which enables them to be pulled back in for allocation if no other chunk
+  * can satisfy the allocation.
+  */
+ static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+ {
+       /* do not reclaim either the first chunk or reserved chunk */
+       if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk)
+               return false;
+ 
+       /*
+        * If it is isolated, it may be on the sidelined list so move it back to
+        * the to_depopulate list.  If at least 1/4 of the chunk's pages are
+        * empty populated pages AND there is no system-wide shortage of empty
+        * pages aside from this chunk, move it to the to_depopulate list.
+        */
+       return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
+               (pcpu_nr_empty_pop_pages >
+                (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) &&
+                chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
+ }
diff --combined mm/percpu.c

index f99e9306b9394a5f74def05b814e675cca6f9efd,f4c83217f2175dfb5cf76a466ee11243fe837850..b4cebeca4c0c1616ef44d0f95526bc09417fb06e
--- 1/mm/percpu.c
--- 2/mm/percpu.c
+++ b/mm/percpu.c
@@@ -99,7 -99,10 +99,10 @@@
   
   #include "percpu-internal.h"
   
- /* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
+ /*
+  * The slots are sorted by the size of the biggest continuous free area.
+  * 1-31 bytes share the same slot.
+  */
   #define PCPU_SLOT_BASE_SHIFT          5
   /* chunks in slots below this are subject to being sidelined on failed alloc */
   #define PCPU_SLOT_FAIL_THRESHOLD      3
@@@ -132,6 -135,9 +135,9 @@@ static int pcpu_unit_size __ro_after_in
   static int pcpu_nr_units __ro_after_init;
   static int pcpu_atom_size __ro_after_init;
   int pcpu_nr_slots __ro_after_init;
+ static int pcpu_free_slot __ro_after_init;
+ int pcpu_sidelined_slot __ro_after_init;
+ int pcpu_to_depopulate_slot __ro_after_init;
   static size_t pcpu_chunk_struct_size __ro_after_init;
   
   /* cpus with the lowest and highest unit addresses */
@@@ -173,10 -179,10 +179,10 @@@ struct list_head *pcpu_chunk_lists __ro
   static LIST_HEAD(pcpu_map_extend_chunks);
   
   /*
-  * The number of empty populated pages by chunk type, protected by pcpu_lock.
+  * The number of empty populated pages, protected by pcpu_lock.
    * The reserved chunk doesn't contribute to the count.
    */
- int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES];
+ int pcpu_nr_empty_pop_pages;
   
   /*
    * The number of populated pages in use by the allocator, protected by
@@@ -234,7 -240,7 +240,7 @@@ static int __pcpu_size_to_slot(int size
   static int pcpu_size_to_slot(int size)
   {
         if (size == pcpu_unit_size)
-               return pcpu_nr_slots - 1;
+               return pcpu_free_slot;
         return __pcpu_size_to_slot(size);
   }
   
@@@ -303,6 -309,25 +309,25 @@@ static unsigned long pcpu_block_off_to_
         return index * PCPU_BITMAP_BLOCK_BITS + off;
   }
   
+ /**
+  * pcpu_check_block_hint - check against the contig hint
+  * @block: block of interest
+  * @bits: size of allocation
+  * @align: alignment of area (max PAGE_SIZE)
+  *
+  * Check to see if the allocation can fit in the block's contig hint.
+  * Note, a chunk uses the same hints as a block so this can also check against
+  * the chunk's contig hint.
+  */
+ static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
+                                 size_t align)
+ {
+       int bit_off = ALIGN(block->contig_hint_start, align) -
+               block->contig_hint_start;
+ 
+       return bit_off + bits <= block->contig_hint;
+ }
+ 
   /*
    * pcpu_next_hint - determine which hint to use
    * @block: block of interest
@@@ -507,13 -532,10 +532,10 @@@ static void __pcpu_chunk_move(struct pc
                               bool move_front)
   {
         if (chunk != pcpu_reserved_chunk) {
-               struct list_head *pcpu_slot;
- 
-               pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
                 if (move_front)
-                       list_move(&chunk->list, &pcpu_slot[slot]);
+                       list_move(&chunk->list, &pcpu_chunk_lists[slot]);
                 else
-                       list_move_tail(&chunk->list, &pcpu_slot[slot]);
+                       list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
         }
   }
   
@@@ -539,10 -561,36 +561,36 @@@ static void pcpu_chunk_relocate(struct 
   {
         int nslot = pcpu_chunk_slot(chunk);
   
+       /* leave isolated chunks in-place */
+       if (chunk->isolated)
+               return;
+ 
         if (oslot != nslot)
                 __pcpu_chunk_move(chunk, nslot, oslot < nslot);
   }
   
+ static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
+ {
+       lockdep_assert_held(&pcpu_lock);
+ 
+       if (!chunk->isolated) {
+               chunk->isolated = true;
+               pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
+       }
+       list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
+ }
+ 
+ static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
+ {
+       lockdep_assert_held(&pcpu_lock);
+ 
+       if (chunk->isolated) {
+               chunk->isolated = false;
+               pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
+               pcpu_chunk_relocate(chunk, -1);
+       }
+ }
+ 
   /*
    * pcpu_update_empty_pages - update empty page counters
    * @chunk: chunk of interest
@@@ -555,8 -603,8 +603,8 @@@
   static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
   {
         chunk->nr_empty_pop_pages += nr;
-       if (chunk != pcpu_reserved_chunk)
-               pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
+       if (chunk != pcpu_reserved_chunk && !chunk->isolated)
+               pcpu_nr_empty_pop_pages += nr;
   }
   
   /*
@@@ -1063,14 -1111,11 +1111,11 @@@ static int pcpu_find_block_fit(struct p
         int bit_off, bits, next_off;
   
         /*
-        * Check to see if the allocation can fit in the chunk's contig hint.
-        * This is an optimization to prevent scanning by assuming if it
-        * cannot fit in the global hint, there is memory pressure and creating
-        * a new chunk would happen soon.
+        * This is an optimization to prevent scanning by assuming if the
+        * allocation cannot fit in the global hint, there is memory pressure
+        * and creating a new chunk would happen soon.
          */
-       bit_off = ALIGN(chunk_md->contig_hint_start, align) -
-                 chunk_md->contig_hint_start;
-       if (bit_off + alloc_bits > chunk_md->contig_hint)
+       if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
                 return -1;
   
         bit_off = pcpu_next_hint(chunk_md, alloc_bits);
@@@ -1352,7 -1397,7 +1397,7 @@@ static struct pcpu_chunk * __init pcpu_
                       alloc_size);
   
   #ifdef CONFIG_MEMCG_KMEM
-       /* first chunk isn't memcg-aware */
+       /* first chunk is free to use */
         chunk->obj_cgroups = NULL;
   #endif
         pcpu_init_md_blocks(chunk);
@@@ -1394,7 -1439,7 +1439,7 @@@
         return chunk;
   }
   
- static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
+ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
   {
         struct pcpu_chunk *chunk;
         int region_bits;
@@@ -1423,7 -1468,7 +1468,7 @@@
                 goto md_blocks_fail;
   
   #ifdef CONFIG_MEMCG_KMEM
-       if (pcpu_is_memcg_chunk(type)) {
+       if (!mem_cgroup_kmem_disabled()) {
                 chunk->obj_cgroups =
                         pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
                                         sizeof(struct obj_cgroup *), gfp);
@@@ -1536,8 -1581,7 +1581,7 @@@ static int pcpu_populate_chunk(struct p
                                int page_start, int page_end, gfp_t gfp);
   static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                                   int page_start, int page_end);
- static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
-                                           gfp_t gfp);
+ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
   static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
   static struct page *pcpu_addr_to_page(void *addr);
   static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@@ -1580,25 -1624,25 +1624,25 @@@ static struct pcpu_chunk *pcpu_chunk_ad
   }
   
   #ifdef CONFIG_MEMCG_KMEM
- static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
-                                                    struct obj_cgroup **objcgp)
+ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
+                                     struct obj_cgroup **objcgp)
   {
         struct obj_cgroup *objcg;
   
         if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
-               return PCPU_CHUNK_ROOT;
+               return true;
   
         objcg = get_obj_cgroup_from_current();
         if (!objcg)
-               return PCPU_CHUNK_ROOT;
+               return true;
   
         if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
                 obj_cgroup_put(objcg);
-               return PCPU_FAIL_ALLOC;
+               return false;
         }
   
         *objcgp = objcg;
-       return PCPU_CHUNK_MEMCG;
+       return true;
   }
   
   static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
@@@ -1608,7 -1652,7 +1652,7 @@@
         if (!objcg)
                 return;
   
-       if (chunk) {
+       if (likely(chunk && chunk->obj_cgroups)) {
                 chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
   
                 rcu_read_lock();
@@@ -1625,10 -1669,12 +1669,12 @@@ static void pcpu_memcg_free_hook(struc
   {
         struct obj_cgroup *objcg;
   
-       if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
+       if (unlikely(!chunk->obj_cgroups))
                 return;
   
         objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
+       if (!objcg)
+               return;
         chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
   
         obj_cgroup_uncharge(objcg, size * num_possible_cpus());
@@@ -1642,10 -1688,10 +1688,10 @@@
   }
   
   #else /* CONFIG_MEMCG_KMEM */
- static enum pcpu_chunk_type
+ static bool
   pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
   {
-       return PCPU_CHUNK_ROOT;
+       return true;
   }
   
   static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
@@@ -1680,8 -1726,6 +1726,6 @@@ static void __percpu *pcpu_alloc(size_
         gfp_t pcpu_gfp;
         bool is_atomic;
         bool do_warn;
-       enum pcpu_chunk_type type;
-       struct list_head *pcpu_slot;
         struct obj_cgroup *objcg = NULL;
         static int warn_limit = 10;
         struct pcpu_chunk *chunk, *next;
@@@ -1717,10 -1761,8 +1761,8 @@@
                 return NULL;
         }
   
-       type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
-       if (unlikely(type == PCPU_FAIL_ALLOC))
+       if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
                 return NULL;
-       pcpu_slot = pcpu_chunk_list(type);
   
         if (!is_atomic) {
                 /*
@@@ -1758,8 -1800,9 +1800,9 @@@
   
   restart:
         /* search through normal chunks */
-       for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
-               list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
+       for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
+               list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
+                                        list) {
                         off = pcpu_find_block_fit(chunk, bits, bit_align,
                                                   is_atomic);
                         if (off < 0) {
@@@ -1769,9 -1812,10 +1812,10 @@@
                         }
   
                         off = pcpu_alloc_area(chunk, bits, bit_align, off);
-                       if (off >= 0)
+                       if (off >= 0) {
+                               pcpu_reintegrate_chunk(chunk);
                                 goto area_found;
- 
+                       }
                 }
         }
   
@@@ -1787,8 -1831,8 +1831,8 @@@
                 goto fail;
         }
   
-       if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
-               chunk = pcpu_create_chunk(type, pcpu_gfp);
+       if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
+               chunk = pcpu_create_chunk(pcpu_gfp);
                 if (!chunk) {
                         err = "failed to allocate new chunk";
                         goto fail;
@@@ -1832,7 -1876,7 +1876,7 @@@ area_found
                 mutex_unlock(&pcpu_alloc_mutex);
         }
   
-       if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW)
+       if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
                 pcpu_schedule_balance_work();
   
         /* clear the areas and return address relative to base address */
@@@ -1862,7 -1906,7 +1906,7 @@@ fail
                         pr_info("limit reached, disable warning\n");
         }
         if (is_atomic) {
- -              /* see the flag handling in pcpu_blance_workfn() */
+ +              /* see the flag handling in pcpu_balance_workfn() */
                 pcpu_atomic_alloc_failed = true;
                 pcpu_schedule_balance_work();
         } else {
@@@ -1930,33 -1974,28 +1974,28 @@@ void __percpu *__alloc_reserved_percpu(
   }
   
   /**
-  * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
-  * @type: chunk type
+  * pcpu_balance_free - manage the amount of free chunks
+  * @empty_only: free chunks only if there are no populated pages
    *
-  * Reclaim all fully free chunks except for the first one.  This is also
-  * responsible for maintaining the pool of empty populated pages.  However,
-  * it is possible that this is called when physical memory is scarce causing
-  * OOM killer to be triggered.  We should avoid doing so until an actual
-  * allocation causes the failure as it is possible that requests can be
-  * serviced from already backed regions.
+  * If empty_only is %false, reclaim all fully free chunks regardless of the
+  * number of populated pages.  Otherwise, only reclaim chunks that have no
+  * populated pages.
+  *
+  * CONTEXT:
+  * pcpu_lock (can be dropped temporarily)
    */
- static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
+ static void pcpu_balance_free(bool empty_only)
   {
-       /* gfp flags passed to underlying allocators */
-       const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
         LIST_HEAD(to_free);
-       struct list_head *pcpu_slot = pcpu_chunk_list(type);
-       struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
+       struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
         struct pcpu_chunk *chunk, *next;
-       int slot, nr_to_pop, ret;
+ 
+       lockdep_assert_held(&pcpu_lock);
   
         /*
          * There's no reason to keep around multiple unused chunks and VM
          * areas can be scarce.  Destroy all free chunks except for one.
          */
-       mutex_lock(&pcpu_alloc_mutex);
-       spin_lock_irq(&pcpu_lock);
- 
         list_for_each_entry_safe(chunk, next, free_head, list) {
                 WARN_ON(chunk->immutable);
   
@@@ -1964,11 -2003,14 +2003,14 @@@
                 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
                         continue;
   
-               list_move(&chunk->list, &to_free);
+               if (!empty_only || chunk->nr_empty_pop_pages == 0)
+                       list_move(&chunk->list, &to_free);
         }
   
-       spin_unlock_irq(&pcpu_lock);
+       if (list_empty(&to_free))
+               return;
   
+       spin_unlock_irq(&pcpu_lock);
         list_for_each_entry_safe(chunk, next, &to_free, list) {
                 unsigned int rs, re;
   
@@@ -1982,6 -2024,29 +2024,29 @@@
                 pcpu_destroy_chunk(chunk);
                 cond_resched();
         }
+       spin_lock_irq(&pcpu_lock);
+ }
+ 
+ /**
+  * pcpu_balance_populated - manage the amount of populated pages
+  *
+  * Maintain a certain amount of populated pages to satisfy atomic allocations.
+  * It is possible that this is called when physical memory is scarce causing
+  * OOM killer to be triggered.  We should avoid doing so until an actual
+  * allocation causes the failure as it is possible that requests can be
+  * serviced from already backed regions.
+  *
+  * CONTEXT:
+  * pcpu_lock (can be dropped temporarily)
+  */
+ static void pcpu_balance_populated(void)
+ {
+       /* gfp flags passed to underlying allocators */
+       const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+       struct pcpu_chunk *chunk;
+       int slot, nr_to_pop, ret;
+ 
+       lockdep_assert_held(&pcpu_lock);
   
         /*
          * Ensure there are certain number of free populated pages for
@@@ -2000,23 -2065,21 +2065,21 @@@ retry_pop
                 pcpu_atomic_alloc_failed = false;
         } else {
                 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
-                                 pcpu_nr_empty_pop_pages[type],
+                                 pcpu_nr_empty_pop_pages,
                                   0, PCPU_EMPTY_POP_PAGES_HIGH);
         }
   
-       for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+       for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
                 unsigned int nr_unpop = 0, rs, re;
   
                 if (!nr_to_pop)
                         break;
   
-               spin_lock_irq(&pcpu_lock);
-               list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+               list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
                         nr_unpop = chunk->nr_pages - chunk->nr_populated;
                         if (nr_unpop)
                                 break;
                 }
-               spin_unlock_irq(&pcpu_lock);
   
                 if (!nr_unpop)
                         continue;
@@@ -2026,12 -2089,13 +2089,13 @@@
                                              chunk->nr_pages) {
                         int nr = min_t(int, re - rs, nr_to_pop);
   
+                       spin_unlock_irq(&pcpu_lock);
                         ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
+                       cond_resched();
+                       spin_lock_irq(&pcpu_lock);
                         if (!ret) {
                                 nr_to_pop -= nr;
-                               spin_lock_irq(&pcpu_lock);
                                 pcpu_chunk_populated(chunk, rs, rs + nr);
-                               spin_unlock_irq(&pcpu_lock);
                         } else {
                                 nr_to_pop = 0;
                         }
@@@ -2043,30 -2107,133 +2107,133 @@@
   
         if (nr_to_pop) {
                 /* ran out of chunks to populate, create a new one and retry */
-               chunk = pcpu_create_chunk(type, gfp);
+               spin_unlock_irq(&pcpu_lock);
+               chunk = pcpu_create_chunk(gfp);
+               cond_resched();
+               spin_lock_irq(&pcpu_lock);
                 if (chunk) {
-                       spin_lock_irq(&pcpu_lock);
                         pcpu_chunk_relocate(chunk, -1);
-                       spin_unlock_irq(&pcpu_lock);
                         goto retry_pop;
                 }
         }
+ }
   
-       mutex_unlock(&pcpu_alloc_mutex);
+ /**
+  * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
+  *
+  * Scan over chunks in the depopulate list and try to release unused populated
+  * pages back to the system.  Depopulated chunks are sidelined to prevent
+  * repopulating these pages unless required.  Fully free chunks are reintegrated
+  * and freed accordingly (1 is kept around).  If we drop below the empty
+  * populated pages threshold, reintegrate the chunk if it has empty free pages.
+  * Each chunk is scanned in the reverse order to keep populated pages close to
+  * the beginning of the chunk.
+  *
+  * CONTEXT:
+  * pcpu_lock (can be dropped temporarily)
+  *
+  */
+ static void pcpu_reclaim_populated(void)
+ {
+       struct pcpu_chunk *chunk;
+       struct pcpu_block_md *block;
+       int i, end;
+ 
+       lockdep_assert_held(&pcpu_lock);
+ 
+ restart:
+       /*
+        * Once a chunk is isolated to the to_depopulate list, the chunk is no
+        * longer discoverable to allocations which may populate pages.  The only
+        * other accessor is the free path which only returns area back to the
+        * allocator not touching the populated bitmap.
+        */
+       while (!list_empty(&pcpu_chunk_lists[pcpu_to_depopulate_slot])) {
+               chunk = list_first_entry(&pcpu_chunk_lists[pcpu_to_depopulate_slot],
+                                        struct pcpu_chunk, list);
+               WARN_ON(chunk->immutable);
+ 
+               /*
+                * Scan chunk's pages in the reverse order to keep populated
+                * pages close to the beginning of the chunk.
+                */
+               for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
+                       /* no more work to do */
+                       if (chunk->nr_empty_pop_pages == 0)
+                               break;
+ 
+                       /* reintegrate chunk to prevent atomic alloc failures */
+                       if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
+                               pcpu_reintegrate_chunk(chunk);
+                               goto restart;
+                       }
+ 
+                       /*
+                        * If the page is empty and populated, start or
+                        * extend the (i, end) range.  If i == 0, decrease
+                        * i and perform the depopulation to cover the last
+                        * (first) page in the chunk.
+                        */
+                       block = chunk->md_blocks + i;
+                       if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
+                           test_bit(i, chunk->populated)) {
+                               if (end == -1)
+                                       end = i;
+                               if (i > 0)
+                                       continue;
+                               i--;
+                       }
+ 
+                       /* depopulate if there is an active range */
+                       if (end == -1)
+                               continue;
+ 
+                       spin_unlock_irq(&pcpu_lock);
+                       pcpu_depopulate_chunk(chunk, i + 1, end + 1);
+                       cond_resched();
+                       spin_lock_irq(&pcpu_lock);
+ 
+                       pcpu_chunk_depopulated(chunk, i + 1, end + 1);
+ 
+                       /* reset the range and continue */
+                       end = -1;
+               }
+ 
+               if (chunk->free_bytes == pcpu_unit_size)
+                       pcpu_reintegrate_chunk(chunk);
+               else
+                       list_move(&chunk->list,
+                                 &pcpu_chunk_lists[pcpu_sidelined_slot]);
+       }
   }
   
   /**
    * pcpu_balance_workfn - manage the amount of free chunks and populated pages
    * @work: unused
    *
-  * Call __pcpu_balance_workfn() for each chunk type.
+  * Manage the number of fully free chunks and the number of populated pages.
+  * An important thing to consider is when pages are freed and how they
+  * contribute to the global counts.
    */
   static void pcpu_balance_workfn(struct work_struct *work)
   {
-       enum pcpu_chunk_type type;
+       /*
+        * pcpu_balance_free() is called twice because the first time we may
+        * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
+        * to grow other chunks.  This then gives pcpu_reclaim_populated() time
+        * to move fully free chunks to the active list to be freed if
+        * appropriate.
+        */
+       mutex_lock(&pcpu_alloc_mutex);
+       spin_lock_irq(&pcpu_lock);
+ 
+       pcpu_balance_free(false);
+       pcpu_reclaim_populated();
+       pcpu_balance_populated();
+       pcpu_balance_free(true);
   
-       for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
-               __pcpu_balance_workfn(type);
+       spin_unlock_irq(&pcpu_lock);
+       mutex_unlock(&pcpu_alloc_mutex);
   }
   
   /**
@@@ -2085,7 -2252,6 +2252,6 @@@ void free_percpu(void __percpu *ptr
         unsigned long flags;
         int size, off;
         bool need_balance = false;
-       struct list_head *pcpu_slot;
   
         if (!ptr)
                 return;
@@@ -2101,19 -2267,24 +2267,24 @@@
   
         size = pcpu_free_area(chunk, off);
   
-       pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
- 
         pcpu_memcg_free_hook(chunk, off, size);
   
-       /* if there are more than one fully free chunks, wake up grim reaper */
-       if (chunk->free_bytes == pcpu_unit_size) {
+       /*
+        * If there is more than one fully free chunk, wake up the grim reaper.
+        * If the chunk is isolated, it may be in the process of being
+        * reclaimed.  Let reclaim manage cleaning up of that chunk.
+        */
+       if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
                 struct pcpu_chunk *pos;
   
-               list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
+               list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list)
                         if (pos != chunk) {
                                 need_balance = true;
                                 break;
                         }
+       } else if (pcpu_should_reclaim_chunk(chunk)) {
+               pcpu_isolate_chunk(chunk);
+               need_balance = true;
         }
   
         trace_percpu_free_percpu(chunk->base_addr, off, ptr);
@@@ -2414,7 -2585,6 +2585,6 @@@ void __init pcpu_setup_first_chunk(cons
         int map_size;
         unsigned long tmp_addr;
         size_t alloc_size;
-       enum pcpu_chunk_type type;
   
   #define PCPU_SETUP_BUG_ON(cond)       do {                                    \
         if (unlikely(cond)) {                                           \
@@@ -2528,22 -2698,24 +2698,24 @@@
         pcpu_stats_save_ai(ai);
   
         /*
-        * Allocate chunk slots.  The additional last slot is for
-        * empty chunks.
+        * Allocate chunk slots.  The slots after the active slots are:
+        *   sidelined_slot - isolated, depopulated chunks
+        *   free_slot - fully free chunks
+        *   to_depopulate_slot - isolated, chunks to depopulate
          */
-       pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
+       pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
+       pcpu_free_slot = pcpu_sidelined_slot + 1;
+       pcpu_to_depopulate_slot = pcpu_free_slot + 1;
+       pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
         pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
-                                         sizeof(pcpu_chunk_lists[0]) *
-                                         PCPU_NR_CHUNK_TYPES,
+                                         sizeof(pcpu_chunk_lists[0]),
                                           SMP_CACHE_BYTES);
         if (!pcpu_chunk_lists)
                 panic("%s: Failed to allocate %zu bytes\n", __func__,
-                     pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
-                     PCPU_NR_CHUNK_TYPES);
+                     pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
   
-       for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
-               for (i = 0; i < pcpu_nr_slots; i++)
-                       INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
+       for (i = 0; i < pcpu_nr_slots; i++)
+               INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
   
         /*
          * The end of the static region needs to be aligned with the
@@@ -2580,7 -2752,7 +2752,7 @@@
   
         /* link the first chunk in */
         pcpu_first_chunk = chunk;
-       pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages;
+       pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
         pcpu_chunk_relocate(pcpu_first_chunk, -1);
   
         /* include all regions of the first chunk */
@@@ -2733,6 -2905,7 +2905,7 @@@ static struct pcpu_alloc_info * __init 
          * Related to atom_size, which could be much larger than the unit_size.
          */
         last_allocs = INT_MAX;
+       best_upa = 0;
         for (upa = max_upa; upa; upa--) {
                 int allocs = 0, wasted = 0;
   
@@@ -2759,6 -2932,7 +2932,7 @@@
                 last_allocs = allocs;
                 best_upa = upa;
         }
+       BUG_ON(!best_upa);
         upa = best_upa;
   
         /* allocate and fill alloc_info */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 2 Jul 2021 00:17:24 +0000 (17:17 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 2 Jul 2021 00:17:24 +0000 (17:17 -0700)
		1	2
include/linux/memcontrol.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memcontrol.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/percpu-internal.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/percpu-vm.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/percpu.c	patch \|	diff1 \|	diff2 \|	blob \| history