1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* memcontrol.c - Memory Controller
4 * Copyright IBM Corporation, 2007
5 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
7 * Copyright 2007 OpenVZ SWsoft Inc
8 * Author: Pavel Emelianov <xemul@openvz.org>
11 * Copyright (C) 2009 Nokia Corporation
12 * Author: Kirill A. Shutemov
14 * Kernel Memory Controller
15 * Copyright (C) 2012 Parallels Inc. and Google Inc.
16 * Authors: Glauber Costa and Suleiman Souhlal
19 * Charge lifetime sanitation
20 * Lockless page tracking & accounting
21 * Unified hierarchy configuration model
22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
25 #include <linux/page_counter.h>
26 #include <linux/memcontrol.h>
27 #include <linux/cgroup.h>
29 #include <linux/sched/mm.h>
30 #include <linux/shmem_fs.h>
31 #include <linux/hugetlb.h>
32 #include <linux/pagemap.h>
33 #include <linux/vm_event_item.h>
34 #include <linux/smp.h>
35 #include <linux/page-flags.h>
36 #include <linux/backing-dev.h>
37 #include <linux/bit_spinlock.h>
38 #include <linux/rcupdate.h>
39 #include <linux/limits.h>
40 #include <linux/export.h>
41 #include <linux/mutex.h>
42 #include <linux/rbtree.h>
43 #include <linux/slab.h>
44 #include <linux/swap.h>
45 #include <linux/swapops.h>
46 #include <linux/spinlock.h>
47 #include <linux/eventfd.h>
48 #include <linux/poll.h>
49 #include <linux/sort.h>
51 #include <linux/seq_file.h>
52 #include <linux/vmpressure.h>
53 #include <linux/mm_inline.h>
54 #include <linux/swap_cgroup.h>
55 #include <linux/cpu.h>
56 #include <linux/oom.h>
57 #include <linux/lockdep.h>
58 #include <linux/file.h>
59 #include <linux/tracehook.h>
60 #include <linux/seq_buf.h>
66 #include <linux/uaccess.h>
68 #include <trace/events/vmscan.h>
70 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
71 EXPORT_SYMBOL(memory_cgrp_subsys);
73 struct mem_cgroup *root_mem_cgroup __read_mostly;
75 #define MEM_CGROUP_RECLAIM_RETRIES 5
77 /* Socket memory accounting disabled? */
78 static bool cgroup_memory_nosocket;
80 /* Kernel memory accounting disabled? */
81 static bool cgroup_memory_nokmem;
83 /* Whether the swap controller is active */
84 #ifdef CONFIG_MEMCG_SWAP
85 int do_swap_account __read_mostly;
87 #define do_swap_account 0
90 /* Whether legacy memory+swap accounting is active */
91 static bool do_memsw_account(void)
93 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
96 static const char *const mem_cgroup_lru_names[] = {
104 #define THRESHOLDS_EVENTS_TARGET 128
105 #define SOFTLIMIT_EVENTS_TARGET 1024
106 #define NUMAINFO_EVENTS_TARGET 1024
109 * Cgroups above their limits are maintained in a RB-Tree, independent of
110 * their hierarchy representation
113 struct mem_cgroup_tree_per_node {
114 struct rb_root rb_root;
115 struct rb_node *rb_rightmost;
119 struct mem_cgroup_tree {
120 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
123 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
126 struct mem_cgroup_eventfd_list {
127 struct list_head list;
128 struct eventfd_ctx *eventfd;
132 * cgroup_event represents events which userspace wants to receive.
134 struct mem_cgroup_event {
136 * memcg which the event belongs to.
138 struct mem_cgroup *memcg;
140 * eventfd to signal userspace about the event.
142 struct eventfd_ctx *eventfd;
144 * Each of these is stored in a list by the cgroup.
146 struct list_head list;
148 * register_event() callback will be used to add new userspace
149 * waiter for changes related to this event. Use eventfd_signal()
150 * on eventfd to send notification to userspace.
152 int (*register_event)(struct mem_cgroup *memcg,
153 struct eventfd_ctx *eventfd, const char *args);
155 * unregister_event() callback will be called when userspace closes
156 * the eventfd or when the cgroup is removed. This callback must be set
157 * if you want to provide notification functionality.
159 void (*unregister_event)(struct mem_cgroup *memcg,
160 struct eventfd_ctx *eventfd);
162 * All fields below are needed to unregister the event when
163 * userspace closes the eventfd.
166 wait_queue_head_t *wqh;
167 wait_queue_entry_t wait;
168 struct work_struct remove;
171 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
172 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
174 /* Stuff for moving charges at task migration. */
176 * Types of charges to be moved.
178 #define MOVE_ANON 0x1U
179 #define MOVE_FILE 0x2U
180 #define MOVE_MASK (MOVE_ANON | MOVE_FILE)
182 /* "mc" and its members are protected by cgroup_mutex */
183 static struct move_charge_struct {
184 spinlock_t lock; /* for from, to */
185 struct mm_struct *mm;
186 struct mem_cgroup *from;
187 struct mem_cgroup *to;
189 unsigned long precharge;
190 unsigned long moved_charge;
191 unsigned long moved_swap;
192 struct task_struct *moving_task; /* a task moving charges */
193 wait_queue_head_t waitq; /* a waitq for other context */
195 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
196 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
200 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
201 * limit reclaim to prevent infinite loops, if they ever occur.
203 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
204 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
207 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
208 MEM_CGROUP_CHARGE_TYPE_ANON,
209 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
210 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
214 /* for encoding cft->private value on file */
223 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
224 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
225 #define MEMFILE_ATTR(val) ((val) & 0xffff)
226 /* Used for OOM notifier */
227 #define OOM_CONTROL (0)
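/*
 * Illustrative sketch, not part of the build: cft->private packs a
 * resource type into the upper 16 bits and an attribute into the lower
 * 16 bits, so for any type/attr pair that fits in 16 bits each:
 *
 *	unsigned long priv = MEMFILE_PRIVATE(type, attr);
 *
 *	MEMFILE_TYPE(priv) == type;
 *	MEMFILE_ATTR(priv) == attr;
 */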
230 * Iteration constructs for visiting all cgroups (under a tree). If
231 * loops are exited prematurely (break), mem_cgroup_iter_break() must
232 * be used for reference counting.
234 #define for_each_mem_cgroup_tree(iter, root) \
235 for (iter = mem_cgroup_iter(root, NULL, NULL); \
237 iter = mem_cgroup_iter(root, iter, NULL))
239 #define for_each_mem_cgroup(iter) \
240 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
242 iter = mem_cgroup_iter(NULL, iter, NULL))
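/*
 * Minimal usage sketch (illustrative only): when a tree walk is cut
 * short, the reference held by the iterator must be dropped via
 * mem_cgroup_iter_break(). stop_here() is a hypothetical predicate.
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (stop_here(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */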
244 static inline bool should_force_charge(void)
246 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
247 (current->flags & PF_EXITING);
250 /* Some nice accessors for the vmpressure. */
251 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
254 memcg = root_mem_cgroup;
255 return &memcg->vmpressure;
258 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
260 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
263 #ifdef CONFIG_MEMCG_KMEM
265 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
266 * The main reason for not using cgroup id for this:
267 * this works better in sparse environments, where we have a lot of memcgs,
268 * but only a few kmem-limited. Or also, if we have, for instance, 200
269 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
270 * 200 entry array for that.
272 * The current size of the caches array is stored in memcg_nr_cache_ids. It
273 * will double each time we have to increase it.
275 static DEFINE_IDA(memcg_cache_ida);
276 int memcg_nr_cache_ids;
278 /* Protects memcg_nr_cache_ids */
279 static DECLARE_RWSEM(memcg_cache_ids_sem);
281 void memcg_get_cache_ids(void)
283 down_read(&memcg_cache_ids_sem);
286 void memcg_put_cache_ids(void)
288 up_read(&memcg_cache_ids_sem);
292 * MIN_SIZE is different from 1, because we would like to avoid going through
293 * the alloc/free process all the time. In a small machine, 4 kmem-limited
294 * cgroups is a reasonable guess. In the future, it could be a parameter or
295 * tunable, but that is strictly not necessary.
297 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
298 * this constant directly from cgroup, but it is understandable that this is
299 * better kept as an internal representation in cgroup.c. In any case, the
300 * cgrp_id space is not getting any smaller, and we don't have to necessarily
301 * increase ours as well if it increases.
303 #define MEMCG_CACHES_MIN_SIZE 4
304 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
307 * A lot of the calls to the cache allocation functions are expected to be
308 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
309 * conditional to this static branch, we'll have to allow modules that do
310 * kmem_cache_alloc and the like to see this symbol as well
312 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
313 EXPORT_SYMBOL(memcg_kmem_enabled_key);
315 struct workqueue_struct *memcg_kmem_cache_wq;
317 static int memcg_shrinker_map_size;
318 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
320 static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
322 kvfree(container_of(head, struct memcg_shrinker_map, rcu));
325 static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
326 int size, int old_size)
328 struct memcg_shrinker_map *new, *old;
331 lockdep_assert_held(&memcg_shrinker_map_mutex);
334 old = rcu_dereference_protected(
335 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
336 /* Not yet online memcg */
340 new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
344 /* Set all old bits, clear all new bits */
345 memset(new->map, (int)0xff, old_size);
346 memset((void *)new->map + old_size, 0, size - old_size);
348 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
349 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
355 static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
357 struct mem_cgroup_per_node *pn;
358 struct memcg_shrinker_map *map;
361 if (mem_cgroup_is_root(memcg))
365 pn = mem_cgroup_nodeinfo(memcg, nid);
366 map = rcu_dereference_protected(pn->shrinker_map, true);
369 rcu_assign_pointer(pn->shrinker_map, NULL);
373 static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
375 struct memcg_shrinker_map *map;
376 int nid, size, ret = 0;
378 if (mem_cgroup_is_root(memcg))
381 mutex_lock(&memcg_shrinker_map_mutex);
382 size = memcg_shrinker_map_size;
384 map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
386 memcg_free_shrinker_maps(memcg);
390 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
392 mutex_unlock(&memcg_shrinker_map_mutex);
397 int memcg_expand_shrinker_maps(int new_id)
399 int size, old_size, ret = 0;
400 struct mem_cgroup *memcg;
402 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
403 old_size = memcg_shrinker_map_size;
404 if (size <= old_size)
407 mutex_lock(&memcg_shrinker_map_mutex);
408 if (!root_mem_cgroup)
411 for_each_mem_cgroup(memcg) {
412 if (mem_cgroup_is_root(memcg))
414 ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
420 memcg_shrinker_map_size = size;
421 mutex_unlock(&memcg_shrinker_map_mutex);
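/*
 * Worked example (illustrative, assuming BITS_PER_LONG == 64): for
 * new_id == 100 the map must cover 101 bits, so
 *
 *	size = DIV_ROUND_UP(101, 64) * sizeof(unsigned long) = 2 * 8 = 16
 *
 * bytes per node, and memcg_expand_one_shrinker_map() conservatively
 * sets every bit in the old portion of the enlarged map.
 */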
425 void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
427 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
428 struct memcg_shrinker_map *map;
431 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
432 /* Pairs with smp mb in shrink_slab() */
433 smp_mb__before_atomic();
434 set_bit(shrinker_id, map->map);
439 #else /* CONFIG_MEMCG_KMEM */
440 static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
444 static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
445 #endif /* CONFIG_MEMCG_KMEM */
448 * mem_cgroup_css_from_page - css of the memcg associated with a page
449 * @page: page of interest
451 * If memcg is bound to the default hierarchy, css of the memcg associated
452 * with @page is returned. The returned css remains associated with @page
453 * until it is released.
455 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
458 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
460 struct mem_cgroup *memcg;
462 memcg = page->mem_cgroup;
464 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
465 memcg = root_mem_cgroup;
471 * page_cgroup_ino - return inode number of the memcg a page is charged to
474 * Look up the closest online ancestor of the memory cgroup @page is charged to
475 * and return its inode number or 0 if @page is not charged to any cgroup. It
476 * is safe to call this function without holding a reference to @page.
478 * Note, this function is inherently racy, because there is nothing to prevent
479 * the cgroup inode from getting torn down and potentially reallocated a moment
480 * after page_cgroup_ino() returns, so it only should be used by callers that
481 * do not care (such as procfs interfaces).
483 ino_t page_cgroup_ino(struct page *page)
485 struct mem_cgroup *memcg;
486 unsigned long ino = 0;
489 if (PageHead(page) && PageSlab(page))
490 memcg = memcg_from_slab_page(page);
492 memcg = READ_ONCE(page->mem_cgroup);
493 while (memcg && !(memcg->css.flags & CSS_ONLINE))
494 memcg = parent_mem_cgroup(memcg);
496 ino = cgroup_ino(memcg->css.cgroup);
501 static struct mem_cgroup_per_node *
502 mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
504 int nid = page_to_nid(page);
506 return memcg->nodeinfo[nid];
509 static struct mem_cgroup_tree_per_node *
510 soft_limit_tree_node(int nid)
512 return soft_limit_tree.rb_tree_per_node[nid];
515 static struct mem_cgroup_tree_per_node *
516 soft_limit_tree_from_page(struct page *page)
518 int nid = page_to_nid(page);
520 return soft_limit_tree.rb_tree_per_node[nid];
523 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
524 struct mem_cgroup_tree_per_node *mctz,
525 unsigned long new_usage_in_excess)
527 struct rb_node **p = &mctz->rb_root.rb_node;
528 struct rb_node *parent = NULL;
529 struct mem_cgroup_per_node *mz_node;
530 bool rightmost = true;
535 mz->usage_in_excess = new_usage_in_excess;
536 if (!mz->usage_in_excess)
540 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
542 if (mz->usage_in_excess < mz_node->usage_in_excess) {
548 * We can't avoid mem cgroups that are over their soft
549 * limit by the same amount
551 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
556 mctz->rb_rightmost = &mz->tree_node;
558 rb_link_node(&mz->tree_node, parent, p);
559 rb_insert_color(&mz->tree_node, &mctz->rb_root);
563 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
564 struct mem_cgroup_tree_per_node *mctz)
569 if (&mz->tree_node == mctz->rb_rightmost)
570 mctz->rb_rightmost = rb_prev(&mz->tree_node);
572 rb_erase(&mz->tree_node, &mctz->rb_root);
576 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
577 struct mem_cgroup_tree_per_node *mctz)
581 spin_lock_irqsave(&mctz->lock, flags);
582 __mem_cgroup_remove_exceeded(mz, mctz);
583 spin_unlock_irqrestore(&mctz->lock, flags);
586 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
588 unsigned long nr_pages = page_counter_read(&memcg->memory);
589 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
590 unsigned long excess = 0;
592 if (nr_pages > soft_limit)
593 excess = nr_pages - soft_limit;
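/*
 * Worked example (illustrative): with usage at 1536 pages and
 * memcg->soft_limit at 1024 pages, soft_limit_excess() returns
 * 1536 - 1024 = 512 pages; at or below the soft limit it returns 0.
 */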
598 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
600 unsigned long excess;
601 struct mem_cgroup_per_node *mz;
602 struct mem_cgroup_tree_per_node *mctz;
604 mctz = soft_limit_tree_from_page(page);
608 * Necessary to update all ancestors when hierarchy is used,
609 * because their event counter is not touched.
611 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
612 mz = mem_cgroup_page_nodeinfo(memcg, page);
613 excess = soft_limit_excess(memcg);
615 * We have to update the tree if mz is on the RB-tree or
616 * memcg is over its soft limit.
618 if (excess || mz->on_tree) {
621 spin_lock_irqsave(&mctz->lock, flags);
622 /* if on-tree, remove it */
624 __mem_cgroup_remove_exceeded(mz, mctz);
626 * Insert again. mz->usage_in_excess will be updated.
627 * If excess is 0, no tree ops.
629 __mem_cgroup_insert_exceeded(mz, mctz, excess);
630 spin_unlock_irqrestore(&mctz->lock, flags);
635 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
637 struct mem_cgroup_tree_per_node *mctz;
638 struct mem_cgroup_per_node *mz;
642 mz = mem_cgroup_nodeinfo(memcg, nid);
643 mctz = soft_limit_tree_node(nid);
645 mem_cgroup_remove_exceeded(mz, mctz);
649 static struct mem_cgroup_per_node *
650 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
652 struct mem_cgroup_per_node *mz;
656 if (!mctz->rb_rightmost)
657 goto done; /* Nothing to reclaim from */
659 mz = rb_entry(mctz->rb_rightmost,
660 struct mem_cgroup_per_node, tree_node);
662 * Remove the node now but someone else can add it back;
663 * we will add it back at the end of reclaim to its correct
664 * position in the tree.
666 __mem_cgroup_remove_exceeded(mz, mctz);
667 if (!soft_limit_excess(mz->memcg) ||
668 !css_tryget_online(&mz->memcg->css))
674 static struct mem_cgroup_per_node *
675 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
677 struct mem_cgroup_per_node *mz;
679 spin_lock_irq(&mctz->lock);
680 mz = __mem_cgroup_largest_soft_limit_node(mctz);
681 spin_unlock_irq(&mctz->lock);
686 * __mod_memcg_state - update cgroup memory statistics
687 * @memcg: the memory cgroup
688 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
689 * @val: delta to add to the counter, can be negative
691 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
695 if (mem_cgroup_disabled())
698 x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
699 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
700 struct mem_cgroup *mi;
703 * Batch local counters to keep them in sync with
704 * the hierarchical ones.
706 __this_cpu_add(memcg->vmstats_local->stat[idx], x);
707 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
708 atomic_long_add(x, &mi->vmstats[idx]);
711 __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
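/*
 * Illustrative (hypothetical call site): a typical caller adjusts one
 * counter by a small signed delta, e.g.
 *
 *	__mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
 *	__mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
 *
 * and only once the accumulated per-cpu delta exceeds MEMCG_CHARGE_BATCH
 * is it folded into the atomic hierarchical counters above.
 */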
714 static struct mem_cgroup_per_node *
715 parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
717 struct mem_cgroup *parent;
719 parent = parent_mem_cgroup(pn->memcg);
722 return mem_cgroup_nodeinfo(parent, nid);
726 * __mod_lruvec_state - update lruvec memory statistics
727 * @lruvec: the lruvec
728 * @idx: the stat item
729 * @val: delta to add to the counter, can be negative
731 * The lruvec is the intersection of the NUMA node and a cgroup. This
732 * function updates all three counters that are affected by a
733 * change of state at this level: per-node, per-cgroup, per-lruvec.
735 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
738 pg_data_t *pgdat = lruvec_pgdat(lruvec);
739 struct mem_cgroup_per_node *pn;
740 struct mem_cgroup *memcg;
744 __mod_node_page_state(pgdat, idx, val);
746 if (mem_cgroup_disabled())
749 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
753 __mod_memcg_state(memcg, idx, val);
755 x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
756 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
757 struct mem_cgroup_per_node *pi;
760 * Batch local counters to keep them in sync with
761 * the hierarchical ones.
763 __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
764 for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
765 atomic_long_add(x, &pi->lruvec_stat[idx]);
768 __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
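/*
 * Illustrative (hypothetical call site): charging a node_stat_item
 * through this helper keeps the per-node, per-memcg and per-lruvec
 * counters consistent in one call:
 *
 *	__mod_lruvec_state(lruvec, NR_FILE_MAPPED, 1);
 */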
771 void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
773 struct page *page = virt_to_head_page(p);
774 pg_data_t *pgdat = page_pgdat(page);
775 struct mem_cgroup *memcg;
776 struct lruvec *lruvec;
779 memcg = memcg_from_slab_page(page);
781 /* Untracked pages have no memcg, no lruvec. Update only the node */
782 if (!memcg || memcg == root_mem_cgroup) {
783 __mod_node_page_state(pgdat, idx, val);
785 lruvec = mem_cgroup_lruvec(pgdat, memcg);
786 __mod_lruvec_state(lruvec, idx, val);
792 * __count_memcg_events - account VM events in a cgroup
793 * @memcg: the memory cgroup
794 * @idx: the event item
795 * @count: the number of events that occurred
797 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
802 if (mem_cgroup_disabled())
805 x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
806 if (unlikely(x > MEMCG_CHARGE_BATCH)) {
807 struct mem_cgroup *mi;
810 * Batch local counters to keep them in sync with
811 * the hierarchical ones.
813 __this_cpu_add(memcg->vmstats_local->events[idx], x);
814 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
815 atomic_long_add(x, &mi->vmevents[idx]);
818 __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
821 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
823 return atomic_long_read(&memcg->vmevents[event]);
826 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
831 for_each_possible_cpu(cpu)
832 x += per_cpu(memcg->vmstats_local->events[event], cpu);
836 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
838 bool compound, int nr_pages)
841 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
842 * counted as CACHE even if it's on ANON LRU.
845 __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
847 __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
848 if (PageSwapBacked(page))
849 __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
853 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
854 __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
857 /* pagein of a big page is an event. So, ignore page size */
859 __count_memcg_events(memcg, PGPGIN, 1);
861 __count_memcg_events(memcg, PGPGOUT, 1);
862 nr_pages = -nr_pages; /* for event */
865 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
868 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
869 enum mem_cgroup_events_target target)
871 unsigned long val, next;
873 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
874 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
875 /* from time_after() in jiffies.h */
876 if ((long)(next - val) < 0) {
878 case MEM_CGROUP_TARGET_THRESH:
879 next = val + THRESHOLDS_EVENTS_TARGET;
881 case MEM_CGROUP_TARGET_SOFTLIMIT:
882 next = val + SOFTLIMIT_EVENTS_TARGET;
884 case MEM_CGROUP_TARGET_NUMAINFO:
885 next = val + NUMAINFO_EVENTS_TARGET;
890 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
897 * Check events in order.
900 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
902 /* threshold event is triggered in finer grain than soft limit */
903 if (unlikely(mem_cgroup_event_ratelimit(memcg,
904 MEM_CGROUP_TARGET_THRESH))) {
906 bool do_numainfo __maybe_unused;
908 do_softlimit = mem_cgroup_event_ratelimit(memcg,
909 MEM_CGROUP_TARGET_SOFTLIMIT);
911 do_numainfo = mem_cgroup_event_ratelimit(memcg,
912 MEM_CGROUP_TARGET_NUMAINFO);
914 mem_cgroup_threshold(memcg);
915 if (unlikely(do_softlimit))
916 mem_cgroup_update_tree(memcg, page);
918 if (unlikely(do_numainfo))
919 atomic_inc(&memcg->numainfo_events);
924 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
927 * mm_update_next_owner() may clear mm->owner to NULL
928 * if it races with swapoff, page migration, etc.
929 * So this can be called with p == NULL.
934 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
936 EXPORT_SYMBOL(mem_cgroup_from_task);
939 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
940 * @mm: mm from which memcg should be extracted. It can be NULL.
942 * Obtain a reference on mm->memcg and return it if successful. Otherwise
943 * root_mem_cgroup is returned. However, if mem_cgroup is disabled, NULL is
946 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
948 struct mem_cgroup *memcg;
950 if (mem_cgroup_disabled())
956 * Page cache insertions can happen without an
957 * actual mm context, e.g. during disk probing
958 * on boot, loopback IO, acct() writes etc.
961 memcg = root_mem_cgroup;
963 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
964 if (unlikely(!memcg))
965 memcg = root_mem_cgroup;
967 } while (!css_tryget_online(&memcg->css));
971 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
974 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
975 * @page: page from which memcg should be extracted.
977 * Obtain a reference on page->memcg and return it if successful. Otherwise
978 * root_mem_cgroup is returned.
980 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
982 struct mem_cgroup *memcg = page->mem_cgroup;
984 if (mem_cgroup_disabled())
988 if (!memcg || !css_tryget_online(&memcg->css))
989 memcg = root_mem_cgroup;
993 EXPORT_SYMBOL(get_mem_cgroup_from_page);
996 * If current->active_memcg is non-NULL, do not fall back to current->mm->memcg.
998 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
1000 if (unlikely(current->active_memcg)) {
1001 struct mem_cgroup *memcg = root_mem_cgroup;
1004 if (css_tryget_online(&current->active_memcg->css))
1005 memcg = current->active_memcg;
1009 return get_mem_cgroup_from_mm(current->mm);
1013 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1014 * @root: hierarchy root
1015 * @prev: previously returned memcg, NULL on first invocation
1016 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1018 * Returns references to children of the hierarchy below @root, or
1019 * @root itself, or %NULL after a full round-trip.
1021 * Caller must pass the return value in @prev on subsequent
1022 * invocations for reference counting, or use mem_cgroup_iter_break()
1023 * to cancel a hierarchy walk before the round-trip is complete.
1025 * Reclaimers can specify a node and a priority level in @reclaim to
1026 * divide up the memcgs in the hierarchy among all concurrent
1027 * reclaimers operating on the same node and priority.
1029 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1030 struct mem_cgroup *prev,
1031 struct mem_cgroup_reclaim_cookie *reclaim)
1033 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1034 struct cgroup_subsys_state *css = NULL;
1035 struct mem_cgroup *memcg = NULL;
1036 struct mem_cgroup *pos = NULL;
1038 if (mem_cgroup_disabled())
1042 root = root_mem_cgroup;
1044 if (prev && !reclaim)
1047 if (!root->use_hierarchy && root != root_mem_cgroup) {
1056 struct mem_cgroup_per_node *mz;
1058 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
1059 iter = &mz->iter[reclaim->priority];
1061 if (prev && reclaim->generation != iter->generation)
1065 pos = READ_ONCE(iter->position);
1066 if (!pos || css_tryget(&pos->css))
1069 * css reference reached zero, so iter->position will
1070 * be cleared by ->css_released. However, we should not
1071 * rely on this happening soon, because ->css_released
1072 * is called from a work queue, and by busy-waiting we
1073 * might block it. So we clear iter->position right
1076 (void)cmpxchg(&iter->position, pos, NULL);
1084 css = css_next_descendant_pre(css, &root->css);
1087 * Reclaimers share the hierarchy walk, and a
1088 * new one might jump in right at the end of
1089 * the hierarchy - make sure they see at least
1090 * one group and restart from the beginning.
1098 * Verify the css and acquire a reference. The root
1099 * is provided by the caller, so we know it's alive
1100 * and kicking, and don't take an extra reference.
1102 memcg = mem_cgroup_from_css(css);
1104 if (css == &root->css)
1107 if (css_tryget(css))
1115 * The position could have already been updated by a competing
1116 * thread, so check that the value hasn't changed since we read
1117 * it to avoid reclaiming from the same cgroup twice.
1119 (void)cmpxchg(&iter->position, pos, memcg);
1127 reclaim->generation = iter->generation;
1133 if (prev && prev != root)
1134 css_put(&prev->css);
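/*
 * Usage sketch (illustrative, modelled on the reclaim-style shared
 * walk; reclaim_one_memcg() is hypothetical):
 *
 *	struct mem_cgroup_reclaim_cookie reclaim = {
 *		.pgdat = pgdat,
 *		.priority = priority,
 *	};
 *	struct mem_cgroup *memcg;
 *
 *	memcg = mem_cgroup_iter(root, NULL, &reclaim);
 *	do {
 *		reclaim_one_memcg(memcg, pgdat);
 *	} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
 */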
1140 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1141 * @root: hierarchy root
1142 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1144 void mem_cgroup_iter_break(struct mem_cgroup *root,
1145 struct mem_cgroup *prev)
1148 root = root_mem_cgroup;
1149 if (prev && prev != root)
1150 css_put(&prev->css);
1153 static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1154 struct mem_cgroup *dead_memcg)
1156 struct mem_cgroup_reclaim_iter *iter;
1157 struct mem_cgroup_per_node *mz;
1161 for_each_node(nid) {
1162 mz = mem_cgroup_nodeinfo(from, nid);
1163 for (i = 0; i <= DEF_PRIORITY; i++) {
1164 iter = &mz->iter[i];
1165 cmpxchg(&iter->position,
1171 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1173 struct mem_cgroup *memcg = dead_memcg;
1174 struct mem_cgroup *last;
1177 __invalidate_reclaim_iterators(memcg, dead_memcg);
1179 } while ((memcg = parent_mem_cgroup(memcg)));
1182 * When cgroup1 non-hierarchy mode is used,
1183 * parent_mem_cgroup() does not walk all the way up to the
1184 * cgroup root (root_mem_cgroup). So we have to handle
1185 * dead_memcg from cgroup root separately.
1187 if (last != root_mem_cgroup)
1188 __invalidate_reclaim_iterators(root_mem_cgroup,
1193 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1194 * @memcg: hierarchy root
1195 * @fn: function to call for each task
1196 * @arg: argument passed to @fn
1198 * This function iterates over tasks attached to @memcg or to any of its
1199 * descendants and calls @fn for each task. If @fn returns a non-zero
1200 * value, the function breaks the iteration loop and returns the value.
1201 * Otherwise, it will iterate over all tasks and return 0.
1203 * This function must not be called for the root memory cgroup.
1205 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1206 int (*fn)(struct task_struct *, void *), void *arg)
1208 struct mem_cgroup *iter;
1211 BUG_ON(memcg == root_mem_cgroup);
1213 for_each_mem_cgroup_tree(iter, memcg) {
1214 struct css_task_iter it;
1215 struct task_struct *task;
1217 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1218 while (!ret && (task = css_task_iter_next(&it)))
1219 ret = fn(task, arg);
1220 css_task_iter_end(&it);
1222 mem_cgroup_iter_break(memcg, iter);
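/*
 * Usage sketch (illustrative; count_one_task() is hypothetical and
 * returns 0 so that the iteration continues over all tasks):
 *
 *	static int count_one_task(struct task_struct *task, void *arg)
 *	{
 *		(*(unsigned int *)arg)++;
 *		return 0;
 *	}
 *
 *	unsigned int nr_tasks = 0;
 *	mem_cgroup_scan_tasks(memcg, count_one_task, &nr_tasks);
 */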
1230 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
1232 * @pgdat: pgdat of the page
1234 * This function is only safe when following the LRU page isolation
1235 * and putback protocol: the LRU lock must be held, and the page must
1236 * either be PageLRU() or the caller must have isolated/allocated it.
1238 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1240 struct mem_cgroup_per_node *mz;
1241 struct mem_cgroup *memcg;
1242 struct lruvec *lruvec;
1244 if (mem_cgroup_disabled()) {
1245 lruvec = &pgdat->lruvec;
1249 memcg = page->mem_cgroup;
1251 * Swapcache readahead pages are added to the LRU - and
1252 * possibly migrated - before they are charged.
1255 memcg = root_mem_cgroup;
1257 mz = mem_cgroup_page_nodeinfo(memcg, page);
1258 lruvec = &mz->lruvec;
1261 * Since a node can be onlined after the mem_cgroup was created,
1262 * we have to be prepared to initialize lruvec->pgdat here;
1263 * and if offlined then reonlined, we need to reinitialize it.
1265 if (unlikely(lruvec->pgdat != pgdat))
1266 lruvec->pgdat = pgdat;
1271 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1272 * @lruvec: mem_cgroup per zone lru vector
1273 * @lru: index of lru list the page is sitting on
1274 * @zid: zone id of the accounted pages
1275 * @nr_pages: positive when adding or negative when removing
1277 * This function must be called under lru_lock, just before a page is added
1278 * to or just after a page is removed from an lru list (that ordering being
1279 * so as to allow it to check that lru_size 0 is consistent with list_empty).
1281 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1282 int zid, int nr_pages)
1284 struct mem_cgroup_per_node *mz;
1285 unsigned long *lru_size;
1288 if (mem_cgroup_disabled())
1291 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1292 lru_size = &mz->lru_zone_size[zid][lru];
1295 *lru_size += nr_pages;
1298 if (WARN_ONCE(size < 0,
1299 "%s(%p, %d, %d): lru_size %ld\n",
1300 __func__, lruvec, lru, nr_pages, size)) {
1306 *lru_size += nr_pages;
1310 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1311 * @memcg: the memory cgroup
1313 * Returns the maximum amount of memory @memcg can be charged with, in
1316 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1318 unsigned long margin = 0;
1319 unsigned long count;
1320 unsigned long limit;
1322 count = page_counter_read(&memcg->memory);
1323 limit = READ_ONCE(memcg->memory.max);
1325 margin = limit - count;
1327 if (do_memsw_account()) {
1328 count = page_counter_read(&memcg->memsw);
1329 limit = READ_ONCE(memcg->memsw.max);
1331 margin = min(margin, limit - count);
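/*
 * Worked example (illustrative): with memory.max == 1000 pages and
 * usage == 700, the margin is 300 pages; if legacy memsw accounting is
 * active with memsw.max == 1200 and memsw usage == 1000, the result is
 * min(300, 200) == 200 pages.
 */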
1340 * A routine for checking whether "mem" is under move_account() or not.
1342 * Checking whether a cgroup is mc.from or mc.to, or under the hierarchy of
1343 * a moving cgroup. This is for waiting at high memory pressure
1346 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1348 struct mem_cgroup *from;
1349 struct mem_cgroup *to;
1352 * Unlike task_move routines, we access mc.to, mc.from not under
1353 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1355 spin_lock(&mc.lock);
1361 ret = mem_cgroup_is_descendant(from, memcg) ||
1362 mem_cgroup_is_descendant(to, memcg);
1364 spin_unlock(&mc.lock);
1368 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1370 if (mc.moving_task && current != mc.moving_task) {
1371 if (mem_cgroup_under_move(memcg)) {
1373 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1374 /* moving charge context might have finished. */
1377 finish_wait(&mc.waitq, &wait);
1384 static char *memory_stat_format(struct mem_cgroup *memcg)
1389 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1394 * Provide statistics on the state of the memory subsystem as
1395 * well as cumulative event counters that show past behavior.
1397 * This list is ordered following a combination of these gradients:
1398 * 1) generic big picture -> specifics and details
1399 * 2) reflecting userspace activity -> reflecting kernel heuristics
1401 * Current memory state:
1404 seq_buf_printf(&s, "anon %llu\n",
1405 (u64)memcg_page_state(memcg, MEMCG_RSS) *
1407 seq_buf_printf(&s, "file %llu\n",
1408 (u64)memcg_page_state(memcg, MEMCG_CACHE) *
1410 seq_buf_printf(&s, "kernel_stack %llu\n",
1411 (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
1413 seq_buf_printf(&s, "slab %llu\n",
1414 (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
1415 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
1417 seq_buf_printf(&s, "sock %llu\n",
1418 (u64)memcg_page_state(memcg, MEMCG_SOCK) *
1421 seq_buf_printf(&s, "shmem %llu\n",
1422 (u64)memcg_page_state(memcg, NR_SHMEM) *
1424 seq_buf_printf(&s, "file_mapped %llu\n",
1425 (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
1427 seq_buf_printf(&s, "file_dirty %llu\n",
1428 (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
1430 seq_buf_printf(&s, "file_writeback %llu\n",
1431 (u64)memcg_page_state(memcg, NR_WRITEBACK) *
1435 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
1436 * with the NR_ANON_THP vm counter, but right now it's a pain in the
1437 * arse because it requires migrating the work out of rmap to a place
1438 * where the page->mem_cgroup is set up and stable.
1440 seq_buf_printf(&s, "anon_thp %llu\n",
1441 (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
1444 for (i = 0; i < NR_LRU_LISTS; i++)
1445 seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
1446 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1449 seq_buf_printf(&s, "slab_reclaimable %llu\n",
1450 (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
1452 seq_buf_printf(&s, "slab_unreclaimable %llu\n",
1453 (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
1456 /* Accumulated memory events */
1458 seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
1459 seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
1461 seq_buf_printf(&s, "workingset_refault %lu\n",
1462 memcg_page_state(memcg, WORKINGSET_REFAULT));
1463 seq_buf_printf(&s, "workingset_activate %lu\n",
1464 memcg_page_state(memcg, WORKINGSET_ACTIVATE));
1465 seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
1466 memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
1468 seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
1469 seq_buf_printf(&s, "pgscan %lu\n",
1470 memcg_events(memcg, PGSCAN_KSWAPD) +
1471 memcg_events(memcg, PGSCAN_DIRECT));
1472 seq_buf_printf(&s, "pgsteal %lu\n",
1473 memcg_events(memcg, PGSTEAL_KSWAPD) +
1474 memcg_events(memcg, PGSTEAL_DIRECT));
1475 seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
1476 seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
1477 seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
1478 seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
1480 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1481 seq_buf_printf(&s, "thp_fault_alloc %lu\n",
1482 memcg_events(memcg, THP_FAULT_ALLOC));
1483 seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
1484 memcg_events(memcg, THP_COLLAPSE_ALLOC));
1485 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1487 /* The above should easily fit into one page */
1488 WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1493 #define K(x) ((x) << (PAGE_SHIFT-10))
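/*
 * K() converts a page count to kilobytes: with 4K pages (PAGE_SHIFT == 12,
 * used here purely as an illustrative value), K(x) == x << 2, so e.g.
 * K(256) == 1024 kB.
 */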
1495 * mem_cgroup_print_oom_context: Print OOM information relevant to
1496 * memory controller.
1497 * @memcg: The memory cgroup that went over limit
1498 * @p: Task that is going to be killed
1500 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1503 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1508 pr_cont(",oom_memcg=");
1509 pr_cont_cgroup_path(memcg->css.cgroup);
1511 pr_cont(",global_oom");
1513 pr_cont(",task_memcg=");
1514 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1520 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1521 * memory controller.
1522 * @memcg: The memory cgroup that went over limit
1524 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1528 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1529 K((u64)page_counter_read(&memcg->memory)),
1530 K((u64)memcg->memory.max), memcg->memory.failcnt);
1531 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1532 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1533 K((u64)page_counter_read(&memcg->swap)),
1534 K((u64)memcg->swap.max), memcg->swap.failcnt);
1536 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1537 K((u64)page_counter_read(&memcg->memsw)),
1538 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1539 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1540 K((u64)page_counter_read(&memcg->kmem)),
1541 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1544 pr_info("Memory cgroup stats for ");
1545 pr_cont_cgroup_path(memcg->css.cgroup);
1547 buf = memory_stat_format(memcg);
1555 * Return the memory (and swap, if configured) limit for a memcg.
1557 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1561 max = memcg->memory.max;
1562 if (mem_cgroup_swappiness(memcg)) {
1563 unsigned long memsw_max;
1564 unsigned long swap_max;
1566 memsw_max = memcg->memsw.max;
1567 swap_max = memcg->swap.max;
1568 swap_max = min(swap_max, (unsigned long)total_swap_pages);
1569 max = min(max + swap_max, memsw_max);
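/*
 * Worked example (illustrative): with memory.max == 1000 pages,
 * swap.max == 500, total_swap_pages == 300 and memsw.max == 1200, the
 * swappiness != 0 case yields min(1000 + min(500, 300), 1200) ==
 * min(1300, 1200) == 1200 pages.
 */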
1574 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1577 struct oom_control oc = {
1581 .gfp_mask = gfp_mask,
1586 if (mutex_lock_killable(&oom_lock))
1589 * A few threads which were not waiting at mutex_lock_killable() can
1590 * fail to bail out. Therefore, check again after holding oom_lock.
1592 ret = should_force_charge() || out_of_memory(&oc);
1593 mutex_unlock(&oom_lock);
1597 #if MAX_NUMNODES > 1
1600 * test_mem_cgroup_node_reclaimable
1601 * @memcg: the target memcg
1602 * @nid: the node ID to be checked.
1603 * @noswap: specify true here if the user wants file-only information.
1605 * This function returns whether the specified memcg contains any
1606 * reclaimable pages on a node. Returns true if there are any reclaimable
1607 * pages in the node.
1609 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1610 int nid, bool noswap)
1612 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
1614 if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
1615 lruvec_page_state(lruvec, NR_ACTIVE_FILE))
1617 if (noswap || !total_swap_pages)
1619 if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
1620 lruvec_page_state(lruvec, NR_ACTIVE_ANON))
1627 * Always updating the nodemask is not very good - even if we have an empty
1628 * list or the wrong list here, we can start from some node and traverse all
1629 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1632 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1636 * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET
1637 * pagein/pageout changes since the last update.
1639 if (!atomic_read(&memcg->numainfo_events))
1641 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1644 /* make a nodemask where this memcg uses memory from */
1645 memcg->scan_nodes = node_states[N_MEMORY];
1647 for_each_node_mask(nid, node_states[N_MEMORY]) {
1649 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1650 node_clear(nid, memcg->scan_nodes);
1653 atomic_set(&memcg->numainfo_events, 0);
1654 atomic_set(&memcg->numainfo_updating, 0);
1658 * Selecting a node to start reclaim from. Because what we need is just
1659 * reducing the usage counter, starting from anywhere is OK. Considering
1660 * memory reclaim from the current node, there are pros and cons:
1662 * Freeing memory from the current node means freeing memory from a node which
1663 * we'll use or have used, so it may hurt LRU ordering. And if several threads
1664 * hit their limits, they will contend on one node. But freeing from a remote
1665 * node means more cost for memory reclaim because of memory latency.
1667 * Now, we use round-robin. A better algorithm is welcome.
1669 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1673 mem_cgroup_may_update_nodemask(memcg);
1674 node = memcg->last_scanned_node;
1676 node = next_node_in(node, memcg->scan_nodes);
1678 * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
1679 * last time it really checked all the LRUs due to rate limiting.
1680 * Fall back to the current node in that case for simplicity.
1682 if (unlikely(node == MAX_NUMNODES))
1683 node = numa_node_id();
1685 memcg->last_scanned_node = node;
1689 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1695 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1698 unsigned long *total_scanned)
1700 struct mem_cgroup *victim = NULL;
1703 unsigned long excess;
1704 unsigned long nr_scanned;
1705 struct mem_cgroup_reclaim_cookie reclaim = {
1710 excess = soft_limit_excess(root_memcg);
1713 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1718 * If we have not been able to reclaim
1719 * anything, it might be because there are
1720 * no reclaimable pages under this hierarchy
1725 * We want to do more targeted reclaim.
1726 * excess >> 2 is not too excessive, so we don't
1727 * reclaim too much, nor too little, which would keep us
1728 * coming back to reclaim from this cgroup
1730 if (total >= (excess >> 2) ||
1731 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1736 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1737 pgdat, &nr_scanned);
1738 *total_scanned += nr_scanned;
1739 if (!soft_limit_excess(root_memcg))
1742 mem_cgroup_iter_break(root_memcg, victim);
1746 #ifdef CONFIG_LOCKDEP
1747 static struct lockdep_map memcg_oom_lock_dep_map = {
1748 .name = "memcg_oom_lock",
1752 static DEFINE_SPINLOCK(memcg_oom_lock);
1755 * Check whether the OOM killer is already running under our hierarchy.
1756 * If someone is running it, return false.
1758 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1760 struct mem_cgroup *iter, *failed = NULL;
1762 spin_lock(&memcg_oom_lock);
1764 for_each_mem_cgroup_tree(iter, memcg) {
1765 if (iter->oom_lock) {
1767 * this subtree of our hierarchy is already locked
1768 * so we cannot take the lock.
1771 mem_cgroup_iter_break(memcg, iter);
1774 iter->oom_lock = true;
1779 * OK, we failed to lock the whole subtree so we have
1780 * to clean up what we already set up, up to the failing subtree
1782 for_each_mem_cgroup_tree(iter, memcg) {
1783 if (iter == failed) {
1784 mem_cgroup_iter_break(memcg, iter);
1787 iter->oom_lock = false;
1790 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1792 spin_unlock(&memcg_oom_lock);
1797 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1799 struct mem_cgroup *iter;
1801 spin_lock(&memcg_oom_lock);
1802 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1803 for_each_mem_cgroup_tree(iter, memcg)
1804 iter->oom_lock = false;
1805 spin_unlock(&memcg_oom_lock);
1808 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1810 struct mem_cgroup *iter;
1812 spin_lock(&memcg_oom_lock);
1813 for_each_mem_cgroup_tree(iter, memcg)
1815 spin_unlock(&memcg_oom_lock);
1818 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1820 struct mem_cgroup *iter;
1823 * When a new child is created while the hierarchy is under oom,
1824 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1826 spin_lock(&memcg_oom_lock);
1827 for_each_mem_cgroup_tree(iter, memcg)
1828 if (iter->under_oom > 0)
1830 spin_unlock(&memcg_oom_lock);
1833 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1835 struct oom_wait_info {
1836 struct mem_cgroup *memcg;
1837 wait_queue_entry_t wait;
1840 static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1841 unsigned mode, int sync, void *arg)
1843 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1844 struct mem_cgroup *oom_wait_memcg;
1845 struct oom_wait_info *oom_wait_info;
1847 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1848 oom_wait_memcg = oom_wait_info->memcg;
1850 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1851 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1853 return autoremove_wake_function(wait, mode, sync, arg);
1856 static void memcg_oom_recover(struct mem_cgroup *memcg)
1859 * For the following lockless ->under_oom test, the only required
1860 * guarantee is that it must see the state asserted by an OOM when
1861 * this function is called as a result of userland actions
1862 * triggered by the notification of the OOM. This is trivially
1863 * achieved by invoking mem_cgroup_mark_under_oom() before
1864 * triggering notification.
1866 if (memcg && memcg->under_oom)
1867 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1877 static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1879 enum oom_status ret;
1882 if (order > PAGE_ALLOC_COSTLY_ORDER)
1885 memcg_memory_event(memcg, MEMCG_OOM);
1888 * We are in the middle of the charge context here, so we
1889 * don't want to block when potentially sitting on a callstack
1890 * that holds all kinds of filesystem and mm locks.
1892 * cgroup1 allows disabling the OOM killer and waiting for outside
1893 * handling until the charge can succeed; remember the context and put
1894 * the task to sleep at the end of the page fault when all locks are
1897 * On the other hand, in-kernel OOM killer allows for an async victim
1898 * memory reclaim (oom_reaper) and that means that we are not solely
1899 * relying on the oom victim to make a forward progress and we can
1900 * invoke the oom killer here.
1902 * Please note that mem_cgroup_out_of_memory might fail to find a
1903 * victim and then we have to bail out from the charge path.
1905 if (memcg->oom_kill_disable) {
1906 if (!current->in_user_fault)
1908 css_get(&memcg->css);
1909 current->memcg_in_oom = memcg;
1910 current->memcg_oom_gfp_mask = mask;
1911 current->memcg_oom_order = order;
1916 mem_cgroup_mark_under_oom(memcg);
1918 locked = mem_cgroup_oom_trylock(memcg);
1921 mem_cgroup_oom_notify(memcg);
1923 mem_cgroup_unmark_under_oom(memcg);
1924 if (mem_cgroup_out_of_memory(memcg, mask, order))
1930 mem_cgroup_oom_unlock(memcg);
1936 * mem_cgroup_oom_synchronize - complete memcg OOM handling
1937 * @handle: actually kill/wait or just clean up the OOM state
1939 * This has to be called at the end of a page fault if the memcg OOM
1940 * handler was enabled.
1942 * Memcg supports userspace OOM handling where failed allocations must
1943 * sleep on a waitqueue until the userspace task resolves the
1944 * situation. Sleeping directly in the charge context with all kinds
1945 * of locks held is not a good idea, instead we remember an OOM state
1946 * in the task and mem_cgroup_oom_synchronize() has to be called at
1947 * the end of the page fault to complete the OOM handling.
1949 * Returns %true if an ongoing memcg OOM situation was detected and
1950 * completed, %false otherwise.
1952 bool mem_cgroup_oom_synchronize(bool handle)
1954 struct mem_cgroup *memcg = current->memcg_in_oom;
1955 struct oom_wait_info owait;
1958 /* OOM is global, do not handle */
1965 owait.memcg = memcg;
1966 owait.wait.flags = 0;
1967 owait.wait.func = memcg_oom_wake_function;
1968 owait.wait.private = current;
1969 INIT_LIST_HEAD(&owait.wait.entry);
1971 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1972 mem_cgroup_mark_under_oom(memcg);
1974 locked = mem_cgroup_oom_trylock(memcg);
1977 mem_cgroup_oom_notify(memcg);
1979 if (locked && !memcg->oom_kill_disable) {
1980 mem_cgroup_unmark_under_oom(memcg);
1981 finish_wait(&memcg_oom_waitq, &owait.wait);
1982 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1983 current->memcg_oom_order);
1986 mem_cgroup_unmark_under_oom(memcg);
1987 finish_wait(&memcg_oom_waitq, &owait.wait);
1991 mem_cgroup_oom_unlock(memcg);
1993 * There is no guarantee that an OOM-lock contender
1994 * sees the wakeups triggered by the OOM kill
1995 * uncharges. Wake any sleepers explicitly.
1997 memcg_oom_recover(memcg);
2000 current->memcg_in_oom = NULL;
2001 css_put(&memcg->css);
2006 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
2007 * @victim: task to be killed by the OOM killer
2008 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
2010 * Returns a pointer to a memory cgroup, which has to be cleaned up
2011 * by killing all belonging OOM-killable tasks.
2013 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
2015 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
2016 struct mem_cgroup *oom_domain)
2018 struct mem_cgroup *oom_group = NULL;
2019 struct mem_cgroup *memcg;
2021 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2025 oom_domain = root_mem_cgroup;
2029 memcg = mem_cgroup_from_task(victim);
2030 if (memcg == root_mem_cgroup)
2034 * Traverse the memory cgroup hierarchy from the victim task's
2035 * cgroup up to the OOMing cgroup (or root) to find the
2036 * highest-level memory cgroup with oom.group set.
2038 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2039 if (memcg->oom_group)
2042 if (memcg == oom_domain)
2047 css_get(&oom_group->css);
2054 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2056 pr_info("Tasks in ");
2057 pr_cont_cgroup_path(memcg->css.cgroup);
2058 pr_cont(" are going to be killed due to memory.oom.group set\n");
2062 * lock_page_memcg - lock a page->mem_cgroup binding
2065 * This function protects unlocked LRU pages from being moved to
2068 * It ensures lifetime of the returned memcg. Caller is responsible
2069 * for the lifetime of the page; __unlock_page_memcg() is available
2070 * when @page might get freed inside the locked section.
2072 struct mem_cgroup *lock_page_memcg(struct page *page)
2074 struct mem_cgroup *memcg;
2075 unsigned long flags;
2078 * The RCU lock is held throughout the transaction. The fast
2079 * path can get away without acquiring the memcg->move_lock
2080 * because page moving starts with an RCU grace period.
2082 * The RCU lock also protects the memcg from being freed when
2083 * the page state that is going to change is the only thing
2084 * preventing the page itself from being freed. E.g. writeback
2085 * doesn't hold a page reference and relies on PG_writeback to
2086 * keep off truncation, migration and so forth.
2090 if (mem_cgroup_disabled())
2093 memcg = page->mem_cgroup;
2094 if (unlikely(!memcg))
2097 if (atomic_read(&memcg->moving_account) <= 0)
2100 spin_lock_irqsave(&memcg->move_lock, flags);
2101 if (memcg != page->mem_cgroup) {
2102 spin_unlock_irqrestore(&memcg->move_lock, flags);
2107 * When charge migration first begins, we can have locked and
2108 * unlocked page stat updates happening concurrently. Track
2109 * the task who has the lock for unlock_page_memcg().
2111 memcg->move_lock_task = current;
2112 memcg->move_lock_flags = flags;
2116 EXPORT_SYMBOL(lock_page_memcg);
2119 * __unlock_page_memcg - unlock and unpin a memcg
2122 * Unlock and unpin a memcg returned by lock_page_memcg().
2124 void __unlock_page_memcg(struct mem_cgroup *memcg)
2126 if (memcg && memcg->move_lock_task == current) {
2127 unsigned long flags = memcg->move_lock_flags;
2129 memcg->move_lock_task = NULL;
2130 memcg->move_lock_flags = 0;
2132 spin_unlock_irqrestore(&memcg->move_lock, flags);
2139 * unlock_page_memcg - unlock a page->mem_cgroup binding
2142 void unlock_page_memcg(struct page *page)
2144 __unlock_page_memcg(page->mem_cgroup);
2146 EXPORT_SYMBOL(unlock_page_memcg);
2148 struct memcg_stock_pcp {
2149 struct mem_cgroup *cached; /* this is never the root cgroup */
2150 unsigned int nr_pages;
2151 struct work_struct work;
2152 unsigned long flags;
2153 #define FLUSHING_CACHED_CHARGE 0
2155 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2156 static DEFINE_MUTEX(percpu_charge_mutex);
2159 * consume_stock: Try to consume stocked charge on this cpu.
2160 * @memcg: memcg to consume from.
2161 * @nr_pages: how many pages to charge.
2163 * The charges will only happen if @memcg matches the current cpu's memcg
2164 * stock, and at least @nr_pages are available in that stock. Failure to
2165 * service an allocation will refill the stock.
2167 * returns true if successful, false otherwise.
2169 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2171 struct memcg_stock_pcp *stock;
2172 unsigned long flags;
2175 if (nr_pages > MEMCG_CHARGE_BATCH)
2178 local_irq_save(flags);
2180 stock = this_cpu_ptr(&memcg_stock);
2181 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2182 stock->nr_pages -= nr_pages;
2186 local_irq_restore(flags);
2192 * Returns cached per-cpu charges to the counters and resets the cached information.
2194 static void drain_stock(struct memcg_stock_pcp *stock)
2196 struct mem_cgroup *old = stock->cached;
2198 if (stock->nr_pages) {
2199 page_counter_uncharge(&old->memory, stock->nr_pages);
2200 if (do_memsw_account())
2201 page_counter_uncharge(&old->memsw, stock->nr_pages);
2202 css_put_many(&old->css, stock->nr_pages);
2203 stock->nr_pages = 0;
2205 stock->cached = NULL;
2208 static void drain_local_stock(struct work_struct *dummy)
2210 struct memcg_stock_pcp *stock;
2211 unsigned long flags;
2214 * The only protection from memory hotplug vs. drain_stock races is
2215 * that we always operate on local CPU stock here with IRQ disabled
2217 local_irq_save(flags);
2219 stock = this_cpu_ptr(&memcg_stock);
2221 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2223 local_irq_restore(flags);
2227 * Cache charges (nr_pages) in the local per-cpu area.
2228 * They will be consumed by consume_stock() later.
2230 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2232 struct memcg_stock_pcp *stock;
2233 unsigned long flags;
2235 local_irq_save(flags);
2237 stock = this_cpu_ptr(&memcg_stock);
2238 if (stock->cached != memcg) { /* reset if necessary */
2240 stock->cached = memcg;
2242 stock->nr_pages += nr_pages;
2244 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2247 local_irq_restore(flags);
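/*
 * Condensed sketch (illustrative only) of how try_charge(), further
 * below, uses the per-cpu stock: it first tries consume_stock(); on a
 * miss it charges the page_counter for a full MEMCG_CHARGE_BATCH and
 * parks the surplus with refill_stock():
 *
 *	if (consume_stock(memcg, nr_pages))
 *		return 0;
 *	...
 *	if (batch > nr_pages)
 *		refill_stock(memcg, batch - nr_pages);
 */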
2251 * Drains all per-CPU charge caches for the given root_memcg, i.e. the
2252 * subtree of the hierarchy under it.
2254 static void drain_all_stock(struct mem_cgroup *root_memcg)
2258 /* If someone's already draining, avoid running more workers. */
2259 if (!mutex_trylock(&percpu_charge_mutex))
2262 * Notify other cpus that system-wide "drain" is running
2263 * We do not care about races with the cpu hotplug because cpu down
2264 * as well as workers from this path always operate on the local
2265 * per-cpu data. CPU up doesn't touch memcg_stock at all.
2268 for_each_online_cpu(cpu) {
2269 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2270 struct mem_cgroup *memcg;
2272 memcg = stock->cached;
2273 if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2275 if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2276 css_put(&memcg->css);
2279 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2281 drain_local_stock(&stock->work);
2283 schedule_work_on(cpu, &stock->work);
2285 css_put(&memcg->css);
2288 mutex_unlock(&percpu_charge_mutex);
2291 static int memcg_hotplug_cpu_dead(unsigned int cpu)
2293 struct memcg_stock_pcp *stock;
2294 struct mem_cgroup *memcg, *mi;
2296 stock = &per_cpu(memcg_stock, cpu);
2299 for_each_mem_cgroup(memcg) {
2302 for (i = 0; i < MEMCG_NR_STAT; i++) {
2306 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2308 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2309 atomic_long_add(x, &memcg->vmstats[i]);
2311 if (i >= NR_VM_NODE_STAT_ITEMS)
2314 for_each_node(nid) {
2315 struct mem_cgroup_per_node *pn;
2317 pn = mem_cgroup_nodeinfo(memcg, nid);
2318 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2321 atomic_long_add(x, &pn->lruvec_stat[i]);
2322 } while ((pn = parent_nodeinfo(pn, nid)));
2326 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2329 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2331 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2332 atomic_long_add(x, &memcg->vmevents[i]);
2339 static void reclaim_high(struct mem_cgroup *memcg,
2340 unsigned int nr_pages,
2344 if (page_counter_read(&memcg->memory) <= memcg->high)
2346 memcg_memory_event(memcg, MEMCG_HIGH);
2347 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2348 } while ((memcg = parent_mem_cgroup(memcg)));
2351 static void high_work_func(struct work_struct *work)
2353 struct mem_cgroup *memcg;
2355 memcg = container_of(work, struct mem_cgroup, high_work);
2356 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2360 * Scheduled by try_charge() to be executed from the userland return path
2361 * to reclaim memory over the high limit.
2363 void mem_cgroup_handle_over_high(void)
2365 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2366 struct mem_cgroup *memcg;
2368 if (likely(!nr_pages))
2371 memcg = get_mem_cgroup_from_mm(current->mm);
2372 reclaim_high(memcg, nr_pages, GFP_KERNEL);
2373 css_put(&memcg->css);
2374 current->memcg_nr_pages_over_high = 0;
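/*
 * For context: a hedged sketch of the caller side. The return-to-user
 * hook in <linux/tracehook.h> (already included by this file) ends up
 * invoking mem_cgroup_handle_over_high() above; the snippet is a
 * paraphrase, not the exact upstream code:
 *
 *	static inline void tracehook_notify_resume(struct pt_regs *regs)
 *	{
 *		...
 *		mem_cgroup_handle_over_high();
 *	}
 *
 * try_charge() only records current->memcg_nr_pages_over_high and calls
 * set_notify_resume(); the reclaim itself happens here, in process
 * context, where GFP_KERNEL can be used safely.
 */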
2377 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2378 unsigned int nr_pages)
2380 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2381 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2382 struct mem_cgroup *mem_over_limit;
2383 struct page_counter *counter;
2384 unsigned long nr_reclaimed;
2385 bool may_swap = true;
2386 bool drained = false;
2387 enum oom_status oom_status;
2389 if (mem_cgroup_is_root(memcg))
2392 if (consume_stock(memcg, nr_pages))
2395 if (!do_memsw_account() ||
2396 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2397 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2399 if (do_memsw_account())
2400 page_counter_uncharge(&memcg->memsw, batch);
2401 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2403 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2407 if (batch > nr_pages) {
2413 * Unlike in global OOM situations, memcg is not in a physical
2414 * memory shortage. Allow dying and OOM-killed tasks to
2415 * bypass the last charges so that they can exit quickly and
2416 * free their memory.
2418 if (unlikely(should_force_charge()))
2422 * Prevent unbounded recursion when reclaim operations need to
2423 * allocate memory. This might exceed the limits temporarily,
2424 * but we prefer facilitating memory reclaim and getting back
2425 * under the limit over triggering OOM kills in these cases.
2427 if (unlikely(current->flags & PF_MEMALLOC))
2430 if (unlikely(task_in_memcg_oom(current)))
2433 if (!gfpflags_allow_blocking(gfp_mask))
2436 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2438 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2439 gfp_mask, may_swap);
2441 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2445 drain_all_stock(mem_over_limit);
2450 if (gfp_mask & __GFP_NORETRY)
2453 * Even though the limit is exceeded at this point, reclaim
2454 * may have been able to free some pages. Retry the charge
2455 * before killing the task.
2457 * Only for regular pages, though: huge pages are rather
2458 * unlikely to succeed so close to the limit, and we fall back
2459 * to regular pages anyway in case of failure.
2461 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2464 * During a task move, charges can be counted twice. So it's better
2465 * to wait until the end of the move if one is in progress.
2467 if (mem_cgroup_wait_acct_move(mem_over_limit))
2473 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2476 if (gfp_mask & __GFP_NOFAIL)
2479 if (fatal_signal_pending(current))
2483 * Keep retrying as long as the memcg OOM killer is able to make
2484 * forward progress, or bypass the charge if the OOM killer
2485 * couldn't make any progress.
2487 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2488 get_order(nr_pages * PAGE_SIZE));
2489 switch (oom_status) {
2491 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2499 if (!(gfp_mask & __GFP_NOFAIL))
2503 * The allocation either can't fail or will lead to more memory
2504 * being freed very soon. Allow memory usage to go over the limit
2505 * temporarily by force-charging it.
2507 page_counter_charge(&memcg->memory, nr_pages);
2508 if (do_memsw_account())
2509 page_counter_charge(&memcg->memsw, nr_pages);
2510 css_get_many(&memcg->css, nr_pages);
2515 css_get_many(&memcg->css, batch);
2516 if (batch > nr_pages)
2517 refill_stock(memcg, batch - nr_pages);
2520 * If the hierarchy is above the normal consumption range, schedule
2521 * reclaim on returning to userland. We could perform reclaim here
2522 * if __GFP_RECLAIM is set, but let's always punt for simplicity, and so
2523 * that GFP_KERNEL can consistently be used during reclaim. @memcg is
2524 * not recorded as it most likely matches current's and won't
2525 * change in the meantime. As the high limit is checked again before
2526 * reclaim, the cost of a mismatch is negligible.
2529 if (page_counter_read(&memcg->memory) > memcg->high) {
2530 /* Don't bother a random interrupted task */
2531 if (in_interrupt()) {
2532 schedule_work(&memcg->high_work);
2535 current->memcg_nr_pages_over_high += batch;
2536 set_notify_resume(current);
2539 } while ((memcg = parent_mem_cgroup(memcg)));
2544 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2546 if (mem_cgroup_is_root(memcg))
2549 page_counter_uncharge(&memcg->memory, nr_pages);
2550 if (do_memsw_account())
2551 page_counter_uncharge(&memcg->memsw, nr_pages);
2553 css_put_many(&memcg->css, nr_pages);
2556 static void lock_page_lru(struct page *page, int *isolated)
2558 pg_data_t *pgdat = page_pgdat(page);
2560 spin_lock_irq(&pgdat->lru_lock);
2561 if (PageLRU(page)) {
2562 struct lruvec *lruvec;
2564 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2566 del_page_from_lru_list(page, lruvec, page_lru(page));
2572 static void unlock_page_lru(struct page *page, int isolated)
2574 pg_data_t *pgdat = page_pgdat(page);
2577 struct lruvec *lruvec;
2579 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2580 VM_BUG_ON_PAGE(PageLRU(page), page);
2582 add_page_to_lru_list(page, lruvec, page_lru(page));
2584 spin_unlock_irq(&pgdat->lru_lock);
2587 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2592 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2595 * In some cases, e.g. SwapCache and FUSE (splice_buf->radixtree), the
2596 * page may already be on some other mem_cgroup's LRU. Take care of it.
2599 lock_page_lru(page, &isolated);
2602 * Nobody should be changing or seriously looking at
2603 * page->mem_cgroup at this point:
2605 * - the page is uncharged
2607 * - the page is off-LRU
2609 * - an anonymous fault has exclusive page access, except for
2610 * a locked page table
2612 * - a page cache insertion, a swapin fault, or a migration
2613 * have the page locked
2615 page->mem_cgroup = memcg;
2618 unlock_page_lru(page, isolated);
2621 #ifdef CONFIG_MEMCG_KMEM
2622 static int memcg_alloc_cache_id(void)
2627 id = ida_simple_get(&memcg_cache_ida,
2628 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2632 if (id < memcg_nr_cache_ids)
2636 * There's no space for the new id in memcg_caches arrays,
2637 * so we have to grow them.
2639 down_write(&memcg_cache_ids_sem);
2641 size = 2 * (id + 1);
2642 if (size < MEMCG_CACHES_MIN_SIZE)
2643 size = MEMCG_CACHES_MIN_SIZE;
2644 else if (size > MEMCG_CACHES_MAX_SIZE)
2645 size = MEMCG_CACHES_MAX_SIZE;
2647 err = memcg_update_all_caches(size);
2649 err = memcg_update_all_list_lrus(size);
2651 memcg_nr_cache_ids = size;
2653 up_write(&memcg_cache_ids_sem);
2656 ida_simple_remove(&memcg_cache_ida, id);
2662 static void memcg_free_cache_id(int id)
2664 ida_simple_remove(&memcg_cache_ida, id);
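/*
 * Worked example for memcg_alloc_cache_id() above (illustrative numbers):
 * if memcg_nr_cache_ids is currently 16 and ida_simple_get() hands out
 * id 16, the arrays are too small, so size = 2 * (16 + 1) = 34, clamped
 * to the [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE] range, and both
 * the per-cache memcg_caches arrays and the list_lru arrays are grown to
 * that size before memcg_nr_cache_ids is updated.
 */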
2667 struct memcg_kmem_cache_create_work {
2668 struct mem_cgroup *memcg;
2669 struct kmem_cache *cachep;
2670 struct work_struct work;
2673 static void memcg_kmem_cache_create_func(struct work_struct *w)
2675 struct memcg_kmem_cache_create_work *cw =
2676 container_of(w, struct memcg_kmem_cache_create_work, work);
2677 struct mem_cgroup *memcg = cw->memcg;
2678 struct kmem_cache *cachep = cw->cachep;
2680 memcg_create_kmem_cache(memcg, cachep);
2682 css_put(&memcg->css);
2687 * Enqueue the creation of a per-memcg kmem_cache.
2689 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2690 struct kmem_cache *cachep)
2692 struct memcg_kmem_cache_create_work *cw;
2694 if (!css_tryget_online(&memcg->css))
2697 cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2702 cw->cachep = cachep;
2703 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2705 queue_work(memcg_kmem_cache_wq, &cw->work);
2708 static inline bool memcg_kmem_bypass(void)
2710 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2716 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2717 * @cachep: the original global kmem cache
2719 * Return the kmem_cache we're supposed to use for a slab allocation.
2720 * We try to use the current memcg's version of the cache.
2722 * If the cache does not exist yet, if we are the first user of it, we
2723 * create it asynchronously in a workqueue and let the current allocation
2724 * go through with the original cache.
2726 * This function takes a reference to the cache it returns, to ensure it
2727 * won't get destroyed while we are working with it. Once the caller is
2728 * done with it, memcg_kmem_put_cache() must be called to release the reference.
2731 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2733 struct mem_cgroup *memcg;
2734 struct kmem_cache *memcg_cachep;
2735 struct memcg_cache_array *arr;
2738 VM_BUG_ON(!is_root_cache(cachep));
2740 if (memcg_kmem_bypass())
2745 if (unlikely(current->active_memcg))
2746 memcg = current->active_memcg;
2748 memcg = mem_cgroup_from_task(current);
2750 if (!memcg || memcg == root_mem_cgroup)
2753 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2757 arr = rcu_dereference(cachep->memcg_params.memcg_caches);
2760 * Make sure we will access the up-to-date value. The code updating
2761 * memcg_caches issues a write barrier to match the data dependency
2762 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
2764 memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
2767 * If we are in a safe context (can wait, and not in interrupt
2768 * context), we could be predictable and return right away.
2769 * This would guarantee that the allocation being performed
2770 * already belongs in the new cache.
2772 * However, there are some clashes that can arise from locking.
2773 * For instance, because we acquire the slab_mutex while doing
2774 * memcg_create_kmem_cache, no further allocation could happen
2775 * with the slab_mutex held. So it's better to defer everything.
2778 * If the memcg is dying or the memcg_cache is about to be released,
2779 * don't bother creating new kmem_caches. Because memcg_cachep
2780 * is zeroed as the first step of kmem offlining, we don't need
2781 * percpu_ref_tryget_live() here. The css_tryget_online() check in
2782 * memcg_schedule_kmem_cache_create() will prevent the creation
2783 * of a new kmem_cache.
2785 if (unlikely(!memcg_cachep))
2786 memcg_schedule_kmem_cache_create(memcg, cachep);
2787 else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
2788 cachep = memcg_cachep;
2795 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2796 * @cachep: the cache returned by memcg_kmem_get_cache
2798 void memcg_kmem_put_cache(struct kmem_cache *cachep)
2800 if (!is_root_cache(cachep))
2801 percpu_ref_put(&cachep->memcg_params.refcnt);
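/*
 * A hedged sketch of how the get/put pair above is used by the slab
 * allocator's pre/post allocation hooks (mm/slab.h); this is a
 * simplification, not the exact upstream code:
 *
 *	if (memcg_kmem_enabled() &&
 *	    ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
 *		s = memcg_kmem_get_cache(s);
 *	... allocate the object from s ...
 *	memcg_kmem_put_cache(s);
 *
 * memcg_kmem_put_cache() only drops the reference for non-root caches,
 * so the pairing stays balanced even when memcg_kmem_get_cache() fell
 * back to the original root cache.
 */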
2805 * __memcg_kmem_charge_memcg: charge a kmem page
2806 * @page: page to charge
2807 * @gfp: reclaim mode
2808 * @order: allocation order
2809 * @memcg: memory cgroup to charge
2811 * Returns 0 on success, an error code on failure.
2813 int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2814 struct mem_cgroup *memcg)
2816 unsigned int nr_pages = 1 << order;
2817 struct page_counter *counter;
2820 ret = try_charge(memcg, gfp, nr_pages);
2824 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2825 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2826 cancel_charge(memcg, nr_pages);
2833 * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
2834 * @page: page to charge
2835 * @gfp: reclaim mode
2836 * @order: allocation order
2838 * Returns 0 on success, an error code on failure.
2840 int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2842 struct mem_cgroup *memcg;
2845 if (memcg_kmem_bypass())
2848 memcg = get_mem_cgroup_from_current();
2849 if (!mem_cgroup_is_root(memcg)) {
2850 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
2852 page->mem_cgroup = memcg;
2853 __SetPageKmemcg(page);
2856 css_put(&memcg->css);
2861 * __memcg_kmem_uncharge_memcg: uncharge kmem pages from a memcg
2862 * @memcg: memcg to uncharge
2863 * @nr_pages: number of pages to uncharge
2865 void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
2866 unsigned int nr_pages)
2868 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2869 page_counter_uncharge(&memcg->kmem, nr_pages);
2871 page_counter_uncharge(&memcg->memory, nr_pages);
2872 if (do_memsw_account())
2873 page_counter_uncharge(&memcg->memsw, nr_pages);
2876 * __memcg_kmem_uncharge: uncharge a kmem page
2877 * @page: page to uncharge
2878 * @order: allocation order
2880 void __memcg_kmem_uncharge(struct page *page, int order)
2882 struct mem_cgroup *memcg = page->mem_cgroup;
2883 unsigned int nr_pages = 1 << order;
2888 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2889 __memcg_kmem_uncharge_memcg(memcg, nr_pages);
2890 page->mem_cgroup = NULL;
2892 /* slab pages do not have PageKmemcg flag set */
2893 if (PageKmemcg(page))
2894 __ClearPageKmemcg(page);
2896 css_put_many(&memcg->css, nr_pages);
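/*
 * A hedged sketch of what the page allocation and freeing paths do
 * internally with these two entry points for accounted allocations
 * (simplified, not a literal copy of the upstream code):
 *
 *	page = allocate 1 << order pages;
 *	if (page && memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) &&
 *	    __memcg_kmem_charge(page, gfp, order)) {
 *		__free_pages(page, order);
 *		page = NULL;
 *	}
 *	...
 *	if (PageKmemcg(page))		// on the freeing path
 *		__memcg_kmem_uncharge(page, order);
 *
 * An order-2 allocation thus charges 1 << 2 = 4 pages to memcg->memory
 * (and to memcg->kmem on cgroup v1), and uncharges the same amount when
 * the pages are freed.
 */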
2898 #endif /* CONFIG_MEMCG_KMEM */
2900 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2903 * Tail pages are not charged individually, so copy the head page's
2904 * mem_cgroup to them here. We're under pgdat->lru_lock, and migration entries are set up in all page mappings.
2906 void mem_cgroup_split_huge_fixup(struct page *head)
2910 if (mem_cgroup_disabled())
2913 for (i = 1; i < HPAGE_PMD_NR; i++)
2914 head[i].mem_cgroup = head->mem_cgroup;
2916 __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
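/*
 * Worked example: with 2MB THPs on x86-64, HPAGE_PMD_NR is 512, so a
 * split copies head->mem_cgroup into the 511 tail pages and subtracts
 * 512 from the memcg's MEMCG_RSS_HUGE counter. The pages remain charged
 * to the same memcg; only the huge-page portion of the statistics moves.
 */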
2918 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2920 #ifdef CONFIG_MEMCG_SWAP
2922 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2923 * @entry: swap entry to be moved
2924 * @from: mem_cgroup which the entry is moved from
2925 * @to: mem_cgroup which the entry is moved to
2927 * It succeeds only when the swap_cgroup's record for this entry is the same
2928 * as the mem_cgroup id of @from.
2930 * Returns 0 on success, -EINVAL on failure.
2932 * The caller must have charged to @to, i.e. called page_counter_charge() on
2933 * both the memory and memsw counters, and called css_get().
2935 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2936 struct mem_cgroup *from, struct mem_cgroup *to)
2938 unsigned short old_id, new_id;
2940 old_id = mem_cgroup_id(from);
2941 new_id = mem_cgroup_id(to);
2943 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2944 mod_memcg_state(from, MEMCG_SWAP, -1);
2945 mod_memcg_state(to, MEMCG_SWAP, 1);
2951 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2952 struct mem_cgroup *from, struct mem_cgroup *to)
2958 static DEFINE_MUTEX(memcg_max_mutex);
2960 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
2961 unsigned long max, bool memsw)
2963 bool enlarge = false;
2964 bool drained = false;
2966 bool limits_invariant;
2967 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2970 if (signal_pending(current)) {
2975 mutex_lock(&memcg_max_mutex);
2977 * Make sure that the new limit (memsw or memory limit) doesn't
2978 * break our basic invariant rule memory.max <= memsw.max.
2980 limits_invariant = memsw ? max >= memcg->memory.max :
2981 max <= memcg->memsw.max;
2982 if (!limits_invariant) {
2983 mutex_unlock(&memcg_max_mutex);
2987 if (max > counter->max)
2989 ret = page_counter_set_max(counter, max);
2990 mutex_unlock(&memcg_max_mutex);
2996 drain_all_stock(memcg);
3001 if (!try_to_free_mem_cgroup_pages(memcg, 1,
3002 GFP_KERNEL, !memsw)) {
3008 if (!ret && enlarge)
3009 memcg_oom_recover(memcg);
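/*
 * Worked example of the invariant above (illustrative values): with
 * memory.max at 1G and memsw.max at 2G, raising memory.max to 3G or
 * lowering memsw.max to 512M is rejected before any reclaim is
 * attempted, because either change would end up with
 * memory.max > memsw.max.
 */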
3014 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3016 unsigned long *total_scanned)
3018 unsigned long nr_reclaimed = 0;
3019 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3020 unsigned long reclaimed;
3022 struct mem_cgroup_tree_per_node *mctz;
3023 unsigned long excess;
3024 unsigned long nr_scanned;
3029 mctz = soft_limit_tree_node(pgdat->node_id);
3032 * Do not even bother to check the largest node if the root
3033 * is empty. Do it lockless to prevent lock bouncing. Races
3034 * are acceptable as soft limit is best effort anyway.
3036 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3040 * This loop can run for a while, especially if mem_cgroups continuously
3041 * keep exceeding their soft limit and putting the system under pressure.
3048 mz = mem_cgroup_largest_soft_limit_node(mctz);
3053 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3054 gfp_mask, &nr_scanned);
3055 nr_reclaimed += reclaimed;
3056 *total_scanned += nr_scanned;
3057 spin_lock_irq(&mctz->lock);
3058 __mem_cgroup_remove_exceeded(mz, mctz);
3061 * If we failed to reclaim anything from this memory cgroup
3062 * it is time to move on to the next cgroup
3066 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3068 excess = soft_limit_excess(mz->memcg);
3070 * One school of thought says that we should not add
3071 * the node back to the tree if reclaim returns 0.
3072 * But our reclaim could return 0 simply because, due
3073 * to priority, we are exposing a smaller subset of
3074 * memory to reclaim from. Consider this a longer-term strategy rather than a quick fix.
3077 /* If excess == 0, no tree ops */
3078 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3079 spin_unlock_irq(&mctz->lock);
3080 css_put(&mz->memcg->css);
3083 * Could not reclaim anything and there are no more
3084 * mem cgroups to try or we seem to be looping without
3085 * reclaiming anything.
3087 if (!nr_reclaimed &&
3089 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3091 } while (!nr_reclaimed);
3093 css_put(&next_mz->memcg->css);
3094 return nr_reclaimed;
3098 * Test whether @memcg has children, dead or alive. Note that this
3099 * function doesn't care whether @memcg has use_hierarchy enabled; it
3100 * returns %true if there are child csses according to the cgroup
3101 * hierarchy. Testing use_hierarchy is the caller's responsibility.
3103 static inline bool memcg_has_children(struct mem_cgroup *memcg)
3108 ret = css_next_child(NULL, &memcg->css);
3114 * Reclaims as many pages from the given memcg as possible.
3116 * Caller is responsible for holding css reference for memcg.
3118 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3120 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3122 /* we call try-to-free pages to make this cgroup empty */
3123 lru_add_drain_all();
3125 drain_all_stock(memcg);
3127 /* try to free all pages in this cgroup */
3128 while (nr_retries && page_counter_read(&memcg->memory)) {
3131 if (signal_pending(current))
3134 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3138 /* maybe some writeback is necessary */
3139 congestion_wait(BLK_RW_ASYNC, HZ/10);
3147 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3148 char *buf, size_t nbytes,
3151 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3153 if (mem_cgroup_is_root(memcg))
3155 return mem_cgroup_force_empty(memcg) ?: nbytes;
3158 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3161 return mem_cgroup_from_css(css)->use_hierarchy;
3164 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3165 struct cftype *cft, u64 val)
3168 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3169 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3171 if (memcg->use_hierarchy == val)
3175 * If the parent's use_hierarchy is set, we can't make any modifications
3176 * in the child subtrees. If it is unset, then the change can
3177 * occur, provided the current cgroup has no children.
3179 * For the root cgroup, parent_memcg is NULL; we allow the value to be
3180 * set if there are no children.
3182 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3183 (val == 1 || val == 0)) {
3184 if (!memcg_has_children(memcg))
3185 memcg->use_hierarchy = val;
3194 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3198 if (mem_cgroup_is_root(memcg)) {
3199 val = memcg_page_state(memcg, MEMCG_CACHE) +
3200 memcg_page_state(memcg, MEMCG_RSS);
3202 val += memcg_page_state(memcg, MEMCG_SWAP);
3205 val = page_counter_read(&memcg->memory);
3207 val = page_counter_read(&memcg->memsw);
3220 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3223 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3224 struct page_counter *counter;
3226 switch (MEMFILE_TYPE(cft->private)) {
3228 counter = &memcg->memory;
3231 counter = &memcg->memsw;
3234 counter = &memcg->kmem;
3237 counter = &memcg->tcpmem;
3243 switch (MEMFILE_ATTR(cft->private)) {
3245 if (counter == &memcg->memory)
3246 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3247 if (counter == &memcg->memsw)
3248 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3249 return (u64)page_counter_read(counter) * PAGE_SIZE;
3251 return (u64)counter->max * PAGE_SIZE;
3253 return (u64)counter->watermark * PAGE_SIZE;
3255 return counter->failcnt;
3256 case RES_SOFT_LIMIT:
3257 return (u64)memcg->soft_limit * PAGE_SIZE;
3263 static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3265 unsigned long stat[MEMCG_NR_STAT];
3266 struct mem_cgroup *mi;
3269 for (i = 0; i < MEMCG_NR_STAT; i++)
3272 for_each_online_cpu(cpu)
3273 for (i = 0; i < MEMCG_NR_STAT; i++)
3274 stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
3276 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3277 for (i = 0; i < MEMCG_NR_STAT; i++)
3278 atomic_long_add(stat[i], &mi->vmstats[i]);
3280 for_each_node(node) {
3281 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3282 struct mem_cgroup_per_node *pi;
3284 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3287 for_each_online_cpu(cpu)
3288 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3289 stat[i] += raw_cpu_read(
3290 pn->lruvec_stat_cpu->count[i]);
3292 for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3293 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3294 atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3298 static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3300 unsigned long events[NR_VM_EVENT_ITEMS];
3301 struct mem_cgroup *mi;
3304 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3307 for_each_online_cpu(cpu)
3308 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3309 events[i] += raw_cpu_read(
3310 memcg->vmstats_percpu->events[i]);
3312 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3313 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3314 atomic_long_add(events[i], &mi->vmevents[i]);
3317 #ifdef CONFIG_MEMCG_KMEM
3318 static int memcg_online_kmem(struct mem_cgroup *memcg)
3322 if (cgroup_memory_nokmem)
3325 BUG_ON(memcg->kmemcg_id >= 0);
3326 BUG_ON(memcg->kmem_state);
3328 memcg_id = memcg_alloc_cache_id();
3332 static_branch_inc(&memcg_kmem_enabled_key);
3334 * A memory cgroup is considered kmem-online as soon as it gets
3335 * kmemcg_id. Setting the id after enabling static branching will
3336 * guarantee no one starts accounting before all call sites are patched.
3339 memcg->kmemcg_id = memcg_id;
3340 memcg->kmem_state = KMEM_ONLINE;
3341 INIT_LIST_HEAD(&memcg->kmem_caches);
3346 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3348 struct cgroup_subsys_state *css;
3349 struct mem_cgroup *parent, *child;
3352 if (memcg->kmem_state != KMEM_ONLINE)
3355 * Clear the online state before clearing memcg_caches array
3356 * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3357 * guarantees that no cache will be created for this cgroup
3358 * after we are done (see memcg_create_kmem_cache()).
3360 memcg->kmem_state = KMEM_ALLOCATED;
3362 parent = parent_mem_cgroup(memcg);
3364 parent = root_mem_cgroup;
3366 memcg_deactivate_kmem_caches(memcg, parent);
3368 kmemcg_id = memcg->kmemcg_id;
3369 BUG_ON(kmemcg_id < 0);
3372 * Change kmemcg_id of this cgroup and all its descendants to the
3373 * parent's id, and then move all entries from this cgroup's list_lrus
3374 * to ones of the parent. After we have finished, all list_lrus
3375 * corresponding to this cgroup are guaranteed to remain empty. The
3376 * ordering is imposed by list_lru_node->lock taken by
3377 * memcg_drain_all_list_lrus().
3379 rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
3380 css_for_each_descendant_pre(css, &memcg->css) {
3381 child = mem_cgroup_from_css(css);
3382 BUG_ON(child->kmemcg_id != kmemcg_id);
3383 child->kmemcg_id = parent->kmemcg_id;
3384 if (!memcg->use_hierarchy)
3389 memcg_drain_all_list_lrus(kmemcg_id, parent);
3391 memcg_free_cache_id(kmemcg_id);
3394 static void memcg_free_kmem(struct mem_cgroup *memcg)
3396 /* css_alloc() failed, offlining didn't happen */
3397 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3398 memcg_offline_kmem(memcg);
3400 if (memcg->kmem_state == KMEM_ALLOCATED) {
3401 WARN_ON(!list_empty(&memcg->kmem_caches));
3402 static_branch_dec(&memcg_kmem_enabled_key);
3406 static int memcg_online_kmem(struct mem_cgroup *memcg)
3410 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3413 static void memcg_free_kmem(struct mem_cgroup *memcg)
3416 #endif /* CONFIG_MEMCG_KMEM */