diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 69966c461d1c1164c155f3c5ff2ee9d4c7220911..8820468635810a3d801a5dd52beca94a7a14b035 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -108,7 +108,10 @@ struct lruvec_stat {
  */
 struct mem_cgroup_per_node {
        struct lruvec           lruvec;
-       struct lruvec_stat __percpu *lruvec_stat;
+
+       struct lruvec_stat __percpu *lruvec_stat_cpu;
+       atomic_long_t           lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+
        unsigned long           lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
        struct mem_cgroup_reclaim_iter  iter[DEF_PRIORITY + 1];
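The hunk above splits the per-node statistics in two: lruvec_stat_cpu keeps a small per-CPU cache of recent deltas, and the new lruvec_stat array holds the flushed atomic totals that readers look at. A rough standalone model of that layout follows; the struct and field names, the fixed CPU count, and the use of C11 atomics are assumptions of the sketch, not kernel code.

/*
 * Userspace model of the per-node counter layout above: a per-CPU delta
 * cache paired with one shared atomic total per statistic.  Sketch only.
 */
#include <stdatomic.h>

#define MODEL_NR_CPUS   4     /* stands in for the set of possible CPUs */
#define MODEL_NR_STATS  8     /* stands in for NR_VM_NODE_STAT_ITEMS */

struct model_stat_cpu {
        long count[MODEL_NR_STATS];               /* unflushed per-CPU deltas */
};

struct model_per_node {
        struct model_stat_cpu cpu[MODEL_NR_CPUS]; /* like the __percpu cache */
        atomic_long total[MODEL_NR_STATS];        /* like atomic_long_t lruvec_stat[] */
};
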
@@ -227,10 +230,10 @@ struct mem_cgroup {
        spinlock_t              move_lock;
        struct task_struct      *move_lock_task;
        unsigned long           move_lock_flags;
-       /*
-        * percpu counter.
-        */
-       struct mem_cgroup_stat_cpu __percpu *stat;
+
+       struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+       atomic_long_t           stat[MEMCG_NR_STAT];
+       atomic_long_t           events[MEMCG_NR_EVENTS];
 
        unsigned long           socket_pressure;
 
@@ -265,6 +268,12 @@ struct mem_cgroup {
        /* WARNING: nodeinfo must be the last member here */
 };
 
+/*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: maybe necessary to use big numbers in big irons.
+ */
+#define MEMCG_CHARGE_BATCH 32U
+
 extern struct mem_cgroup *root_mem_cgroup;
 
 static inline bool mem_cgroup_disabled(void)
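MEMCG_CHARGE_BATCH above also bounds how stale a counter can look: after every update a CPU's cached delta holds at most the batch size, so a read of the flushed total can lag by roughly batch * nr_cpus items per counter. A back-of-envelope illustration, assuming an example 64-CPU machine and 4 KiB pages (both assumptions, not values taken from this patch):

/*
 * Worst-case drift implied by the batch size above.  Standalone sketch;
 * the CPU count and page size are example values, not kernel constants.
 */
#include <stdio.h>

int main(void)
{
        unsigned long batch = 32;        /* MEMCG_CHARGE_BATCH */
        unsigned long cpus  = 64;        /* example machine */
        unsigned long page_size = 4096;

        /* each CPU caches at most 'batch' items per counter before flushing */
        printf("max drift: %lu pages (%lu KiB)\n",
               batch * cpus, batch * cpus * page_size / 1024);
        return 0;
}

For page-based counters that works out to about 8 MiB of worst-case drift on such a machine, which is roughly the trade the batching makes for avoiding a contended atomic on every update.
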
@@ -272,13 +281,6 @@ static inline bool mem_cgroup_disabled(void)
        return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
-static inline void mem_cgroup_event(struct mem_cgroup *memcg,
-                                   enum memcg_event_item event)
-{
-       this_cpu_inc(memcg->stat->events[event]);
-       cgroup_file_notify(&memcg->events_file);
-}
-
 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
@@ -492,32 +494,38 @@ void unlock_page_memcg(struct page *page);
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
                                             int idx)
 {
-       long val = 0;
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               val += per_cpu(memcg->stat->count[idx], cpu);
-
-       if (val < 0)
-               val = 0;
-
-       return val;
+       long x = atomic_long_read(&memcg->stat[idx]);
+#ifdef CONFIG_SMP
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __mod_memcg_state(struct mem_cgroup *memcg,
                                     int idx, int val)
 {
-       if (!mem_cgroup_disabled())
-               __this_cpu_add(memcg->stat->count[idx], val);
+       long x;
+
+       if (mem_cgroup_disabled())
+               return;
+
+       x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);
+       if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+               atomic_long_add(x, &memcg->stat[idx]);
+               x = 0;
+       }
+       __this_cpu_write(memcg->stat_cpu->count[idx], x);
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   int idx, int val)
 {
-       if (!mem_cgroup_disabled())
-               this_cpu_add(memcg->stat->count[idx], val);
+       preempt_disable();
+       __mod_memcg_state(memcg, idx, val);
+       preempt_enable();
 }
 
 /**
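The three helpers above carry the core of the change: __mod_memcg_state() accumulates deltas in the per-CPU cache and only touches the shared atomic once the cached value exceeds MEMCG_CHARGE_BATCH, and memcg_page_state() now reads one atomic instead of walking every possible CPU. Below is a standalone userspace model of that pattern; the names (mod_state, read_state, cpu_cache), the fixed CPU count and the C11 atomics are assumptions of the sketch, not kernel API.

/*
 * Standalone model of the batched counters added above.  A per-"CPU"
 * cache absorbs small deltas and is flushed into a shared atomic total
 * once the cached value exceeds the batch size.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define BATCH    32                   /* mirrors MEMCG_CHARGE_BATCH */
#define NR_CPUS  4

static long cpu_cache[NR_CPUS];       /* stands in for the __percpu cache */
static atomic_long total;             /* stands in for the atomic_long_t total */

/* Writer side: mirrors the add/compare/flush pattern in __mod_memcg_state() */
static void mod_state(int cpu, long val)
{
        long x = val + cpu_cache[cpu];

        if (labs(x) > BATCH) {
                atomic_fetch_add(&total, x);
                x = 0;
        }
        cpu_cache[cpu] = x;
}

/* Reader side: only the flushed total is read; clamp transient negatives */
static long read_state(void)
{
        long x = atomic_load(&total);

        return x < 0 ? 0 : x;
}

int main(void)
{
        /* 40 increments of 1 on CPU 0: one flush of 33, 7 still cached */
        for (int i = 0; i < 40; i++)
                mod_state(0, 1);

        printf("total=%ld cached=%ld\n", read_state(), cpu_cache[0]);
        return 0;
}

Running it does 40 single-item increments on one model CPU: a single flush moves 33 into the shared total and 7 stay cached, which is exactly the bounded lag the batch size trades for far fewer contended atomic operations.
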
@@ -555,87 +563,108 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
 {
        struct mem_cgroup_per_node *pn;
-       long val = 0;
-       int cpu;
+       long x;
 
        if (mem_cgroup_disabled())
                return node_page_state(lruvec_pgdat(lruvec), idx);
 
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-       for_each_possible_cpu(cpu)
-               val += per_cpu(pn->lruvec_stat->count[idx], cpu);
-
-       if (val < 0)
-               val = 0;
-
-       return val;
+       x = atomic_long_read(&pn->lruvec_stat[idx]);
+#ifdef CONFIG_SMP
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
 }
 
 static inline void __mod_lruvec_state(struct lruvec *lruvec,
                                      enum node_stat_item idx, int val)
 {
        struct mem_cgroup_per_node *pn;
+       long x;
 
+       /* Update node */
        __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
        if (mem_cgroup_disabled())
                return;
+
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+
+       /* Update memcg */
        __mod_memcg_state(pn->memcg, idx, val);
-       __this_cpu_add(pn->lruvec_stat->count[idx], val);
+
+       /* Update lruvec */
+       x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+       if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+               atomic_long_add(x, &pn->lruvec_stat[idx]);
+               x = 0;
+       }
+       __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 }
 
 static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
 {
-       struct mem_cgroup_per_node *pn;
-
-       mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
-       if (mem_cgroup_disabled())
-               return;
-       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-       mod_memcg_state(pn->memcg, idx, val);
-       this_cpu_add(pn->lruvec_stat->count[idx], val);
+       preempt_disable();
+       __mod_lruvec_state(lruvec, idx, val);
+       preempt_enable();
 }
 
 static inline void __mod_lruvec_page_state(struct page *page,
                                           enum node_stat_item idx, int val)
 {
-       struct mem_cgroup_per_node *pn;
+       pg_data_t *pgdat = page_pgdat(page);
+       struct lruvec *lruvec;
 
-       __mod_node_page_state(page_pgdat(page), idx, val);
-       if (mem_cgroup_disabled() || !page->mem_cgroup)
+       /* Untracked pages have no memcg, no lruvec. Update only the node */
+       if (!page->mem_cgroup) {
+               __mod_node_page_state(pgdat, idx, val);
                return;
-       __mod_memcg_state(page->mem_cgroup, idx, val);
-       pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
-       __this_cpu_add(pn->lruvec_stat->count[idx], val);
+       }
+
+       lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup);
+       __mod_lruvec_state(lruvec, idx, val);
 }
 
 static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
 {
-       struct mem_cgroup_per_node *pn;
-
-       mod_node_page_state(page_pgdat(page), idx, val);
-       if (mem_cgroup_disabled() || !page->mem_cgroup)
-               return;
-       mod_memcg_state(page->mem_cgroup, idx, val);
-       pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
-       this_cpu_add(pn->lruvec_stat->count[idx], val);
+       preempt_disable();
+       __mod_lruvec_page_state(page, idx, val);
+       preempt_enable();
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                                gfp_t gfp_mask,
                                                unsigned long *total_scanned);
 
+/* idx can be of type enum memcg_event_item or vm_event_item */
+static inline void __count_memcg_events(struct mem_cgroup *memcg,
+                                       int idx, unsigned long count)
+{
+       unsigned long x;
+
+       if (mem_cgroup_disabled())
+               return;
+
+       x = count + __this_cpu_read(memcg->stat_cpu->events[idx]);
+       if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+               atomic_long_add(x, &memcg->events[idx]);
+               x = 0;
+       }
+       __this_cpu_write(memcg->stat_cpu->events[idx], x);
+}
+
 static inline void count_memcg_events(struct mem_cgroup *memcg,
-                                     enum vm_event_item idx,
-                                     unsigned long count)
+                                     int idx, unsigned long count)
 {
-       if (!mem_cgroup_disabled())
-               this_cpu_add(memcg->stat->events[idx], count);
+       preempt_disable();
+       __count_memcg_events(memcg, idx, count);
+       preempt_enable();
 }
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
+/* idx can be of type enum memcg_event_item or vm_event_item */
 static inline void count_memcg_page_event(struct page *page,
                                          int idx)
 {
@@ -654,12 +683,20 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
        rcu_read_lock();
        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (likely(memcg)) {
-               this_cpu_inc(memcg->stat->events[idx]);
+               count_memcg_events(memcg, idx, 1);
                if (idx == OOM_KILL)
                        cgroup_file_notify(&memcg->events_file);
        }
        rcu_read_unlock();
 }
+
+static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+                                   enum memcg_event_item event)
+{
+       count_memcg_events(memcg, event, 1);
+       cgroup_file_notify(&memcg->events_file);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
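
The event counters touched above (__count_memcg_events() and its callers, including the reinstated mem_cgroup_event()) use the same flush pattern, but the per-CPU count is unsigned and only grows, so a single upper threshold replaces the abs() check. A minimal standalone sketch of that variant, with assumed names and CPU count:

/*
 * Event-counter variant of the batching scheme: unsigned, monotonic
 * per-CPU counts with a one-sided flush threshold.  Sketch only.
 */
#include <stdatomic.h>

#define EVENT_BATCH  32                        /* mirrors MEMCG_CHARGE_BATCH */
#define NR_CPUS      4

static unsigned long event_cache[NR_CPUS];     /* stands in for the __percpu cache */
static atomic_ulong  event_total;              /* stands in for the atomic events[] */

void count_event(int cpu, unsigned long count)
{
        unsigned long x = count + event_cache[cpu];

        if (x > EVENT_BATCH) {
                atomic_fetch_add(&event_total, x);
                x = 0;
        }
        event_cache[cpu] = x;
}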