memcg: fix hierarchical reclaim
[sfrench/cifs-2.6.git] / mm / memcontrol.c
index e2996b80601f8fd20a3bb9d1216ebf3017d11afa..322625f551c27504c89d3f0f743bd58da32f909f 100644 (file)
@@ -358,6 +358,10 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
                return;
 
        pc = lookup_page_cgroup(page);
+       /*
+        * Used bit is set without atomic ops but after smp_wmb().
+        * For making pc->mem_cgroup visible, insert smp_rmb() here.
+        */
        smp_rmb();
        /* unused page is not rotated. */
        if (!PageCgroupUsed(pc))
@@ -374,7 +378,10 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
        if (mem_cgroup_disabled())
                return;
        pc = lookup_page_cgroup(page);
-       /* barrier to sync with "charge" */
+       /*
+        * Used bit is set without atomic ops but after smp_wmb().
+        * For making pc->mem_cgroup visible, insert smp_rmb() here.
+        */
        smp_rmb();
        if (!PageCgroupUsed(pc))
                return;
@@ -559,6 +566,14 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
                return NULL;
 
        pc = lookup_page_cgroup(page);
+       /*
+        * Used bit is set without atomic ops but after smp_wmb().
+        * For making pc->mem_cgroup visible, insert smp_rmb() here.
+        */
+       smp_rmb();
+       if (!PageCgroupUsed(pc))
+               return NULL;
+
        mz = page_cgroup_zoneinfo(pc);
        if (!mz)
                return NULL;
@@ -618,7 +633,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  * called with hierarchy_mutex held
  */
 static struct mem_cgroup *
-mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
+__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
 {
        struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
 
@@ -629,19 +644,16 @@ mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
                /*
                 * Walk down to children
                 */
-               mem_cgroup_put(curr);
                cgroup = list_entry(curr_cgroup->children.next,
                                                struct cgroup, sibling);
                curr = mem_cgroup_from_cont(cgroup);
-               mem_cgroup_get(curr);
                goto done;
        }
 
 visit_parent:
        if (curr_cgroup == root_cgroup) {
-               mem_cgroup_put(curr);
-               curr = root_mem;
-               mem_cgroup_get(curr);
+               /* caller handles NULL case */
+               curr = NULL;
                goto done;
        }
 
@@ -649,11 +661,9 @@ visit_parent:
         * Goto next sibling
         */
        if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
-               mem_cgroup_put(curr);
                cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
                                                sibling);
                curr = mem_cgroup_from_cont(cgroup);
-               mem_cgroup_get(curr);
                goto done;
        }
 
@@ -664,7 +674,6 @@ visit_parent:
        goto visit_parent;
 
 done:
-       root_mem->last_scanned_child = curr;
        return curr;
 }
 
@@ -674,40 +683,46 @@ done:
  * that to reclaim free pages from.
  */
 static struct mem_cgroup *
-mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
+mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
 {
        struct cgroup *cgroup;
-       struct mem_cgroup *ret;
+       struct mem_cgroup *orig, *next;
        bool obsolete;
 
-       obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child);
-
        /*
         * Scan all children under the mem_cgroup mem
         */
        mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
+
+       orig = root_mem->last_scanned_child;
+       obsolete = mem_cgroup_is_obsolete(orig);
+
        if (list_empty(&root_mem->css.cgroup->children)) {
-               ret = root_mem;
+               /*
+                * root_mem might have children before and last_scanned_child
+                * may point to one of them. We put it later.
+                */
+               if (orig)
+                       VM_BUG_ON(!obsolete);
+               next = NULL;
                goto done;
        }
 
-       if (!root_mem->last_scanned_child || obsolete) {
-
-               if (obsolete && root_mem->last_scanned_child)
-                       mem_cgroup_put(root_mem->last_scanned_child);
-
+       if (!orig || obsolete) {
                cgroup = list_first_entry(&root_mem->css.cgroup->children,
                                struct cgroup, sibling);
-               ret = mem_cgroup_from_cont(cgroup);
-               mem_cgroup_get(ret);
+               next = mem_cgroup_from_cont(cgroup);
        } else
-               ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
-                                               root_mem);
+               next = __mem_cgroup_get_next_node(orig, root_mem);
 
 done:
-       root_mem->last_scanned_child = ret;
+       if (next)
+               mem_cgroup_get(next);
+       root_mem->last_scanned_child = next;
+       if (orig)
+               mem_cgroup_put(orig);
        mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
-       return ret;
+       return (next) ? next : root_mem;
 }
 
 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
@@ -765,21 +780,18 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
        if (!root_mem->use_hierarchy)
                return ret;
 
-       next_mem = mem_cgroup_get_first_node(root_mem);
+       next_mem = mem_cgroup_get_next_node(root_mem);
 
        while (next_mem != root_mem) {
                if (mem_cgroup_is_obsolete(next_mem)) {
-                       mem_cgroup_put(next_mem);
-                       next_mem = mem_cgroup_get_first_node(root_mem);
+                       next_mem = mem_cgroup_get_next_node(root_mem);
                        continue;
                }
                ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
                                                   get_swappiness(next_mem));
                if (mem_cgroup_check_under_limit(root_mem))
                        return 0;
-               mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
-               next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
-               mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
+               next_mem = mem_cgroup_get_next_node(root_mem);
        }
        return ret;
 }
@@ -979,14 +991,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
        if (pc->mem_cgroup != from)
                goto out;
 
-       css_put(&from->css);
        res_counter_uncharge(&from->res, PAGE_SIZE);
        mem_cgroup_charge_statistics(from, pc, false);
        if (do_swap_account)
                res_counter_uncharge(&from->memsw, PAGE_SIZE);
+       css_put(&from->css);
+
+       css_get(&to->css);
        pc->mem_cgroup = to;
        mem_cgroup_charge_statistics(to, pc, true);
-       css_get(&to->css);
        ret = 0;
 out:
        unlock_page_cgroup(pc);
@@ -1019,8 +1032,10 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
        if (ret || !parent)
                return ret;
 
-       if (!get_page_unless_zero(page))
-               return -EBUSY;
+       if (!get_page_unless_zero(page)) {
+               ret = -EBUSY;
+               goto uncharge;
+       }
 
        ret = isolate_lru_page(page);
 
@@ -1029,19 +1044,23 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 
        ret = mem_cgroup_move_account(pc, child, parent);
 
-       /* drop extra refcnt by try_charge() (move_account increment one) */
-       css_put(&parent->css);
        putback_lru_page(page);
        if (!ret) {
                put_page(page);
+               /* drop extra refcnt by try_charge() */
+               css_put(&parent->css);
                return 0;
        }
-       /* uncharge if move fails */
+
 cancel:
+       put_page(page);
+uncharge:
+       /* drop extra refcnt by try_charge() */
+       css_put(&parent->css);
+       /* uncharge if move fails */
        res_counter_uncharge(&parent->res, PAGE_SIZE);
        if (do_swap_account)
                res_counter_uncharge(&parent->memsw, PAGE_SIZE);
-       put_page(page);
        return ret;
 }
 
@@ -2232,7 +2251,14 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
 {
-       mem_cgroup_put(mem_cgroup_from_cont(cont));
+       struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
+
+       if (last_scanned_child) {
+               VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
+               mem_cgroup_put(last_scanned_child);
+       }
+       mem_cgroup_put(mem);
 }
 
 static int mem_cgroup_populate(struct cgroup_subsys *ss,