From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 3 Apr 2014 20:05:42 +0000 (-0700)
Subject: Merge branch 'for-3.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
X-Git-Tag: v3.15-rc1~109
X-Git-Url: http://git.samba.org/samba.git/?p=sfrench%2Fcifs-2.6.git;a=commitdiff_plain;h=32d01dc7be4e725ab85ce1d74e8f4adc02ad68dd

Merge branch 'for-3.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:
 "A lot of updates for cgroup:

   - The biggest one is cgroup's conversion to kernfs.  cgroup took
     after the long abandoned vfs-entangled sysfs implementation and
     made it even more convoluted over time.  cgroup's internal objects
     were fused with vfs objects which also brought in vfs locking and
     object lifetime rules.  Naturally, there are places where vfs rules
     don't fit and nasty hacks, such as credential switching or lock
     dance interleaving inode mutex and cgroup_mutex with object serial
     number comparison thrown in to decide whether the operation is
     actually necessary, needed to be employed.

     After conversion to kernfs, internal object lifetime and locking
     rules are mostly isolated from vfs interactions allowing shedding
     of several nasty hacks and overall simplification.  This will also
     allow implementation of operations which may affect multiple cgroups
     which weren't possible before as it would have required nesting
     i_mutexes.

   - Various simplifications including dropping of module support,
     easier cgroup name/path handling, simplified cgroup file type
     handling and task_cg_lists optimization.

   - Preparatory changes for the planned unified hierarchy, which is still
     a patchset away from being actually operational.  The dummy
     hierarchy is updated to serve as the default unified hierarchy.
     Controllers which aren't claimed by other hierarchies are
     associated with it, which BTW was what the dummy hierarchy was for
     anyway.

   - Various fixes from Li and others.  This pull request includes some
     patches to add missing slab.h to various subsystems.  This was
     triggered by the removal of the xattr.h include from cgroup.h.
     cgroup.h got indirectly included by a lot of files, which brought
     in xattr.h, which in turn brought in slab.h.

  There are several merge commits - one to pull in kernfs updates
  necessary for converting cgroup (already in upstream through
  driver-core), others for interfering changes in the fixes branch"

* 'for-3.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (74 commits)
  cgroup: remove useless argument from cgroup_exit()
  cgroup: fix spurious lockdep warning in cgroup_exit()
  cgroup: Use RCU_INIT_POINTER(x, NULL) in cgroup.c
  cgroup: break kernfs active_ref protection in cgroup directory operations
  cgroup: fix cgroup_taskset walking order
  cgroup: implement CFTYPE_ONLY_ON_DFL
  cgroup: make cgrp_dfl_root mountable
  cgroup: drop const from @buffer of cftype->write_string()
  cgroup: rename cgroup_dummy_root and related names
  cgroup: move ->subsys_mask from cgroupfs_root to cgroup
  cgroup: treat cgroup_dummy_root as an equivalent hierarchy during rebinding
  cgroup: remove NULL checks from [pr_cont_]cgroup_{name|path}()
  cgroup: use cgroup_setup_root() to initialize cgroup_dummy_root
  cgroup: reorganize cgroup bootstrapping
  cgroup: relocate setting of CGRP_DEAD
  cpuset: use rcu_read_lock() to protect task_cs()
  cgroup_freezer: document freezer_fork() subtleties
  cgroup: update cgroup_transfer_tasks() to either succeed or fail
  cgroup: drop task_lock() protection around task->cgroups
  cgroup: update how a newly forked task gets associated with css_set
  ...
---

32d01dc7be4e725ab85ce1d74e8f4adc02ad68dd
diff --cc kernel/cgroup.c
index 0c753ddd223b,f7f94322d312..fede3d3f28ff
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@@ -1658,80 -1550,72 +1550,72 @@@ retry
  				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
  			}
  		}
- 	}
- 
- 	kfree(opts.release_agent);
- 	kfree(opts.name);
- 	return dget(sb->s_root);
- 
-  rm_base_files:
- 	free_cgrp_cset_links(&tmp_links);
- 	cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
- 	revert_creds(cred);
-  unlock_drop:
- 	cgroup_exit_root_id(root);
- 	mutex_unlock(&cgroup_root_mutex);
- 	mutex_unlock(&cgroup_mutex);
- 	mutex_unlock(&inode->i_mutex);
-  drop_new_super:
- 	deactivate_locked_super(sb);
-  out_err:
- 	kfree(opts.release_agent);
- 	kfree(opts.name);
- 	return ERR_PTR(ret);
- }
  
- static void cgroup_kill_sb(struct super_block *sb)
- {
- 	struct cgroupfs_root *root = sb->s_fs_info;
- 	struct cgroup *cgrp = &root->top_cgroup;
- 	struct cgrp_cset_link *link, *tmp_link;
- 	int ret;
+ 		/*
+ 		 * A root's lifetime is governed by its root cgroup.  Zero
+ 		 * ref indicate that the root is being destroyed.  Wait for
+ 		 * destruction to complete so that the subsystems are free.
+ 		 * We can use wait_queue for the wait but this path is
+ 		 * super cold.  Let's just sleep for a bit and retry.
+ 		 */
+ 		if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
+ 			mutex_unlock(&cgroup_mutex);
+ 			mutex_unlock(&cgroup_tree_mutex);
+ 			kfree(opts.release_agent);
+ 			kfree(opts.name);
+ 			msleep(10);
+ 			goto retry;
+ 		}
  
- 	BUG_ON(!root);
- 
- 	BUG_ON(root->number_of_cgroups != 1);
- 	BUG_ON(!list_empty(&cgrp->children));
- 
- 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
- 	mutex_lock(&cgroup_mutex);
- 	mutex_lock(&cgroup_root_mutex);
- 
- 	/* Rebind all subsystems back to the default hierarchy */
- 	if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
- 		ret = rebind_subsystems(root, 0, root->subsys_mask);
- 		/* Shouldn't be able to fail ... */
- 		BUG_ON(ret);
- 	}
+ 		ret = 0;
+ 		goto out_unlock;
+ 	}
  
  	/*
- 	 * Release all the links from cset_links to this hierarchy's
- 	 * root cgroup
+ 	 * No such thing, create a new one.  name= matching without subsys
+ 	 * specification is allowed for already existing hierarchies but we
+ 	 * can't create new one without subsys specification.
  	 */
- 	write_lock(&css_set_lock);
- 
- 	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
- 		list_del(&link->cset_link);
- 		list_del(&link->cgrp_link);
- 		kfree(link);
+ 	if (!opts.subsys_mask && !opts.none) {
+ 		ret = -EINVAL;
+ 		goto out_unlock;
  	}
- 	write_unlock(&css_set_lock);
  
- 	if (!list_empty(&root->root_list)) {
- 		list_del(&root->root_list);
- 		cgroup_root_count--;
+ 	root = kzalloc(sizeof(*root), GFP_KERNEL);
+ 	if (!root) {
+ 		ret = -ENOMEM;
+ 		goto out_unlock;
  	}
  
- 	cgroup_exit_root_id(root);
+ 	init_cgroup_root(root, &opts);
+ 
+ 	ret = cgroup_setup_root(root, opts.subsys_mask);
+ 	if (ret)
+ 		cgroup_free_root(root);
  
- 	mutex_unlock(&cgroup_root_mutex);
+ out_unlock:
  	mutex_unlock(&cgroup_mutex);
- 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ 	mutex_unlock(&cgroup_tree_mutex);
  
- 	simple_xattrs_free(&cgrp->xattrs);
+ 	kfree(opts.release_agent);
+ 	kfree(opts.name);
  
- 	kill_litter_super(sb);
- 	cgroup_free_root(root);
+ 	if (ret)
+ 		return ERR_PTR(ret);
+ 
 -	dentry = kernfs_mount(fs_type, flags, root->kf_root);
++	dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL);
+ 	if (IS_ERR(dentry))
+ 		cgroup_put(&root->cgrp);
+ 	return dentry;
+ }
+ 
+ static void cgroup_kill_sb(struct super_block *sb)
+ {
+ 	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
+ 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ 
+ 	cgroup_put(&root->cgrp);
+ 	kernfs_kill_sb(sb);
  }
  
  static struct file_system_type cgroup_fs_type = {
@@@ -4116,17 -3629,19 +3629,19 @@@ static int create_css(struct cgroup *cg
  
  	init_css(css, ss, cgrp);
  
- 	err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
+ 	err = cgroup_populate_dir(cgrp, 1 << ss->id);
  	if (err)
 -		goto err_free;
 +		goto err_free_percpu_ref;
  
  	err = online_css(css);
  	if (err)
 -		goto err_free;
 +		goto err_clear_dir;
  
- 	dget(cgrp->dentry);
+ 	cgroup_get(cgrp);
  	css_get(css->parent);
  
+ 	cgrp->subsys_mask |= 1 << ss->id;
+ 
  	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
  	    parent->parent) {
  		pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@@ -4138,11 -3653,8 +3653,11 @@@
  
  	return 0;
  
 -err_free:
 +err_clear_dir:
- 	cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
++	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
 +err_free_percpu_ref:
  	percpu_ref_cancel_init(&css->refcnt);
 +err_free_css:
  	ss->css_free(css);
  	return err;
  }
diff --cc kernel/cpuset.c
index e6b1b66afe52,efbf9baf77ec..e2dbb60004d4
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@@ -2480,11 -2414,11 +2410,11 @@@ int __cpuset_node_allowed_softwall(int 
  	/* Not hardwall and node outside mems_allowed: scan up cpusets */
  	mutex_lock(&callback_mutex);
  
- 	task_lock(current);
+ 	rcu_read_lock();
  	cs = nearest_hardwall_ancestor(task_cs(current));
 +	allowed = node_isset(node, cs->mems_allowed);
- 	task_unlock(current);
+ 	rcu_read_unlock();
  
 -	allowed = node_isset(node, cs->mems_allowed);
  	mutex_unlock(&callback_mutex);
  	return allowed;
  }
diff --cc mm/memcontrol.c
index 5b6b0039f725,96f94a9f2faf..dcc8153a1681
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -1683,54 -1683,25 +1683,25 @@@ static void move_unlock_mem_cgroup(stru
   */
  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
  {
- 	/*
- 	 * protects memcg_name and makes sure that parallel ooms do not
- 	 * interleave
- 	 */
+ 	/* oom_info_lock ensures that parallel ooms do not interleave */
 -	static DEFINE_SPINLOCK(oom_info_lock);
 +	static DEFINE_MUTEX(oom_info_lock);
- 	struct cgroup *task_cgrp;
- 	struct cgroup *mem_cgrp;
- 	static char memcg_name[PATH_MAX];
- 	int ret;
  	struct mem_cgroup *iter;
  	unsigned int i;
  
  	if (!p)
  		return;
  
 -	spin_lock(&oom_info_lock);
 +	mutex_lock(&oom_info_lock);
  	rcu_read_lock();
  
- 	mem_cgrp = memcg->css.cgroup;
- 	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
- 
- 	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
- 	if (ret < 0) {
- 		/*
- 		 * Unfortunately, we are unable to convert to a useful name
- 		 * But we'll still print out the usage information
- 		 */
- 		rcu_read_unlock();
- 		goto done;
- 	}
- 	rcu_read_unlock();
- 
- 	pr_info("Task in %s killed", memcg_name);
+ 	pr_info("Task in ");
+ 	pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ 	pr_info(" killed as a result of limit of ");
+ 	pr_cont_cgroup_path(memcg->css.cgroup);
+ 	pr_info("\n");
  
- 	rcu_read_lock();
- 	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
- 	if (ret < 0) {
- 		rcu_read_unlock();
- 		goto done;
- 	}
  	rcu_read_unlock();
  
- 	/*
- 	 * Continues from above, so we don't need an KERN_ level
- 	 */
- 	pr_cont(" as a result of limit of %s\n", memcg_name);
- done:
- 
  	pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
  		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
  		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,