From: Linus Torvalds Date: Thu, 3 Apr 2014 20:05:42 +0000 (-0700) Subject: Merge branch 'for-3.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup X-Git-Tag: v3.15-rc1~109 X-Git-Url: http://git.samba.org/samba.git/?p=sfrench%2Fcifs-2.6.git;a=commitdiff_plain;h=32d01dc7be4e725ab85ce1d74e8f4adc02ad68dd Merge branch 'for-3.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup Pull cgroup updates from Tejun Heo: "A lot of updates for cgroup: - The biggest one is cgroup's conversion to kernfs. cgroup took after the long abandoned vfs-entangled sysfs implementation and made it even more convoluted over time. cgroup's internal objects were fused with vfs objects which also brought in vfs locking and object lifetime rules. Naturally, there are places where vfs rules don't fit and nasty hacks, such as credential switching or lock dance interleaving inode mutex and cgroup_mutex with object serial number comparison thrown in to decide whether the operation is actually necessary, needed to be employed. After conversion to kernfs, internal object lifetime and locking rules are mostly isolated from vfs interactions allowing shedding of several nasty hacks and overall simplification. This will also allow implementation of operations which may affect multiple cgroups which weren't possible before as it would have required nesting i_mutexes. - Various simplifications including dropping of module support, easier cgroup name/path handling, simplified cgroup file type handling and task_cg_lists optimization. - Preparatory changes for the planned unified hierarchy, which is still a patchset away from being actually operational. The dummy hierarchy is updated to serve as the default unified hierarchy. Controllers which aren't claimed by other hierarchies are associated with it, which BTW was what the dummy hierarchy was for anyway. - Various fixes from Li and others. This pull request includes some patches to add missing slab.h to various subsystems. 
This was triggered by xattr.h include removal from cgroup.h. cgroup.h indirectly got included in a lot of files which brought in xattr.h which brought in slab.h. There are several merge commits - one to pull in kernfs updates necessary for converting cgroup (already in upstream through driver-core), others for interfering changes in the fixes branch" * 'for-3.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (74 commits) cgroup: remove useless argument from cgroup_exit() cgroup: fix spurious lockdep warning in cgroup_exit() cgroup: Use RCU_INIT_POINTER(x, NULL) in cgroup.c cgroup: break kernfs active_ref protection in cgroup directory operations cgroup: fix cgroup_taskset walking order cgroup: implement CFTYPE_ONLY_ON_DFL cgroup: make cgrp_dfl_root mountable cgroup: drop const from @buffer of cftype->write_string() cgroup: rename cgroup_dummy_root and related names cgroup: move ->subsys_mask from cgroupfs_root to cgroup cgroup: treat cgroup_dummy_root as an equivalent hierarchy during rebinding cgroup: remove NULL checks from [pr_cont_]cgroup_{name|path}() cgroup: use cgroup_setup_root() to initialize cgroup_dummy_root cgroup: reorganize cgroup bootstrapping cgroup: relocate setting of CGRP_DEAD cpuset: use rcu_read_lock() to protect task_cs() cgroup_freezer: document freezer_fork() subtleties cgroup: update cgroup_transfer_tasks() to either succeed or fail cgroup: drop task_lock() protection around task->cgroups cgroup: update how a newly forked task gets associated with css_set ... 
--- 32d01dc7be4e725ab85ce1d74e8f4adc02ad68dd diff --cc kernel/cgroup.c index 0c753ddd223b,f7f94322d312..fede3d3f28ff --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@@ -1658,80 -1550,72 +1550,72 @@@ retry pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } - } - - kfree(opts.release_agent); - kfree(opts.name); - return dget(sb->s_root); - - rm_base_files: - free_cgrp_cset_links(&tmp_links); - cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); - revert_creds(cred); - unlock_drop: - cgroup_exit_root_id(root); - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - drop_new_super: - deactivate_locked_super(sb); - out_err: - kfree(opts.release_agent); - kfree(opts.name); - return ERR_PTR(ret); - } - static void cgroup_kill_sb(struct super_block *sb) - { - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - struct cgrp_cset_link *link, *tmp_link; - int ret; + /* + * A root's lifetime is governed by its root cgroup. Zero + * ref indicate that the root is being destroyed. Wait for + * destruction to complete so that the subsystems are free. + * We can use wait_queue for the wait but this path is + * super cold. Let's just sleep for a bit and retry. + */ + if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + kfree(opts.release_agent); + kfree(opts.name); + msleep(10); + goto retry; + } - BUG_ON(!root); - - BUG_ON(root->number_of_cgroups != 1); - BUG_ON(!list_empty(&cgrp->children)); - - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - /* Rebind all subsystems back to the default hierarchy */ - if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... 
*/ - BUG_ON(ret); - } + ret = 0; + goto out_unlock; + } /* - * Release all the links from cset_links to this hierarchy's - * root cgroup + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. */ - write_lock(&css_set_lock); - - list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { - list_del(&link->cset_link); - list_del(&link->cgrp_link); - kfree(link); + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; } - write_unlock(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; } - cgroup_exit_root_id(root); + init_cgroup_root(root, &opts); + + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); - mutex_unlock(&cgroup_root_mutex); + out_unlock: mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + mutex_unlock(&cgroup_tree_mutex); - simple_xattrs_free(&cgrp->xattrs); + kfree(opts.release_agent); + kfree(opts.name); - kill_litter_super(sb); - cgroup_free_root(root); + if (ret) + return ERR_PTR(ret); + - dentry = kernfs_mount(fs_type, flags, root->kf_root); ++ dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL); + if (IS_ERR(dentry)) + cgroup_put(&root->cgrp); + return dentry; + } + + static void cgroup_kill_sb(struct super_block *sb) + { + struct kernfs_root *kf_root = kernfs_root_from_sb(sb); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + + cgroup_put(&root->cgrp); + kernfs_kill_sb(sb); } static struct file_system_type cgroup_fs_type = { @@@ -4116,17 -3629,19 +3629,19 @@@ static int create_css(struct cgroup *cg init_css(css, ss, cgrp); - err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) - goto err_free; + 
goto err_free_percpu_ref; err = online_css(css); if (err) - goto err_free; + goto err_clear_dir; - dget(cgrp->dentry); + cgroup_get(cgrp); css_get(css->parent); + cgrp->subsys_mask |= 1 << ss->id; + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", @@@ -4138,11 -3653,8 +3653,11 @@@ return 0; -err_free: +err_clear_dir: - cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); ++ cgroup_clear_dir(css->cgroup, 1 << css->ss->id); +err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); +err_free_css: ss->css_free(css); return err; } diff --cc kernel/cpuset.c index e6b1b66afe52,efbf9baf77ec..e2dbb60004d4 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@@ -2480,11 -2414,11 +2410,11 @@@ int __cpuset_node_allowed_softwall(int /* Not hardwall and node outside mems_allowed: scan up cpusets */ mutex_lock(&callback_mutex); - task_lock(current); + rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); + allowed = node_isset(node, cs->mems_allowed); - task_unlock(current); + rcu_read_unlock(); - allowed = node_isset(node, cs->mems_allowed); mutex_unlock(&callback_mutex); return allowed; } diff --cc mm/memcontrol.c index 5b6b0039f725,96f94a9f2faf..dcc8153a1681 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -1683,54 -1683,25 +1683,25 @@@ static void move_unlock_mem_cgroup(stru */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - /* - * protects memcg_name and makes sure that parallel ooms do not - * interleave - */ + /* oom_info_lock ensures that parallel ooms do not interleave */ - static DEFINE_SPINLOCK(oom_info_lock); + static DEFINE_MUTEX(oom_info_lock); - struct cgroup *task_cgrp; - struct cgroup *mem_cgrp; - static char memcg_name[PATH_MAX]; - int ret; struct mem_cgroup *iter; unsigned int i; if (!p) return; - spin_lock(&oom_info_lock); + 
mutex_lock(&oom_info_lock); rcu_read_lock(); - mem_cgrp = memcg->css.cgroup; - task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); - - ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - /* - * Unfortunately, we are unable to convert to a useful name - * But we'll still print out the usage information - */ - rcu_read_unlock(); - goto done; - } - rcu_read_unlock(); - - pr_info("Task in %s killed", memcg_name); + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_info(" killed as a result of limit of "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_info("\n"); - rcu_read_lock(); - ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - rcu_read_unlock(); - goto done; - } rcu_read_unlock(); - /* - * Continues from above, so we don't need an KERN_ level - */ - pr_cont(" as a result of limit of %s\n", memcg_name); - done: - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,