Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
[sfrench/cifs-2.6.git] / kernel / cgroup / cgroup.c
index 9cc8c3a686b12a2b843f6e5e5f49126d062c9601..ea5b0f01d2b3e8b75f910edeb1ea078156c926b3 100644 (file)
@@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = {
 static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_base_files[];
 
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+       OPT_FEATURE_PRESSURE,
+#endif
+       OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+       "pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
+
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_skip(struct css_task_iter *it,
@@ -2390,7 +2406,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 }
 
 /**
- * cgroup_taskset_migrate - migrate a taskset
+ * cgroup_migrate_execute - migrate a taskset
  * @mgctx: migration context
  *
  * Migrate tasks in @mgctx as setup by migration preparation functions.
@@ -3632,6 +3648,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
        psi_trigger_replace(&of->priv, NULL);
 }
+
+bool cgroup_psi_enabled(void)
+{
+       return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+       return false;
+}
+
 #endif /* CONFIG_PSI */
 
 static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3668,6 +3696,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
        return nbytes;
 }
 
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+       struct css_task_iter it;
+       struct task_struct *task;
+
+       lockdep_assert_held(&cgroup_mutex);
+
+       spin_lock_irq(&css_set_lock);
+       set_bit(CGRP_KILL, &cgrp->flags);
+       spin_unlock_irq(&css_set_lock);
+
+       css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+       while ((task = css_task_iter_next(&it))) {
+               /* Ignore kernel threads here. */
+               if (task->flags & PF_KTHREAD)
+                       continue;
+
+               /* Skip tasks that are already dying. */
+               if (__fatal_signal_pending(task))
+                       continue;
+
+               send_sig(SIGKILL, task, 0);
+       }
+       css_task_iter_end(&it);
+
+       spin_lock_irq(&css_set_lock);
+       clear_bit(CGRP_KILL, &cgrp->flags);
+       spin_unlock_irq(&css_set_lock);
+}
+
+static void cgroup_kill(struct cgroup *cgrp)
+{
+       struct cgroup_subsys_state *css;
+       struct cgroup *dsct;
+
+       lockdep_assert_held(&cgroup_mutex);
+
+       cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+               __cgroup_kill(dsct);
+}
+
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+                                size_t nbytes, loff_t off)
+{
+       ssize_t ret = 0;
+       int kill;
+       struct cgroup *cgrp;
+
+       ret = kstrtoint(strstrip(buf), 0, &kill);
+       if (ret)
+               return ret;
+
+       if (kill != 1)
+               return -ERANGE;
+
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+
+       /*
+        * Killing is a process directed operation, i.e. the whole thread-group
+        * is taken down so act like we do for cgroup.procs and only make this
+        * writable in non-threaded cgroups.
+        */
+       if (cgroup_is_threaded(cgrp))
+               ret = -EOPNOTSUPP;
+       else
+               cgroup_kill(cgrp);
+
+       cgroup_kn_unlock(of->kn);
+
+       return ret ?: nbytes;
+}
+
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
        struct cftype *cft = of_cft(of);
@@ -3882,6 +3984,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 restart:
        for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
                /* does cft->flags tell us to skip this file on @cgrp? */
+               if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+                       continue;
                if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
                        continue;
                if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -3959,6 +4063,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 
                WARN_ON(cft->ss || cft->kf_ops);
 
+               if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+                       continue;
+
                if (cft->seq_start)
                        kf_ops = &cgroup_kf_ops;
                else
@@ -4860,6 +4967,11 @@ static struct cftype cgroup_base_files[] = {
                .seq_show = cgroup_freeze_show,
                .write = cgroup_freeze_write,
        },
+       {
+               .name = "cgroup.kill",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .write = cgroup_kill_write,
+       },
        {
                .name = "cpu.stat",
                .seq_show = cpu_stat_show,
@@ -4867,6 +4979,7 @@ static struct cftype cgroup_base_files[] = {
 #ifdef CONFIG_PSI
        {
                .name = "io.pressure",
+               .flags = CFTYPE_PRESSURE,
                .seq_show = cgroup_io_pressure_show,
                .write = cgroup_io_pressure_write,
                .poll = cgroup_pressure_poll,
@@ -4874,6 +4987,7 @@ static struct cftype cgroup_base_files[] = {
        },
        {
                .name = "memory.pressure",
+               .flags = CFTYPE_PRESSURE,
                .seq_show = cgroup_memory_pressure_show,
                .write = cgroup_memory_pressure_write,
                .poll = cgroup_pressure_poll,
@@ -4881,6 +4995,7 @@ static struct cftype cgroup_base_files[] = {
        },
        {
                .name = "cpu.pressure",
+               .flags = CFTYPE_PRESSURE,
                .seq_show = cgroup_cpu_pressure_show,
                .write = cgroup_cpu_pressure_write,
                .poll = cgroup_pressure_poll,
@@ -6080,6 +6195,8 @@ void cgroup_post_fork(struct task_struct *child,
                      struct kernel_clone_args *kargs)
        __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
 {
+       unsigned long cgrp_flags = 0;
+       bool kill = false;
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;
@@ -6091,6 +6208,11 @@ void cgroup_post_fork(struct task_struct *child,
 
        /* init tasks are special, only link regular threads */
        if (likely(child->pid)) {
+               if (kargs->cgrp)
+                       cgrp_flags = kargs->cgrp->flags;
+               else
+                       cgrp_flags = cset->dfl_cgrp->flags;
+
                WARN_ON_ONCE(!list_empty(&child->cg_list));
                cset->nr_tasks++;
                css_set_move_task(child, NULL, cset, false);
@@ -6099,23 +6221,32 @@ void cgroup_post_fork(struct task_struct *child,
                cset = NULL;
        }
 
-       /*
-        * If the cgroup has to be frozen, the new task has too.  Let's set
-        * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
-        * frozen state.
-        */
-       if (unlikely(cgroup_task_freeze(child))) {
-               spin_lock(&child->sighand->siglock);
-               WARN_ON_ONCE(child->frozen);
-               child->jobctl |= JOBCTL_TRAP_FREEZE;
-               spin_unlock(&child->sighand->siglock);
+       if (!(child->flags & PF_KTHREAD)) {
+               if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
+                       /*
+                        * If the cgroup has to be frozen, the new task has
+                        * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
+                        * get the task into the frozen state.
+                        */
+                       spin_lock(&child->sighand->siglock);
+                       WARN_ON_ONCE(child->frozen);
+                       child->jobctl |= JOBCTL_TRAP_FREEZE;
+                       spin_unlock(&child->sighand->siglock);
+
+                       /*
+                        * Calling cgroup_update_frozen() isn't required here,
+                        * because it will be called anyway a bit later from
+                        * do_freezer_trap(). So we avoid cgroup's transient
+                        * switch from the frozen state and back.
+                        */
+               }
 
                /*
-                * Calling cgroup_update_frozen() isn't required here,
-                * because it will be called anyway a bit later from
-                * do_freezer_trap(). So we avoid cgroup's transient switch
-                * from the frozen state and back.
+                * If the cgroup is to be killed notice it now and take the
+                * child down right after we finished preparing it for
+                * userspace.
                 */
+               kill = test_bit(CGRP_KILL, &cgrp_flags);
        }
 
        spin_unlock_irq(&css_set_lock);
@@ -6138,6 +6269,10 @@ void cgroup_post_fork(struct task_struct *child,
                put_css_set(rcset);
        }
 
+       /* Cgroup has to be killed so take down child immediately. */
+       if (unlikely(kill))
+               do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
+
        cgroup_css_set_put_fork(kargs);
 }
 
@@ -6163,7 +6298,8 @@ void cgroup_exit(struct task_struct *tsk)
        cset->nr_tasks--;
 
        WARN_ON_ONCE(cgroup_task_frozen(tsk));
-       if (unlikely(cgroup_task_freeze(tsk)))
+       if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+                    test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
                cgroup_update_frozen(task_dfl_cgroup(tsk));
 
        spin_unlock_irq(&css_set_lock);
@@ -6214,6 +6350,15 @@ static int __init cgroup_disable(char *str)
                        pr_info("Disabling %s control group subsystem\n",
                                ss->name);
                }
+
+               for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+                       if (strcmp(token, cgroup_opt_feature_names[i]))
+                               continue;
+                       cgroup_feature_disable_mask |= 1 << i;
+                       pr_info("Disabling %s control group feature\n",
+                               cgroup_opt_feature_names[i]);
+                       break;
+               }
        }
        return 1;
 }
@@ -6512,6 +6657,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
                if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
                        continue;
 
+               if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+                       continue;
+
                if (prefix)
                        ret += snprintf(buf + ret, size - ret, "%s.", prefix);