Merge tag 'sched-urgent-2021-05-09' of git://git.kernel.org/pub/scm/linux/kernel...
[sfrench/cifs-2.6.git] / kernel / sched / psi.c
index db27b69fa92a0756a0fe06d9b98f1f09573f66f7..cc25a3cff41fb17f98d67430be27359d297aba0f 100644 (file)
@@ -972,7 +972,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
  */
 void cgroup_move_task(struct task_struct *task, struct css_set *to)
 {
-       unsigned int task_flags = 0;
+       unsigned int task_flags;
        struct rq_flags rf;
        struct rq *rq;
 
@@ -987,15 +987,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
        rq = task_rq_lock(task, &rf);
 
-       if (task_on_rq_queued(task)) {
-               task_flags = TSK_RUNNING;
-               if (task_current(rq, task))
-                       task_flags |= TSK_ONCPU;
-       } else if (task->in_iowait)
-               task_flags = TSK_IOWAIT;
-
-       if (task->in_memstall)
-               task_flags |= TSK_MEMSTALL;
+       /*
+        * We may race with schedule() dropping the rq lock between
+        * deactivating prev and switching to next. Because the psi
+        * updates from the deactivation are deferred to the switch
+        * callback to save cgroup tree updates, the task's scheduling
+        * state here is not coherent with its psi state:
+        *
+        * schedule()                   cgroup_move_task()
+        *   rq_lock()
+        *   deactivate_task()
+        *     p->on_rq = 0
+        *     psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+        *   pick_next_task()
+        *     rq_unlock()
+        *                                rq_lock()
+        *                                psi_task_change() // old cgroup
+        *                                task->cgroups = to
+        *                                psi_task_change() // new cgroup
+        *                                rq_unlock()
+        *     rq_lock()
+        *   psi_sched_switch() // does deferred updates in new cgroup
+        *
+        * Don't rely on the scheduling state. Use psi_flags instead.
+        */
+       task_flags = task->psi_flags;
 
        if (task_flags)
                psi_task_change(task, task_flags, 0);