psi: Optimize switching tasks inside shared cgroups
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 50128297a4f9ff2b319e89849f5a4c6d48eb2a86..955a124bae817156fa06af439f776c1ef3af3024 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -669,13 +669,14 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
                groupc->times[PSI_NONIDLE] += delta;
 }
 
-static u32 psi_group_change(struct psi_group *group, int cpu,
-                           unsigned int clear, unsigned int set)
+static void psi_group_change(struct psi_group *group, int cpu,
+                            unsigned int clear, unsigned int set,
+                            bool wake_clock)
 {
        struct psi_group_cpu *groupc;
+       u32 state_mask = 0;
        unsigned int t, m;
        enum psi_states s;
-       u32 state_mask = 0;
 
        groupc = per_cpu_ptr(group->pcpu, cpu);
 
@@ -717,7 +718,11 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
 
        write_seqcount_end(&groupc->seq);
 
-       return state_mask;
+       if (state_mask & group->poll_states)
+               psi_schedule_poll_work(group, 1);
+
+       if (wake_clock && !delayed_work_pending(&group->avgs_work))
+               schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 }
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -744,27 +749,32 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
        return &psi_system;
 }
 
-void psi_task_change(struct task_struct *task, int clear, int set)
+static void psi_flags_change(struct task_struct *task, int clear, int set)
 {
-       int cpu = task_cpu(task);
-       struct psi_group *group;
-       bool wake_clock = true;
-       void *iter = NULL;
-
-       if (!task->pid)
-               return;
-
        if (((task->psi_flags & set) ||
             (task->psi_flags & clear) != clear) &&
            !psi_bug) {
                printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
-                               task->pid, task->comm, cpu,
+                               task->pid, task->comm, task_cpu(task),
                                task->psi_flags, clear, set);
                psi_bug = 1;
        }
 
        task->psi_flags &= ~clear;
        task->psi_flags |= set;
+}
+
+void psi_task_change(struct task_struct *task, int clear, int set)
+{
+       int cpu = task_cpu(task);
+       struct psi_group *group;
+       bool wake_clock = true;
+       void *iter = NULL;
+
+       if (!task->pid)
+               return;
+
+       psi_flags_change(task, clear, set);
 
        /*
         * Periodic aggregation shuts off if there is a period of no
@@ -777,14 +787,51 @@ void psi_task_change(struct task_struct *task, int clear, int set)
                     wq_worker_last_func(task) == psi_avgs_work))
                wake_clock = false;
 
-       while ((group = iterate_groups(task, &iter))) {
-               u32 state_mask = psi_group_change(group, cpu, clear, set);
+       while ((group = iterate_groups(task, &iter)))
+               psi_group_change(group, cpu, clear, set, wake_clock);
+}
+
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+                    bool sleep)
+{
+       struct psi_group *group, *common = NULL;
+       int cpu = task_cpu(prev);
+       void *iter;
+
+       if (next->pid) {
+               psi_flags_change(next, 0, TSK_ONCPU);
+               /*
+                * When moving state between tasks, the group that
+                * contains them both does not change: we can stop
+                * updating the tree once we reach the first common
+                * ancestor. Iterate @next's ancestors until we
+                * encounter @prev's state.
+                */
+               iter = NULL;
+               while ((group = iterate_groups(next, &iter))) {
+                       if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+                               common = group;
+                               break;
+                       }
+
+                       psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+               }
+       }
+
+       /*
+        * If this is a voluntary sleep, dequeue will have taken care
+        * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
+        * only need to deal with it during preemption.
+        */
+       if (sleep)
+               return;
 
-               if (state_mask & group->poll_states)
-                       psi_schedule_poll_work(group, 1);
+       if (prev->pid) {
+               psi_flags_change(prev, TSK_ONCPU, 0);
 
-               if (wake_clock && !delayed_work_pending(&group->avgs_work))
-                       schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+               iter = NULL;
+               while ((group = iterate_groups(prev, &iter)) && group != common)
+                       psi_group_change(group, cpu, TSK_ONCPU, 0, true);
        }
 }
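
A minimal user-space sketch of the common-ancestor cutoff that psi_task_switch() implements above, modeling the preemption case (sleep == false). The struct node type, its per-node oncpu counter, and task_switch() are hypothetical stand-ins for the kernel's psi_group/psi_group_cpu machinery and the iterate_groups() walk, not the real API:

/*
 * Hypothetical stand-alone model of the common-ancestor cutoff.
 * Not kernel code: "oncpu" plays the role of the per-cpu
 * groupc->tasks[NR_ONCPU] counter.
 */
#include <stdio.h>
#include <stddef.h>

struct node {
	struct node *parent;
	const char *name;
	int oncpu;		/* like groupc->tasks[NR_ONCPU] */
};

/*
 * Set ONCPU on @next_leaf's ancestors, stopping at the first node
 * that already carries @prev_leaf's ONCPU count: every ancestor from
 * there up is shared, so its aggregate state does not change.
 */
static void task_switch(struct node *prev_leaf, struct node *next_leaf)
{
	struct node *common = NULL;
	struct node *g;

	for (g = next_leaf; g; g = g->parent) {
		if (g->oncpu) {		/* first common ancestor: stop */
			common = g;
			break;
		}
		g->oncpu++;
		printf("set   ONCPU on %s\n", g->name);
	}

	/* Clear ONCPU on @prev_leaf's ancestors below the common one. */
	for (g = prev_leaf; g && g != common; g = g->parent) {
		g->oncpu--;
		printf("clear ONCPU on %s\n", g->name);
	}
}

int main(void)
{
	struct node root   = { NULL,    "root",     1 };
	struct node shared = { &root,   "shared",   1 };
	struct node a      = { &shared, "cgroup-A", 1 };  /* prev runs here */
	struct node b      = { &shared, "cgroup-B", 0 };  /* next runs here */

	task_switch(&a, &b);
	return 0;
}

In this example, prev runs in cgroup-A and next in cgroup-B, both under a shared parent: only the two leaf groups are touched, while "shared" and "root" already account one on-CPU task either way and are skipped. Before the patch, the switch would have walked and updated every ancestor of both tasks; stopping at the first common ancestor is what makes switches inside shared cgroups cheaper.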