psi: get poll_work to run when calling poll() syscall next time
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 0e97ca9306efc164ada86b6c6bd73506bfdb706e..6e52b67b420e7a3312f463f8d3bb6baad6576041 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -4,6 +4,9 @@
  * Copyright (c) 2018 Facebook, Inc.
  * Author: Johannes Weiner <hannes@cmpxchg.org>
  *
+ * Polling support by Suren Baghdasaryan <surenb@google.com>
+ * Copyright (c) 2018 Google, Inc.
+ *
  * When CPU, memory and IO are contended, tasks experience delays that
  * reduce throughput and introduce latencies into the workload. Memory
  * and IO contention, in addition, can cause a full loss of forward
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/seqlock.h>
+#include <linux/uaccess.h>
 #include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/poll.h>
 #include <linux/psi.h>
 #include "sched.h"
 
@@ -140,9 +147,9 @@ static int psi_bug __read_mostly;
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
 
 #ifdef CONFIG_PSI_DEFAULT_DISABLED
-bool psi_enable;
+static bool psi_enable;
 #else
-bool psi_enable = true;
+static bool psi_enable = true;
 #endif
 static int __init setup_psi(char *str)
 {
@@ -156,16 +163,21 @@ __setup("psi=", setup_psi);
 #define EXP_60s                1981            /* 1/exp(2s/60s) */
 #define EXP_300s       2034            /* 1/exp(2s/300s) */
 
+/* PSI trigger definitions */
+#define WINDOW_MIN_US 500000   /* Min window size is 500ms */
+#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
+#define UPDATES_PER_WINDOW 10  /* 10 updates per window */
+
 /* Sampling frequency in nanoseconds */
 static u64 psi_period __read_mostly;
 
 /* System-level pressure and stall tracking */
 static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
-static struct psi_group psi_system = {
+struct psi_group psi_system = {
        .pcpu = &system_group_pcpu,
 };
 
-static void psi_update_work(struct work_struct *work);
+static void psi_avgs_work(struct work_struct *work);
 
 static void group_init(struct psi_group *group)
 {
@@ -173,9 +185,20 @@ static void group_init(struct psi_group *group)
 
        for_each_possible_cpu(cpu)
                seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
-       group->next_update = sched_clock() + psi_period;
-       INIT_DELAYED_WORK(&group->clock_work, psi_update_work);
-       mutex_init(&group->stat_lock);
+       group->avg_next_update = sched_clock() + psi_period;
+       INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
+       mutex_init(&group->avgs_lock);
+       /* Init trigger-related members */
+       atomic_set(&group->poll_scheduled, 0);
+       mutex_init(&group->trigger_lock);
+       INIT_LIST_HEAD(&group->triggers);
+       memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
+       group->poll_states = 0;
+       group->poll_min_period = U32_MAX;
+       memset(group->polling_total, 0, sizeof(group->polling_total));
+       group->polling_next_update = ULLONG_MAX;
+       group->polling_until = 0;
+       rcu_assign_pointer(group->poll_kworker, NULL);
 }
 
 void __init psi_init(void)
@@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
        }
 }
 
-static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
+static void get_recent_times(struct psi_group *group, int cpu,
+                            enum psi_aggregators aggregator, u32 *times,
+                            u32 *pchanged_states)
 {
        struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
-       unsigned int tasks[NR_PSI_TASK_COUNTS];
        u64 now, state_start;
+       enum psi_states s;
        unsigned int seq;
-       int s;
+       u32 state_mask;
+
+       *pchanged_states = 0;
 
        /* Snapshot a coherent view of the CPU state */
        do {
                seq = read_seqcount_begin(&groupc->seq);
                now = cpu_clock(cpu);
                memcpy(times, groupc->times, sizeof(groupc->times));
-               memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
+               state_mask = groupc->state_mask;
                state_start = groupc->state_start;
        } while (read_seqcount_retry(&groupc->seq, seq));
 
@@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
                 * (u32) and our reported pressure close to what's
                 * actually happening.
                 */
-               if (test_state(tasks, s))
+               if (state_mask & (1 << s))
                        times[s] += now - state_start;
 
-               delta = times[s] - groupc->times_prev[s];
-               groupc->times_prev[s] = times[s];
+               delta = times[s] - groupc->times_prev[aggregator][s];
+               groupc->times_prev[aggregator][s] = times[s];
 
                times[s] = delta;
+               if (delta)
+                       *pchanged_states |= (1 << s);
        }
 }
 
@@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
        avg[2] = calc_load(avg[2], EXP_300s, pct);
 }
 
-static bool update_stats(struct psi_group *group)
+static void collect_percpu_times(struct psi_group *group,
+                                enum psi_aggregators aggregator,
+                                u32 *pchanged_states)
 {
        u64 deltas[NR_PSI_STATES - 1] = { 0, };
-       unsigned long missed_periods = 0;
        unsigned long nonidle_total = 0;
-       u64 now, expires, period;
+       u32 changed_states = 0;
        int cpu;
        int s;
 
-       mutex_lock(&group->stat_lock);
-
        /*
         * Collect the per-cpu time buckets and average them into a
         * single time sample that is normalized to wallclock time.
@@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group)
        for_each_possible_cpu(cpu) {
                u32 times[NR_PSI_STATES];
                u32 nonidle;
+               u32 cpu_changed_states;
 
-               get_recent_times(group, cpu, times);
+               get_recent_times(group, cpu, aggregator, times,
+                               &cpu_changed_states);
+               changed_states |= cpu_changed_states;
 
                nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
                nonidle_total += nonidle;
@@ -315,13 +346,22 @@ static bool update_stats(struct psi_group *group)
 
        /* total= */
        for (s = 0; s < NR_PSI_STATES - 1; s++)
-               group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
+               group->total[aggregator][s] +=
+                               div_u64(deltas[s], max(nonidle_total, 1UL));
+
+       if (pchanged_states)
+               *pchanged_states = changed_states;
+}
+
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+       unsigned long missed_periods = 0;
+       u64 expires, period;
+       u64 avg_next_update;
+       int s;
 
        /* avgX= */
-       now = sched_clock();
-       expires = group->next_update;
-       if (now < expires)
-               goto out;
+       expires = group->avg_next_update;
        if (now - expires >= psi_period)
                missed_periods = div_u64(now - expires, psi_period);
 
@@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group)
         * But the deltas we sample out of the per-cpu buckets above
         * are based on the actual time elapsing between clock ticks.
         */
-       group->next_update = expires + ((1 + missed_periods) * psi_period);
-       period = now - (group->last_update + (missed_periods * psi_period));
-       group->last_update = now;
+       avg_next_update = expires + ((1 + missed_periods) * psi_period);
+       period = now - (group->avg_last_update + (missed_periods * psi_period));
+       group->avg_last_update = now;
 
        for (s = 0; s < NR_PSI_STATES - 1; s++) {
                u32 sample;
 
-               sample = group->total[s] - group->total_prev[s];
+               sample = group->total[PSI_AVGS][s] - group->avg_total[s];
                /*
                 * Due to the lockless sampling of the time buckets,
                 * recorded time deltas can slip into the next period,
@@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group)
                 */
                if (sample > period)
                        sample = period;
-               group->total_prev[s] += sample;
+               group->avg_total[s] += sample;
                calc_avgs(group->avg[s], missed_periods, sample, period);
        }
-out:
-       mutex_unlock(&group->stat_lock);
-       return nonidle_total;
+
+       return avg_next_update;
 }
 
-static void psi_update_work(struct work_struct *work)
+static void psi_avgs_work(struct work_struct *work)
 {
        struct delayed_work *dwork;
        struct psi_group *group;
+       u32 changed_states;
        bool nonidle;
+       u64 now;
 
        dwork = to_delayed_work(work);
-       group = container_of(dwork, struct psi_group, clock_work);
+       group = container_of(dwork, struct psi_group, avgs_work);
+
+       mutex_lock(&group->avgs_lock);
 
+       now = sched_clock();
+
+       collect_percpu_times(group, PSI_AVGS, &changed_states);
+       nonidle = changed_states & (1 << PSI_NONIDLE);
        /*
         * If there is task activity, periodically fold the per-cpu
         * times and feed samples into the running averages. If things
@@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work)
         * Once restarted, we'll catch up the running averages in one
         * go - see calc_avgs() and missed_periods.
         */
-
-       nonidle = update_stats(group);
+       if (now >= group->avg_next_update)
+               group->avg_next_update = update_averages(group, now);
 
        if (nonidle) {
-               unsigned long delay = 0;
-               u64 now;
+               schedule_delayed_work(dwork, nsecs_to_jiffies(
+                               group->avg_next_update - now) + 1);
+       }
+
+       mutex_unlock(&group->avgs_lock);
+}
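
To make the catch-up path above concrete, here is a hypothetical userspace walk-through (not kernel code) assuming a 2s psi_period: if the last fold happened at t=10s and the group then went idle until activity resumed at t=19s, three periods were missed, the next update lands on the 20s boundary, and the sample period spans the remaining 3s.

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical walk-through of the missed_periods catch-up in
 * update_averages() (userspace sketch, all values in nanoseconds).
 */
int main(void)
{
        uint64_t psi_period = 2000000000ULL;            /* assumed ~2s sampling period */
        uint64_t avg_last_update = 10000000000ULL;      /* last fold at t=10s */
        uint64_t expires = avg_last_update + psi_period;       /* t=12s */
        uint64_t now = 19000000000ULL;                  /* activity resumes at t=19s */
        uint64_t missed_periods = 0;

        if (now - expires >= psi_period)
                missed_periods = (now - expires) / psi_period;  /* 3 */

        uint64_t avg_next_update = expires + (1 + missed_periods) * psi_period;  /* t=20s */
        uint64_t period = now - (avg_last_update + missed_periods * psi_period); /* 3s */

        printf("missed_periods=%llu next_update=%llus sample_period=%llus\n",
               (unsigned long long)missed_periods,
               (unsigned long long)(avg_next_update / 1000000000ULL),
               (unsigned long long)(period / 1000000000ULL));
        return 0;
}

calc_avgs() then decays the running averages for the three missed periods before folding the 3s sample in, which is how an idle group catches up in one go.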
+
+/* Trigger tracking window manipulations */
+static void window_reset(struct psi_window *win, u64 now, u64 value,
+                        u64 prev_growth)
+{
+       win->start_time = now;
+       win->start_value = value;
+       win->prev_growth = prev_growth;
+}
+
+/*
+ * PSI growth tracking window update and growth calculation routine.
+ *
+ * This approximates a sliding tracking window by interpolating
+ * partially elapsed windows using historical growth data from the
+ * previous intervals. This minimizes memory requirements (by not storing
+ * all the intermediate values in the previous window) and simplifies
+ * the calculations. It works well because the PSI signal changes only in
+ * the positive direction and, over relatively small window sizes, the
+ * growth is close to linear.
+ */
+static u64 window_update(struct psi_window *win, u64 now, u64 value)
+{
+       u64 elapsed;
+       u64 growth;
+
+       elapsed = now - win->start_time;
+       growth = value - win->start_value;
+       /*
+        * After each tracking window passes, win->start_value and
+        * win->start_time get reset and win->prev_growth stores
+        * the average per-window growth of the previous window.
+        * win->prev_growth is then used to interpolate additional
+        * growth from the previous window assuming it was linear.
+        */
+       if (elapsed > win->size)
+               window_reset(win, now, value, growth);
+       else {
+               u32 remaining;
+
+               remaining = win->size - elapsed;
+               growth += div_u64(win->prev_growth * remaining, win->size);
+       }
+
+       return growth;
+}
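
The interpolation above can be shown with a worked example (a standalone userspace sketch; the struct and function names are invented): with a 1s window that is 600ms into its current period, 80ms of growth observed so far, and 100ms of growth recorded for the previous window, the reported growth is 80ms + 100ms * 400/1000 = 120ms.

#include <stdint.h>
#include <stdio.h>

/* Userspace re-statement of the interpolation above; names are invented. */
struct demo_window {
        uint64_t size;          /* window size, ns */
        uint64_t start_time;    /* start of the current window, ns */
        uint64_t start_value;   /* total stall time at window start, ns */
        uint64_t prev_growth;   /* growth seen during the previous window, ns */
};

static uint64_t demo_window_update(struct demo_window *win, uint64_t now,
                                   uint64_t value)
{
        uint64_t elapsed = now - win->start_time;
        uint64_t growth = value - win->start_value;

        if (elapsed > win->size) {
                /* Window rolled over: restart tracking from here */
                win->start_time = now;
                win->start_value = value;
                win->prev_growth = growth;
        } else {
                /* Borrow the unexpired share of the previous window's growth */
                uint64_t remaining = win->size - elapsed;

                growth += win->prev_growth * remaining / win->size;
        }
        return growth;
}

int main(void)
{
        /* 1s window, 600ms elapsed: 80ms observed + 100ms * 0.4 = 120ms */
        struct demo_window win = {
                .size = 1000000000ULL,
                .start_time = 0,
                .start_value = 0,
                .prev_growth = 100000000ULL,
        };

        printf("growth=%llums\n", (unsigned long long)
               (demo_window_update(&win, 600000000ULL, 80000000ULL) / 1000000ULL));
        return 0;
}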
+
+static void init_triggers(struct psi_group *group, u64 now)
+{
+       struct psi_trigger *t;
+
+       list_for_each_entry(t, &group->triggers, node)
+               window_reset(&t->win, now,
+                               group->total[PSI_POLL][t->state], 0);
+       memcpy(group->polling_total, group->total[PSI_POLL],
+                  sizeof(group->polling_total));
+       group->polling_next_update = now + group->poll_min_period;
+}
+
+static u64 update_triggers(struct psi_group *group, u64 now)
+{
+       struct psi_trigger *t;
+       bool new_stall = false;
+       u64 *total = group->total[PSI_POLL];
+
+       /*
+        * On subsequent updates, calculate growth deltas and let
+        * watchers know when their specified thresholds are exceeded.
+        */
+       list_for_each_entry(t, &group->triggers, node) {
+               u64 growth;
+
+               /* Check for stall activity */
+               if (group->polling_total[t->state] == total[t->state])
+                       continue;
+
+               /*
+                * Multiple triggers might be looking at the same state,
+                * remember to update group->polling_total[] once we've
+                * been through all of them. Also remember to extend the
+                * polling time if we see new stall activity.
+                */
+               new_stall = true;
 
-               now = sched_clock();
-               if (group->next_update > now)
-                       delay = nsecs_to_jiffies(group->next_update - now) + 1;
-               schedule_delayed_work(dwork, delay);
+               /* Calculate growth since last update */
+               growth = window_update(&t->win, now, total[t->state]);
+               if (growth < t->threshold)
+                       continue;
+
+               /* Limit event signaling to once per window */
+               if (now < t->last_event_time + t->win.size)
+                       continue;
+
+               /* Generate an event */
+               if (cmpxchg(&t->event, 0, 1) == 0)
+                       wake_up_interruptible(&t->event_wait);
+               t->last_event_time = now;
        }
+
+       if (new_stall)
+               memcpy(group->polling_total, total,
+                               sizeof(group->polling_total));
+
+       return now + group->poll_min_period;
+}
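
As a concrete illustration of the firing rule in the loop above (a userspace sketch with invented names; the 150ms/1s values are only an example): a trigger created as "some 150000 1000000" has threshold = 150ms and win.size = 1s, so it raises at most one event per second, and only when the window growth reaches at least 150ms of new stall time.

#include <stdbool.h>
#include <stdint.h>

/*
 * Hypothetical mirror of the per-trigger check in update_triggers()
 * (userspace sketch; struct and field names are invented).
 */
struct demo_trigger {
        uint64_t threshold;             /* 150000 us -> 150ms, in ns */
        uint64_t win_size;              /* 1000000 us -> 1s, in ns */
        uint64_t last_event_time;       /* ns; 0 means never fired */
};

static bool demo_trigger_fires(struct demo_trigger *t, uint64_t now,
                               uint64_t growth)
{
        /* Not enough new stall time accumulated in the tracking window */
        if (growth < t->threshold)
                return false;

        /* Rate-limit event generation to one per tracking window */
        if (now < t->last_event_time + t->win_size)
                return false;

        t->last_event_time = now;
        return true;
}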
+
+/*
+ * Schedule polling if it's not already scheduled. It's safe to call this even
+ * from the hotpath: although kthread_queue_delayed_work() takes the
+ * worker->lock spinlock, that lock is never contended because the
+ * poll_scheduled atomic prevents such competition.
+ */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+{
+       struct kthread_worker *kworker;
+
+       /* Do not reschedule if already scheduled */
+       if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+               return;
+
+       rcu_read_lock();
+
+       kworker = rcu_dereference(group->poll_kworker);
+       /*
+        * kworker might be NULL if psi_trigger_destroy() races with
+        * psi_task_change() (the hotpath), which cannot take locks.
+        */
+       if (likely(kworker))
+               kthread_queue_delayed_work(kworker, &group->poll_work, delay);
+       else
+               atomic_set(&group->poll_scheduled, 0);
+
+       rcu_read_unlock();
+}
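
The poll_scheduled handshake reduces to a classic "schedule at most once" pattern: exactly one hot-path caller wins the 0 -> 1 transition and queues the work, and the worker re-arms the flag before processing so that new stall activity can queue the next round. A userspace sketch of just that pattern (invented names, kthread machinery elided):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace sketch of the poll_scheduled handshake; names are invented. */
static atomic_int poll_scheduled = 0;

/* Hot path: only the caller that flips 0 -> 1 actually queues the work. */
static bool try_schedule_poll(void)
{
        int expected = 0;

        return atomic_compare_exchange_strong(&poll_scheduled, &expected, 1);
}

/* Worker entry: re-arm first so stall activity seen while the worker
 * runs can schedule the next round. */
static void poll_worker(void)
{
        atomic_store(&poll_scheduled, 0);
        /* ... collect times, update triggers ... */
}

int main(void)
{
        printf("%d %d\n", try_schedule_poll(), try_schedule_poll());    /* 1 0 */
        poll_worker();
        printf("%d\n", try_schedule_poll());                            /* 1 */
        return 0;
}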
+
+static void psi_poll_work(struct kthread_work *work)
+{
+       struct kthread_delayed_work *dwork;
+       struct psi_group *group;
+       u32 changed_states;
+       u64 now;
+
+       dwork = container_of(work, struct kthread_delayed_work, work);
+       group = container_of(dwork, struct psi_group, poll_work);
+
+       atomic_set(&group->poll_scheduled, 0);
+
+       mutex_lock(&group->trigger_lock);
+
+       now = sched_clock();
+
+       collect_percpu_times(group, PSI_POLL, &changed_states);
+
+       if (changed_states & group->poll_states) {
+               /* Initialize trigger windows when entering polling mode */
+               if (now > group->polling_until)
+                       init_triggers(group, now);
+
+               /*
+                * Keep the monitor active for at least the duration of the
+                * minimum tracking window as long as monitor states are
+                * changing.
+                */
+               group->polling_until = now +
+                       group->poll_min_period * UPDATES_PER_WINDOW;
+       }
+
+       if (now > group->polling_until) {
+               group->polling_next_update = ULLONG_MAX;
+               goto out;
+       }
+
+       if (now >= group->polling_next_update)
+               group->polling_next_update = update_triggers(group, now);
+
+       psi_schedule_poll_work(group,
+               nsecs_to_jiffies(group->polling_next_update - now) + 1);
+
+out:
+       mutex_unlock(&group->trigger_lock);
 }
 
 static void record_times(struct psi_group_cpu *groupc, int cpu,
@@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
        delta = now - groupc->state_start;
        groupc->state_start = now;
 
-       if (test_state(groupc->tasks, PSI_IO_SOME)) {
+       if (groupc->state_mask & (1 << PSI_IO_SOME)) {
                groupc->times[PSI_IO_SOME] += delta;
-               if (test_state(groupc->tasks, PSI_IO_FULL))
+               if (groupc->state_mask & (1 << PSI_IO_FULL))
                        groupc->times[PSI_IO_FULL] += delta;
        }
 
-       if (test_state(groupc->tasks, PSI_MEM_SOME)) {
+       if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
                groupc->times[PSI_MEM_SOME] += delta;
-               if (test_state(groupc->tasks, PSI_MEM_FULL))
+               if (groupc->state_mask & (1 << PSI_MEM_FULL))
                        groupc->times[PSI_MEM_FULL] += delta;
                else if (memstall_tick) {
                        u32 sample;
@@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
                }
        }
 
-       if (test_state(groupc->tasks, PSI_CPU_SOME))
+       if (groupc->state_mask & (1 << PSI_CPU_SOME))
                groupc->times[PSI_CPU_SOME] += delta;
 
-       if (test_state(groupc->tasks, PSI_NONIDLE))
+       if (groupc->state_mask & (1 << PSI_NONIDLE))
                groupc->times[PSI_NONIDLE] += delta;
 }
 
-static void psi_group_change(struct psi_group *group, int cpu,
-                            unsigned int clear, unsigned int set)
+static u32 psi_group_change(struct psi_group *group, int cpu,
+                           unsigned int clear, unsigned int set)
 {
        struct psi_group_cpu *groupc;
        unsigned int t, m;
+       enum psi_states s;
+       u32 state_mask = 0;
 
        groupc = per_cpu_ptr(group->pcpu, cpu);
 
@@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
                if (set & (1 << t))
                        groupc->tasks[t]++;
 
+       /* Calculate state mask representing active states */
+       for (s = 0; s < NR_PSI_STATES; s++) {
+               if (test_state(groupc->tasks, s))
+                       state_mask |= (1 << s);
+       }
+       groupc->state_mask = state_mask;
+
        write_seqcount_end(&groupc->seq);
+
+       return state_mask;
 }
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set)
         */
        if (unlikely((clear & TSK_RUNNING) &&
                     (task->flags & PF_WQ_WORKER) &&
-                    wq_worker_last_func(task) == psi_update_work))
+                    wq_worker_last_func(task) == psi_avgs_work))
                wake_clock = false;
 
        while ((group = iterate_groups(task, &iter))) {
-               psi_group_change(group, cpu, clear, set);
-               if (wake_clock && !delayed_work_pending(&group->clock_work))
-                       schedule_delayed_work(&group->clock_work, PSI_FREQ);
+               u32 state_mask = psi_group_change(group, cpu, clear, set);
+
+               if (state_mask & group->poll_states)
+                       psi_schedule_poll_work(group, 1);
+
+               if (wake_clock && !delayed_work_pending(&group->avgs_work))
+                       schedule_delayed_work(&group->avgs_work, PSI_FREQ);
        }
 }
 
@@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup)
        if (static_branch_likely(&psi_disabled))
                return;
 
-       cancel_delayed_work_sync(&cgroup->psi.clock_work);
+       cancel_delayed_work_sync(&cgroup->psi.avgs_work);
        free_percpu(cgroup->psi.pcpu);
+       /* All triggers must be removed by now */
+       WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
 }
 
 /**
@@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 {
        int full;
+       u64 now;
 
        if (static_branch_likely(&psi_disabled))
                return -EOPNOTSUPP;
 
-       update_stats(group);
+       /* Update averages before reporting them */
+       mutex_lock(&group->avgs_lock);
+       now = sched_clock();
+       collect_percpu_times(group, PSI_AVGS, NULL);
+       if (now >= group->avg_next_update)
+               group->avg_next_update = update_averages(group, now);
+       mutex_unlock(&group->avgs_lock);
 
        for (full = 0; full < 2 - (res == PSI_CPU); full++) {
                unsigned long avg[3];
@@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 
                for (w = 0; w < 3; w++)
                        avg[w] = group->avg[res * 2 + full][w];
-               total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC);
+               total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+                               NSEC_PER_USEC);
 
                seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
                           full ? "full" : "some",
@@ -753,25 +1003,278 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
        return single_open(file, psi_cpu_show, NULL);
 }
 
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+                       char *buf, size_t nbytes, enum psi_res res)
+{
+       struct psi_trigger *t;
+       enum psi_states state;
+       u32 threshold_us;
+       u32 window_us;
+
+       if (static_branch_likely(&psi_disabled))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
+               state = PSI_IO_SOME + res * 2;
+       else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
+               state = PSI_IO_FULL + res * 2;
+       else
+               return ERR_PTR(-EINVAL);
+
+       if (state >= PSI_NONIDLE)
+               return ERR_PTR(-EINVAL);
+
+       if (window_us < WINDOW_MIN_US ||
+               window_us > WINDOW_MAX_US)
+               return ERR_PTR(-EINVAL);
+
+       /* Check threshold */
+       if (threshold_us == 0 || threshold_us > window_us)
+               return ERR_PTR(-EINVAL);
+
+       t = kmalloc(sizeof(*t), GFP_KERNEL);
+       if (!t)
+               return ERR_PTR(-ENOMEM);
+
+       t->group = group;
+       t->state = state;
+       t->threshold = threshold_us * NSEC_PER_USEC;
+       t->win.size = window_us * NSEC_PER_USEC;
+       window_reset(&t->win, 0, 0, 0);
+
+       t->event = 0;
+       t->last_event_time = 0;
+       init_waitqueue_head(&t->event_wait);
+       kref_init(&t->refcount);
+
+       mutex_lock(&group->trigger_lock);
+
+       if (!rcu_access_pointer(group->poll_kworker)) {
+               struct sched_param param = {
+                       .sched_priority = 1,
+               };
+               struct kthread_worker *kworker;
+
+               kworker = kthread_create_worker(0, "psimon");
+               if (IS_ERR(kworker)) {
+                       kfree(t);
+                       mutex_unlock(&group->trigger_lock);
+                       return ERR_CAST(kworker);
+               }
+               sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
+               kthread_init_delayed_work(&group->poll_work,
+                               psi_poll_work);
+               rcu_assign_pointer(group->poll_kworker, kworker);
+       }
+
+       list_add(&t->node, &group->triggers);
+       group->poll_min_period = min(group->poll_min_period,
+               div_u64(t->win.size, UPDATES_PER_WINDOW));
+       group->nr_triggers[t->state]++;
+       group->poll_states |= (1 << t->state);
+
+       mutex_unlock(&group->trigger_lock);
+
+       return t;
+}
+
+static void psi_trigger_destroy(struct kref *ref)
+{
+       struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
+       struct psi_group *group = t->group;
+       struct kthread_worker *kworker_to_destroy = NULL;
+
+       if (static_branch_likely(&psi_disabled))
+               return;
+
+       /*
+        * Wake up waiters so they stop polling. This can happen if a cgroup
+        * is deleted from under a polling process.
+        */
+       wake_up_interruptible(&t->event_wait);
+
+       mutex_lock(&group->trigger_lock);
+
+       if (!list_empty(&t->node)) {
+               struct psi_trigger *tmp;
+               u64 period = ULLONG_MAX;
+
+               list_del(&t->node);
+               group->nr_triggers[t->state]--;
+               if (!group->nr_triggers[t->state])
+                       group->poll_states &= ~(1 << t->state);
+               /* reset min update period for the remaining triggers */
+               list_for_each_entry(tmp, &group->triggers, node)
+                       period = min(period, div_u64(tmp->win.size,
+                                       UPDATES_PER_WINDOW));
+               group->poll_min_period = period;
+               /* Destroy poll_kworker when the last trigger is destroyed */
+               if (group->poll_states == 0) {
+                       group->polling_until = 0;
+                       kworker_to_destroy = rcu_dereference_protected(
+                                       group->poll_kworker,
+                                       lockdep_is_held(&group->trigger_lock));
+                       rcu_assign_pointer(group->poll_kworker, NULL);
+               }
+       }
+
+       mutex_unlock(&group->trigger_lock);
+
+       /*
+        * Wait for the RCU read-side critical sections covering both
+        * *trigger_ptr (from psi_trigger_replace()) and poll_kworker to
+        * complete before destroying the trigger and, optionally, the kworker.
+        */
+       synchronize_rcu();
+       /*
+        * Destroy the kworker after releasing trigger_lock to prevent a
+        * deadlock while waiting for psi_poll_work to acquire trigger_lock
+        */
+       if (kworker_to_destroy) {
+               /*
+                * After the RCU grace period has expired, the worker
+                * can no longer be found through group->poll_kworker.
+                * But it might have been already scheduled before
+                * that - deschedule it cleanly before destroying it.
+                */
+               kthread_cancel_delayed_work_sync(&group->poll_work);
+               atomic_set(&group->poll_scheduled, 0);
+
+               kthread_destroy_worker(kworker_to_destroy);
+       }
+       kfree(t);
+}
+
+void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
+{
+       struct psi_trigger *old = *trigger_ptr;
+
+       if (static_branch_likely(&psi_disabled))
+               return;
+
+       rcu_assign_pointer(*trigger_ptr, new);
+       if (old)
+               kref_put(&old->refcount, psi_trigger_destroy);
+}
+
+__poll_t psi_trigger_poll(void **trigger_ptr,
+                               struct file *file, poll_table *wait)
+{
+       __poll_t ret = DEFAULT_POLLMASK;
+       struct psi_trigger *t;
+
+       if (static_branch_likely(&psi_disabled))
+               return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+
+       rcu_read_lock();
+
+       t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
+       if (!t) {
+               rcu_read_unlock();
+               return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+       }
+       kref_get(&t->refcount);
+
+       rcu_read_unlock();
+
+       poll_wait(file, &t->event_wait, wait);
+
+       if (cmpxchg(&t->event, 1, 0) == 1)
+               ret |= EPOLLPRI;
+
+       kref_put(&t->refcount, psi_trigger_destroy);
+
+       return ret;
+}
+
+static ssize_t psi_write(struct file *file, const char __user *user_buf,
+                        size_t nbytes, enum psi_res res)
+{
+       char buf[32];
+       size_t buf_size;
+       struct seq_file *seq;
+       struct psi_trigger *new;
+
+       if (static_branch_likely(&psi_disabled))
+               return -EOPNOTSUPP;
+
+       /* Reject empty writes: a buf_size of 0 would underflow buf[] below */
+       if (!nbytes)
+               return -EINVAL;
+
+       buf_size = min(nbytes, (sizeof(buf) - 1));
+       if (copy_from_user(buf, user_buf, buf_size))
+               return -EFAULT;
+
+       buf[buf_size - 1] = '\0';
+
+       new = psi_trigger_create(&psi_system, buf, nbytes, res);
+       if (IS_ERR(new))
+               return PTR_ERR(new);
+
+       seq = file->private_data;
+       /* Take seq->lock to protect seq->private from concurrent writes */
+       mutex_lock(&seq->lock);
+       psi_trigger_replace(&seq->private, new);
+       mutex_unlock(&seq->lock);
+
+       return nbytes;
+}
+
+static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
+                           size_t nbytes, loff_t *ppos)
+{
+       return psi_write(file, user_buf, nbytes, PSI_IO);
+}
+
+static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
+                               size_t nbytes, loff_t *ppos)
+{
+       return psi_write(file, user_buf, nbytes, PSI_MEM);
+}
+
+static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
+                            size_t nbytes, loff_t *ppos)
+{
+       return psi_write(file, user_buf, nbytes, PSI_CPU);
+}
+
+static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
+{
+       struct seq_file *seq = file->private_data;
+
+       return psi_trigger_poll(&seq->private, file, wait);
+}
+
+static int psi_fop_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq = file->private_data;
+
+       psi_trigger_replace(&seq->private, NULL);
+       return single_release(inode, file);
+}
+
 static const struct file_operations psi_io_fops = {
        .open           = psi_io_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
-       .release        = single_release,
+       .write          = psi_io_write,
+       .poll           = psi_fop_poll,
+       .release        = psi_fop_release,
 };
 
 static const struct file_operations psi_memory_fops = {
        .open           = psi_memory_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
-       .release        = single_release,
+       .write          = psi_memory_write,
+       .poll           = psi_fop_poll,
+       .release        = psi_fop_release,
 };
 
 static const struct file_operations psi_cpu_fops = {
        .open           = psi_cpu_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
-       .release        = single_release,
+       .write          = psi_cpu_write,
+       .poll           = psi_fop_poll,
+       .release        = psi_fop_release,
 };
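
For reference, this is how the write()/poll() plumbing above is meant to be driven from userspace. The example is not part of the patch; it is loosely based on the sample monitor in Documentation/accounting/psi.txt, and the trigger armed here (150ms of "some" memory stall within any 1s window) is only illustrative.

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Arm a "some 150000 1000000" trigger on /proc/pressure/memory:
 * notify when memory stall time exceeds 150ms within any 1s window.
 */
int main(void)
{
        const char trig[] = "some 150000 1000000";
        struct pollfd fds;

        fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
        if (fds.fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
                perror("write");
                return 1;
        }
        fds.events = POLLPRI;

        for (;;) {
                int n = poll(&fds, 1, -1);

                if (n < 0) {
                        perror("poll");
                        return 1;
                }
                if (fds.revents & POLLERR) {
                        fprintf(stderr, "event source is gone\n");
                        return 0;
                }
                if (fds.revents & POLLPRI)
                        printf("memory pressure event\n");
        }
}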
 
 static int __init psi_proc_init(void)