Merge tag 'driver-core-5.3-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
[sfrench/cifs-2.6.git] / drivers / thermal / intel / intel_powerclamp.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_powerclamp.c - package c-state idle injection
4  *
5  * Copyright (c) 2012, Intel Corporation.
6  *
7  * Authors:
8  *     Arjan van de Ven <arjan@linux.intel.com>
9  *     Jacob Pan <jacob.jun.pan@linux.intel.com>
10  *
11  *      TODO:
12  *           1. better handle wakeup from external interrupts, currently a fixed
13  *              compensation is added to clamping duration when excessive amount
14  *              of wakeups are observed during idle time. the reason is that in
15  *              case of external interrupts without need for ack, clamping down
16  *              cpu in non-irq context does not reduce irq. for majority of the
17  *              cases, clamping down cpu does help reduce irq as well, we should
18  *              be able to differentiate the two cases and give a quantitative
19  *              solution for the irqs that we can control. perhaps based on
20  *              get_cpu_iowait_time_us()
21  *
22  *           2. synchronization with other hw blocks
23  */
24
25 #define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
26
27 #include <linux/module.h>
28 #include <linux/kernel.h>
29 #include <linux/delay.h>
30 #include <linux/kthread.h>
31 #include <linux/cpu.h>
32 #include <linux/thermal.h>
33 #include <linux/slab.h>
34 #include <linux/tick.h>
35 #include <linux/debugfs.h>
36 #include <linux/seq_file.h>
37 #include <linux/sched/rt.h>
38 #include <uapi/linux/sched/types.h>
39
40 #include <asm/nmi.h>
41 #include <asm/msr.h>
42 #include <asm/mwait.h>
43 #include <asm/cpu_device_id.h>
44 #include <asm/hardirq.h>
45
/*
 * Exclusive upper bound for the idle ratio, in percent: requested ratios are
 * clamped to MAX_TARGET_RATIO - 1 and the calibration table has this many
 * entries.
 */
#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to meet
 * the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

/* value computed by find_target_mwait() from the CPUID MWAIT leaf */
static unsigned int target_mwait;
/* debugfs directory created by powerclamp_create_debug_files() */
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
/*
 * last measured ratio (package C-state counter delta * 100 / TSC delta),
 * updated by powerclamp_adjust_controls()
 */
static unsigned int current_ratio;
/*
 * result of the last powerclamp_adjust_controls() call; while set,
 * clamp_idle_injection_func() does not call play_idle()
 */
static bool should_skip;
/*
 * set by powerclamp_adjust_controls() when idle_wakeup_counter reaches
 * 2 * window * online CPUs; get_compensation() then uses the ratio itself
 * as compensation
 */
static bool reduce_irq;
/*
 * read and reset by powerclamp_adjust_controls().
 * NOTE(review): no increment of this counter is visible in this file.
 */
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and update
                                  * control parameters. Defaults to the BSP,
                                  * but the BSP can be offlined.
                                  */
/* global on/off switch: set by start_power_clamp(), cleared by end_power_clamp() */
static bool clamping;
72
/* scheduling parameters applied (with SCHED_FIFO) to every injection worker */
static const struct sched_param sparam = {
        .sched_priority = MAX_USER_RT_PRIO / 2,
};

/* Per-CPU state of one idle injection kthread worker. */
struct powerclamp_worker_data {
        /* worker created by start_power_clamp_worker(); NULL once stopped */
        struct kthread_worker *worker;
        /* runs clamp_balancing_func() */
        struct kthread_work balancing_work;
        /* runs clamp_idle_injection_func() after the computed delay */
        struct kthread_delayed_work idle_injection_work;
        /* CPU this worker was created on */
        unsigned int cpu;
        /* incremented on every balancing round */
        unsigned int count;
        /* 1 + target_ratio / 20, refreshed every balancing round */
        unsigned int guard;
        /* snapshot of window_size taken in the balancing round */
        unsigned int window_size_now;
        /* snapshot of set_target_ratio taken in the balancing round */
        unsigned int target_ratio;
        /* msecs_to_jiffies(duration), refreshed every balancing round */
        unsigned int duration_jiffies;
        /* per-worker enable flag; cleared first in stop_power_clamp_worker() */
        bool clamping;
};
88
/* per-CPU worker state, allocated in powerclamp_init() */
static struct powerclamp_worker_data __percpu *worker_data;
/* cooling device registered with the thermal layer in powerclamp_init() */
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
                                           * clamping kthread worker
                                           */

/* "duration" module parameter: forced idle time per attempt, in msec */
static unsigned int duration;
/* ratio last computed by poll_pkg_cstate(); reported by get_cur_state */
static unsigned int pkg_cstate_ratio_cur;
/* "window_size" module parameter: number of clamping cycles per window */
static unsigned int window_size;
98
99 static int duration_set(const char *arg, const struct kernel_param *kp)
100 {
101         int ret = 0;
102         unsigned long new_duration;
103
104         ret = kstrtoul(arg, 10, &new_duration);
105         if (ret)
106                 goto exit;
107         if (new_duration > 25 || new_duration < 6) {
108                 pr_err("Out of recommended range %lu, between 6-25ms\n",
109                         new_duration);
110                 ret = -EINVAL;
111         }
112
113         duration = clamp(new_duration, 6ul, 25ul);
114         smp_mb();
115
116 exit:
117
118         return ret;
119 }
120
121 static const struct kernel_param_ops duration_ops = {
122         .set = duration_set,
123         .get = param_get_int,
124 };
125
126
127 module_param_cb(duration, &duration_ops, &duration, 0644);
128 MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
129
/* Runtime calibration record kept for one target ratio. */
struct powerclamp_calibration_data {
        unsigned long confidence;  /* used for calibration: a counter that is
                                    * incremented each time a clamping period
                                    * completes without extra wakeups. Once
                                    * the counter reaches CONFIDENCE_OK the
                                    * compensation is deemed usable.
                                    */
        unsigned long steady_comp; /* steady state compensation used when
                                    * no extra wakeups occurred.
                                    */
        unsigned long dynamic_comp; /* compensation for excessive wakeups
                                     * from idle, mostly caused by external
                                     * interrupts.
                                     */
};

/* calibration table, indexed by target ratio */
static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
146
147 static int window_size_set(const char *arg, const struct kernel_param *kp)
148 {
149         int ret = 0;
150         unsigned long new_window_size;
151
152         ret = kstrtoul(arg, 10, &new_window_size);
153         if (ret)
154                 goto exit_win;
155         if (new_window_size > 10 || new_window_size < 2) {
156                 pr_err("Out of recommended window size %lu, between 2-10\n",
157                         new_window_size);
158                 ret = -EINVAL;
159         }
160
161         window_size = clamp(new_window_size, 2ul, 10ul);
162         smp_mb();
163
164 exit_win:
165
166         return ret;
167 }
168
169 static const struct kernel_param_ops window_size_ops = {
170         .set = window_size_set,
171         .get = param_get_int,
172 };
173
174 module_param_cb(window_size, &window_size_ops, &window_size, 0644);
175 MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
176         "\tpowerclamp controls idle ratio within this window. larger\n"
177         "\twindow size results in slower response time but more smooth\n"
178         "\tclamping results. default to 2.");
179
180 static void find_target_mwait(void)
181 {
182         unsigned int eax, ebx, ecx, edx;
183         unsigned int highest_cstate = 0;
184         unsigned int highest_subcstate = 0;
185         int i;
186
187         if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
188                 return;
189
190         cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
191
192         if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
193             !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
194                 return;
195
196         edx >>= MWAIT_SUBSTATE_SIZE;
197         for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
198                 if (edx & MWAIT_SUBSTATE_MASK) {
199                         highest_cstate = i;
200                         highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
201                 }
202         }
203         target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
204                 (highest_subcstate - 1);
205
206 }
207
208 struct pkg_cstate_info {
209         bool skip;
210         int msr_index;
211         int cstate_id;
212 };
213
214 #define PKG_CSTATE_INIT(id) {                           \
215                 .msr_index = MSR_PKG_C##id##_RESIDENCY, \
216                 .cstate_id = id                         \
217                         }
218
219 static struct pkg_cstate_info pkg_cstates[] = {
220         PKG_CSTATE_INIT(2),
221         PKG_CSTATE_INIT(3),
222         PKG_CSTATE_INIT(6),
223         PKG_CSTATE_INIT(7),
224         PKG_CSTATE_INIT(8),
225         PKG_CSTATE_INIT(9),
226         PKG_CSTATE_INIT(10),
227         {NULL},
228 };
229
230 static bool has_pkg_state_counter(void)
231 {
232         u64 val;
233         struct pkg_cstate_info *info = pkg_cstates;
234
235         /* check if any one of the counter msrs exists */
236         while (info->msr_index) {
237                 if (!rdmsrl_safe(info->msr_index, &val))
238                         return true;
239                 info++;
240         }
241
242         return false;
243 }
244
245 static u64 pkg_state_counter(void)
246 {
247         u64 val;
248         u64 count = 0;
249         struct pkg_cstate_info *info = pkg_cstates;
250
251         while (info->msr_index) {
252                 if (!info->skip) {
253                         if (!rdmsrl_safe(info->msr_index, &val))
254                                 count += val;
255                         else
256                                 info->skip = true;
257                 }
258                 info++;
259         }
260
261         return count;
262 }
263
264 static unsigned int get_compensation(int ratio)
265 {
266         unsigned int comp = 0;
267
268         /* we only use compensation if all adjacent ones are good */
269         if (ratio == 1 &&
270                 cal_data[ratio].confidence >= CONFIDENCE_OK &&
271                 cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
272                 cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
273                 comp = (cal_data[ratio].steady_comp +
274                         cal_data[ratio + 1].steady_comp +
275                         cal_data[ratio + 2].steady_comp) / 3;
276         } else if (ratio == MAX_TARGET_RATIO - 1 &&
277                 cal_data[ratio].confidence >= CONFIDENCE_OK &&
278                 cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
279                 cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
280                 comp = (cal_data[ratio].steady_comp +
281                         cal_data[ratio - 1].steady_comp +
282                         cal_data[ratio - 2].steady_comp) / 3;
283         } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
284                 cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
285                 cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
286                 comp = (cal_data[ratio].steady_comp +
287                         cal_data[ratio - 1].steady_comp +
288                         cal_data[ratio + 1].steady_comp) / 3;
289         }
290
291         /* REVISIT: simple penalty of double idle injection */
292         if (reduce_irq)
293                 comp = ratio;
294         /* do not exceed limit */
295         if (comp + ratio >= MAX_TARGET_RATIO)
296                 comp = MAX_TARGET_RATIO - ratio - 1;
297
298         return comp;
299 }
300
301 static void adjust_compensation(int target_ratio, unsigned int win)
302 {
303         int delta;
304         struct powerclamp_calibration_data *d = &cal_data[target_ratio];
305
306         /*
307          * adjust compensations if confidence level has not been reached or
308          * there are too many wakeups during the last idle injection period, we
309          * cannot trust the data for compensation.
310          */
311         if (d->confidence >= CONFIDENCE_OK ||
312                 atomic_read(&idle_wakeup_counter) >
313                 win * num_online_cpus())
314                 return;
315
316         delta = set_target_ratio - current_ratio;
317         /* filter out bad data */
318         if (delta >= 0 && delta <= (1+target_ratio/10)) {
319                 if (d->steady_comp)
320                         d->steady_comp =
321                                 roundup(delta+d->steady_comp, 2)/2;
322                 else
323                         d->steady_comp = delta;
324                 d->confidence++;
325         }
326 }
327
328 static bool powerclamp_adjust_controls(unsigned int target_ratio,
329                                 unsigned int guard, unsigned int win)
330 {
331         static u64 msr_last, tsc_last;
332         u64 msr_now, tsc_now;
333         u64 val64;
334
335         /* check result for the last window */
336         msr_now = pkg_state_counter();
337         tsc_now = rdtsc();
338
339         /* calculate pkg cstate vs tsc ratio */
340         if (!msr_last || !tsc_last)
341                 current_ratio = 1;
342         else if (tsc_now-tsc_last) {
343                 val64 = 100*(msr_now-msr_last);
344                 do_div(val64, (tsc_now-tsc_last));
345                 current_ratio = val64;
346         }
347
348         /* update record */
349         msr_last = msr_now;
350         tsc_last = tsc_now;
351
352         adjust_compensation(target_ratio, win);
353         /*
354          * too many external interrupts, set flag such
355          * that we can take measure later.
356          */
357         reduce_irq = atomic_read(&idle_wakeup_counter) >=
358                 2 * win * num_online_cpus();
359
360         atomic_set(&idle_wakeup_counter, 0);
361         /* if we are above target+guard, skip */
362         return set_target_ratio + guard <= current_ratio;
363 }
364
365 static void clamp_balancing_func(struct kthread_work *work)
366 {
367         struct powerclamp_worker_data *w_data;
368         int sleeptime;
369         unsigned long target_jiffies;
370         unsigned int compensated_ratio;
371         int interval; /* jiffies to sleep for each attempt */
372
373         w_data = container_of(work, struct powerclamp_worker_data,
374                               balancing_work);
375
376         /*
377          * make sure user selected ratio does not take effect until
378          * the next round. adjust target_ratio if user has changed
379          * target such that we can converge quickly.
380          */
381         w_data->target_ratio = READ_ONCE(set_target_ratio);
382         w_data->guard = 1 + w_data->target_ratio / 20;
383         w_data->window_size_now = window_size;
384         w_data->duration_jiffies = msecs_to_jiffies(duration);
385         w_data->count++;
386
387         /*
388          * systems may have different ability to enter package level
389          * c-states, thus we need to compensate the injected idle ratio
390          * to achieve the actual target reported by the HW.
391          */
392         compensated_ratio = w_data->target_ratio +
393                 get_compensation(w_data->target_ratio);
394         if (compensated_ratio <= 0)
395                 compensated_ratio = 1;
396         interval = w_data->duration_jiffies * 100 / compensated_ratio;
397
398         /* align idle time */
399         target_jiffies = roundup(jiffies, interval);
400         sleeptime = target_jiffies - jiffies;
401         if (sleeptime <= 0)
402                 sleeptime = 1;
403
404         if (clamping && w_data->clamping && cpu_online(w_data->cpu))
405                 kthread_queue_delayed_work(w_data->worker,
406                                            &w_data->idle_injection_work,
407                                            sleeptime);
408 }
409
410 static void clamp_idle_injection_func(struct kthread_work *work)
411 {
412         struct powerclamp_worker_data *w_data;
413
414         w_data = container_of(work, struct powerclamp_worker_data,
415                               idle_injection_work.work);
416
417         /*
418          * only elected controlling cpu can collect stats and update
419          * control parameters.
420          */
421         if (w_data->cpu == control_cpu &&
422             !(w_data->count % w_data->window_size_now)) {
423                 should_skip =
424                         powerclamp_adjust_controls(w_data->target_ratio,
425                                                    w_data->guard,
426                                                    w_data->window_size_now);
427                 smp_mb();
428         }
429
430         if (should_skip)
431                 goto balance;
432
433         play_idle(jiffies_to_msecs(w_data->duration_jiffies));
434
435 balance:
436         if (clamping && w_data->clamping && cpu_online(w_data->cpu))
437                 kthread_queue_work(w_data->worker, &w_data->balancing_work);
438 }
439
440 /*
441  * 1 HZ polling while clamping is active, useful for userspace
442  * to monitor actual idle ratio.
443  */
444 static void poll_pkg_cstate(struct work_struct *dummy);
445 static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
446 static void poll_pkg_cstate(struct work_struct *dummy)
447 {
448         static u64 msr_last;
449         static u64 tsc_last;
450
451         u64 msr_now;
452         u64 tsc_now;
453         u64 val64;
454
455         msr_now = pkg_state_counter();
456         tsc_now = rdtsc();
457
458         /* calculate pkg cstate vs tsc ratio */
459         if (!msr_last || !tsc_last)
460                 pkg_cstate_ratio_cur = 1;
461         else {
462                 if (tsc_now - tsc_last) {
463                         val64 = 100 * (msr_now - msr_last);
464                         do_div(val64, (tsc_now - tsc_last));
465                         pkg_cstate_ratio_cur = val64;
466                 }
467         }
468
469         /* update record */
470         msr_last = msr_now;
471         tsc_last = tsc_now;
472
473         if (true == clamping)
474                 schedule_delayed_work(&poll_pkg_cstate_work, HZ);
475 }
476
477 static void start_power_clamp_worker(unsigned long cpu)
478 {
479         struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
480         struct kthread_worker *worker;
481
482         worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
483         if (IS_ERR(worker))
484                 return;
485
486         w_data->worker = worker;
487         w_data->count = 0;
488         w_data->cpu = cpu;
489         w_data->clamping = true;
490         set_bit(cpu, cpu_clamping_mask);
491         sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
492         kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
493         kthread_init_delayed_work(&w_data->idle_injection_work,
494                                   clamp_idle_injection_func);
495         kthread_queue_work(w_data->worker, &w_data->balancing_work);
496 }
497
498 static void stop_power_clamp_worker(unsigned long cpu)
499 {
500         struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
501
502         if (!w_data->worker)
503                 return;
504
505         w_data->clamping = false;
506         /*
507          * Make sure that all works that get queued after this point see
508          * the clamping disabled. The counter part is not needed because
509          * there is an implicit memory barrier when the queued work
510          * is proceed.
511          */
512         smp_wmb();
513         kthread_cancel_work_sync(&w_data->balancing_work);
514         kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
515         /*
516          * The balancing work still might be queued here because
517          * the handling of the "clapming" variable, cancel, and queue
518          * operations are not synchronized via a lock. But it is not
519          * a big deal. The balancing work is fast and destroy kthread
520          * will wait for it.
521          */
522         clear_bit(w_data->cpu, cpu_clamping_mask);
523         kthread_destroy_worker(w_data->worker);
524
525         w_data->worker = NULL;
526 }
527
528 static int start_power_clamp(void)
529 {
530         unsigned long cpu;
531
532         set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
533         /* prevent cpu hotplug */
534         get_online_cpus();
535
536         /* prefer BSP */
537         control_cpu = 0;
538         if (!cpu_online(control_cpu))
539                 control_cpu = smp_processor_id();
540
541         clamping = true;
542         schedule_delayed_work(&poll_pkg_cstate_work, 0);
543
544         /* start one kthread worker per online cpu */
545         for_each_online_cpu(cpu) {
546                 start_power_clamp_worker(cpu);
547         }
548         put_online_cpus();
549
550         return 0;
551 }
552
553 static void end_power_clamp(void)
554 {
555         int i;
556
557         /*
558          * Block requeuing in all the kthread workers. They will flush and
559          * stop faster.
560          */
561         clamping = false;
562         if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
563                 for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
564                         pr_debug("clamping worker for cpu %d alive, destroy\n",
565                                  i);
566                         stop_power_clamp_worker(i);
567                 }
568         }
569 }
570
571 static int powerclamp_cpu_online(unsigned int cpu)
572 {
573         if (clamping == false)
574                 return 0;
575         start_power_clamp_worker(cpu);
576         /* prefer BSP as controlling CPU */
577         if (cpu == 0) {
578                 control_cpu = 0;
579                 smp_mb();
580         }
581         return 0;
582 }
583
584 static int powerclamp_cpu_predown(unsigned int cpu)
585 {
586         if (clamping == false)
587                 return 0;
588
589         stop_power_clamp_worker(cpu);
590         if (cpu != control_cpu)
591                 return 0;
592
593         control_cpu = cpumask_first(cpu_online_mask);
594         if (control_cpu == cpu)
595                 control_cpu = cpumask_next(cpu, cpu_online_mask);
596         smp_mb();
597         return 0;
598 }
599
600 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
601                                  unsigned long *state)
602 {
603         *state = MAX_TARGET_RATIO;
604
605         return 0;
606 }
607
608 static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
609                                  unsigned long *state)
610 {
611         if (true == clamping)
612                 *state = pkg_cstate_ratio_cur;
613         else
614                 /* to save power, do not poll idle ratio while not clamping */
615                 *state = -1; /* indicates invalid state */
616
617         return 0;
618 }
619
620 static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
621                                  unsigned long new_target_ratio)
622 {
623         int ret = 0;
624
625         new_target_ratio = clamp(new_target_ratio, 0UL,
626                                 (unsigned long) (MAX_TARGET_RATIO-1));
627         if (set_target_ratio == 0 && new_target_ratio > 0) {
628                 pr_info("Start idle injection to reduce power\n");
629                 set_target_ratio = new_target_ratio;
630                 ret = start_power_clamp();
631                 goto exit_set;
632         } else  if (set_target_ratio > 0 && new_target_ratio == 0) {
633                 pr_info("Stop forced idle injection\n");
634                 end_power_clamp();
635                 set_target_ratio = 0;
636         } else  /* adjust currently running */ {
637                 set_target_ratio = new_target_ratio;
638                 /* make new set_target_ratio visible to other cpus */
639                 smp_mb();
640         }
641
642 exit_set:
643         return ret;
644 }
645
/*
 * Callbacks that bind this driver to the generic thermal layer as a cooling
 * device; passed to thermal_cooling_device_register() in powerclamp_init().
 */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
        .get_max_state = powerclamp_get_max_state,
        .get_cur_state = powerclamp_get_cur_state,
        .set_cur_state = powerclamp_set_cur_state,
};
652
/*
 * CPU match table checked by powerclamp_probe(): any Intel CPU, of any
 * family and model, that has the MWAIT feature.
 */
static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
        {}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
658
659 static int __init powerclamp_probe(void)
660 {
661
662         if (!x86_match_cpu(intel_powerclamp_ids)) {
663                 pr_err("CPU does not support MWAIT\n");
664                 return -ENODEV;
665         }
666
667         /* The goal for idle time alignment is to achieve package cstate. */
668         if (!has_pkg_state_counter()) {
669                 pr_info("No package C-state available\n");
670                 return -ENODEV;
671         }
672
673         /* find the deepest mwait value */
674         find_target_mwait();
675
676         return 0;
677 }
678
679 static int powerclamp_debug_show(struct seq_file *m, void *unused)
680 {
681         int i = 0;
682
683         seq_printf(m, "controlling cpu: %d\n", control_cpu);
684         seq_printf(m, "pct confidence steady dynamic (compensation)\n");
685         for (i = 0; i < MAX_TARGET_RATIO; i++) {
686                 seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
687                         i,
688                         cal_data[i].confidence,
689                         cal_data[i].steady_comp,
690                         cal_data[i].dynamic_comp);
691         }
692
693         return 0;
694 }
695
696 DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
697
698 static inline void powerclamp_create_debug_files(void)
699 {
700         debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
701
702         debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
703                             &powerclamp_debug_fops);
704 }
705
706 static enum cpuhp_state hp_state;
707
708 static int __init powerclamp_init(void)
709 {
710         int retval;
711         int bitmap_size;
712
713         bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
714         cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
715         if (!cpu_clamping_mask)
716                 return -ENOMEM;
717
718         /* probe cpu features and ids here */
719         retval = powerclamp_probe();
720         if (retval)
721                 goto exit_free;
722
723         /* set default limit, maybe adjusted during runtime based on feedback */
724         window_size = 2;
725         retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
726                                            "thermal/intel_powerclamp:online",
727                                            powerclamp_cpu_online,
728                                            powerclamp_cpu_predown);
729         if (retval < 0)
730                 goto exit_free;
731
732         hp_state = retval;
733
734         worker_data = alloc_percpu(struct powerclamp_worker_data);
735         if (!worker_data) {
736                 retval = -ENOMEM;
737                 goto exit_unregister;
738         }
739
740         cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
741                                                 &powerclamp_cooling_ops);
742         if (IS_ERR(cooling_dev)) {
743                 retval = -ENODEV;
744                 goto exit_free_thread;
745         }
746
747         if (!duration)
748                 duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
749
750         powerclamp_create_debug_files();
751
752         return 0;
753
754 exit_free_thread:
755         free_percpu(worker_data);
756 exit_unregister:
757         cpuhp_remove_state_nocalls(hp_state);
758 exit_free:
759         kfree(cpu_clamping_mask);
760         return retval;
761 }
762 module_init(powerclamp_init);
763
764 static void __exit powerclamp_exit(void)
765 {
766         end_power_clamp();
767         cpuhp_remove_state_nocalls(hp_state);
768         free_percpu(worker_data);
769         thermal_cooling_device_unregister(cooling_dev);
770         kfree(cpu_clamping_mask);
771
772         cancel_delayed_work_sync(&poll_pkg_cstate_work);
773         debugfs_remove_recursive(debug_dir);
774 }
775 module_exit(powerclamp_exit);
776
777 MODULE_LICENSE("GPL");
778 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
779 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
780 MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");