Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux
[sfrench/cifs-2.6.git] / drivers / thermal / intel / intel_powerclamp.c
1 /*
2  * intel_powerclamp.c - package c-state idle injection
3  *
4  * Copyright (c) 2012, Intel Corporation.
5  *
6  * Authors:
7  *     Arjan van de Ven <arjan@linux.intel.com>
8  *     Jacob Pan <jacob.jun.pan@linux.intel.com>
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms and conditions of the GNU General Public License,
12  * version 2, as published by the Free Software Foundation.
13  *
14  * This program is distributed in the hope it will be useful, but WITHOUT
15  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
17  * more details.
18  *
19  * You should have received a copy of the GNU General Public License along with
20  * this program; if not, write to the Free Software Foundation, Inc.,
21  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
22  *
23  *
24  *      TODO:
25  *           1. better handle wakeup from external interrupts, currently a fixed
26  *              compensation is added to clamping duration when excessive amount
27  *              of wakeups are observed during idle time. the reason is that in
28  *              case of external interrupts without need for ack, clamping down
29  *              cpu in non-irq context does not reduce irq. for majority of the
30  *              cases, clamping down cpu does help reduce irq as well, we should
31  *              be able to differentiate the two cases and give a quantitative
32  *              solution for the irqs that we can control. perhaps based on
33  *              get_cpu_iowait_time_us()
34  *
35  *           2. synchronization with other hw blocks
36  *
37  *
38  */
39
40 #define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
41
42 #include <linux/module.h>
43 #include <linux/kernel.h>
44 #include <linux/delay.h>
45 #include <linux/kthread.h>
46 #include <linux/cpu.h>
47 #include <linux/thermal.h>
48 #include <linux/slab.h>
49 #include <linux/tick.h>
50 #include <linux/debugfs.h>
51 #include <linux/seq_file.h>
52 #include <linux/sched/rt.h>
53 #include <uapi/linux/sched/types.h>
54
55 #include <asm/nmi.h>
56 #include <asm/msr.h>
57 #include <asm/mwait.h>
58 #include <asm/cpu_device_id.h>
59 #include <asm/hardirq.h>
60
/*
 * Exclusive upper bound for the target idle ratio: cal_data[] has this many
 * entries and requested ratios are clamped to MAX_TARGET_RATIO - 1.
 */
#define MAX_TARGET_RATIO (50U)

/*
 * The confidence counter of a target ratio is incremented for every
 * undisturbed clamping period (no extra wake ups during idle time).
 * Runtime calibration results are considered valid once the counter
 * reaches CONFIDENCE_OK.
 */
#define CONFIDENCE_OK (3)

/*
 * Default idle injection duration. The driver adjusts the sleep time to meet
 * the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
72
73 static unsigned int target_mwait;
74 static struct dentry *debug_dir;
75
76 /* user selected target */
77 static unsigned int set_target_ratio;
78 static unsigned int current_ratio;
79 static bool should_skip;
80 static bool reduce_irq;
81 static atomic_t idle_wakeup_counter;
82 static unsigned int control_cpu; /* The cpu assigned to collect stat and update
83                                   * control parameters. default to BSP but BSP
84                                   * can be offlined.
85                                   */
86 static bool clamping;
87
88 static const struct sched_param sparam = {
89         .sched_priority = MAX_USER_RT_PRIO / 2,
90 };
91 struct powerclamp_worker_data {
92         struct kthread_worker *worker;
93         struct kthread_work balancing_work;
94         struct kthread_delayed_work idle_injection_work;
95         unsigned int cpu;
96         unsigned int count;
97         unsigned int guard;
98         unsigned int window_size_now;
99         unsigned int target_ratio;
100         unsigned int duration_jiffies;
101         bool clamping;
102 };
103
104 static struct powerclamp_worker_data * __percpu worker_data;
105 static struct thermal_cooling_device *cooling_dev;
106 static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
107                                            * clamping kthread worker
108                                            */
109
110 static unsigned int duration;
111 static unsigned int pkg_cstate_ratio_cur;
112 static unsigned int window_size;
113
114 static int duration_set(const char *arg, const struct kernel_param *kp)
115 {
116         int ret = 0;
117         unsigned long new_duration;
118
119         ret = kstrtoul(arg, 10, &new_duration);
120         if (ret)
121                 goto exit;
122         if (new_duration > 25 || new_duration < 6) {
123                 pr_err("Out of recommended range %lu, between 6-25ms\n",
124                         new_duration);
125                 ret = -EINVAL;
126         }
127
128         duration = clamp(new_duration, 6ul, 25ul);
129         smp_mb();
130
131 exit:
132
133         return ret;
134 }
135
136 static const struct kernel_param_ops duration_ops = {
137         .set = duration_set,
138         .get = param_get_int,
139 };
140
141
142 module_param_cb(duration, &duration_ops, &duration, 0644);
143 MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
144
145 struct powerclamp_calibration_data {
146         unsigned long confidence;  /* used for calibration, basically a counter
147                                     * gets incremented each time a clamping
148                                     * period is completed without extra wakeups
149                                     * once that counter is reached given level,
150                                     * compensation is deemed usable.
151                                     */
152         unsigned long steady_comp; /* steady state compensation used when
153                                     * no extra wakeups occurred.
154                                     */
155         unsigned long dynamic_comp; /* compensate excessive wakeup from idle
156                                      * mostly from external interrupts.
157                                      */
158 };
159
160 static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
161
162 static int window_size_set(const char *arg, const struct kernel_param *kp)
163 {
164         int ret = 0;
165         unsigned long new_window_size;
166
167         ret = kstrtoul(arg, 10, &new_window_size);
168         if (ret)
169                 goto exit_win;
170         if (new_window_size > 10 || new_window_size < 2) {
171                 pr_err("Out of recommended window size %lu, between 2-10\n",
172                         new_window_size);
173                 ret = -EINVAL;
174         }
175
176         window_size = clamp(new_window_size, 2ul, 10ul);
177         smp_mb();
178
179 exit_win:
180
181         return ret;
182 }
183
184 static const struct kernel_param_ops window_size_ops = {
185         .set = window_size_set,
186         .get = param_get_int,
187 };
188
189 module_param_cb(window_size, &window_size_ops, &window_size, 0644);
190 MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
191         "\tpowerclamp controls idle ratio within this window. larger\n"
192         "\twindow size results in slower response time but more smooth\n"
193         "\tclamping results. default to 2.");
194
195 static void find_target_mwait(void)
196 {
197         unsigned int eax, ebx, ecx, edx;
198         unsigned int highest_cstate = 0;
199         unsigned int highest_subcstate = 0;
200         int i;
201
202         if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
203                 return;
204
205         cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
206
207         if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
208             !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
209                 return;
210
211         edx >>= MWAIT_SUBSTATE_SIZE;
212         for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
213                 if (edx & MWAIT_SUBSTATE_MASK) {
214                         highest_cstate = i;
215                         highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
216                 }
217         }
218         target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
219                 (highest_subcstate - 1);
220
221 }
222
223 struct pkg_cstate_info {
224         bool skip;
225         int msr_index;
226         int cstate_id;
227 };
228
/* Table entry for package C-state <id>; .skip starts out as false. */
#define PKG_CSTATE_INIT(id) {				\
	.msr_index = MSR_PKG_C##id##_RESIDENCY,		\
	.cstate_id = id,				\
}
233
234 static struct pkg_cstate_info pkg_cstates[] = {
235         PKG_CSTATE_INIT(2),
236         PKG_CSTATE_INIT(3),
237         PKG_CSTATE_INIT(6),
238         PKG_CSTATE_INIT(7),
239         PKG_CSTATE_INIT(8),
240         PKG_CSTATE_INIT(9),
241         PKG_CSTATE_INIT(10),
242         {NULL},
243 };
244
245 static bool has_pkg_state_counter(void)
246 {
247         u64 val;
248         struct pkg_cstate_info *info = pkg_cstates;
249
250         /* check if any one of the counter msrs exists */
251         while (info->msr_index) {
252                 if (!rdmsrl_safe(info->msr_index, &val))
253                         return true;
254                 info++;
255         }
256
257         return false;
258 }
259
260 static u64 pkg_state_counter(void)
261 {
262         u64 val;
263         u64 count = 0;
264         struct pkg_cstate_info *info = pkg_cstates;
265
266         while (info->msr_index) {
267                 if (!info->skip) {
268                         if (!rdmsrl_safe(info->msr_index, &val))
269                                 count += val;
270                         else
271                                 info->skip = true;
272                 }
273                 info++;
274         }
275
276         return count;
277 }
278
279 static unsigned int get_compensation(int ratio)
280 {
281         unsigned int comp = 0;
282
283         /* we only use compensation if all adjacent ones are good */
284         if (ratio == 1 &&
285                 cal_data[ratio].confidence >= CONFIDENCE_OK &&
286                 cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
287                 cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
288                 comp = (cal_data[ratio].steady_comp +
289                         cal_data[ratio + 1].steady_comp +
290                         cal_data[ratio + 2].steady_comp) / 3;
291         } else if (ratio == MAX_TARGET_RATIO - 1 &&
292                 cal_data[ratio].confidence >= CONFIDENCE_OK &&
293                 cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
294                 cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
295                 comp = (cal_data[ratio].steady_comp +
296                         cal_data[ratio - 1].steady_comp +
297                         cal_data[ratio - 2].steady_comp) / 3;
298         } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
299                 cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
300                 cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
301                 comp = (cal_data[ratio].steady_comp +
302                         cal_data[ratio - 1].steady_comp +
303                         cal_data[ratio + 1].steady_comp) / 3;
304         }
305
306         /* REVISIT: simple penalty of double idle injection */
307         if (reduce_irq)
308                 comp = ratio;
309         /* do not exceed limit */
310         if (comp + ratio >= MAX_TARGET_RATIO)
311                 comp = MAX_TARGET_RATIO - ratio - 1;
312
313         return comp;
314 }
315
316 static void adjust_compensation(int target_ratio, unsigned int win)
317 {
318         int delta;
319         struct powerclamp_calibration_data *d = &cal_data[target_ratio];
320
321         /*
322          * adjust compensations if confidence level has not been reached or
323          * there are too many wakeups during the last idle injection period, we
324          * cannot trust the data for compensation.
325          */
326         if (d->confidence >= CONFIDENCE_OK ||
327                 atomic_read(&idle_wakeup_counter) >
328                 win * num_online_cpus())
329                 return;
330
331         delta = set_target_ratio - current_ratio;
332         /* filter out bad data */
333         if (delta >= 0 && delta <= (1+target_ratio/10)) {
334                 if (d->steady_comp)
335                         d->steady_comp =
336                                 roundup(delta+d->steady_comp, 2)/2;
337                 else
338                         d->steady_comp = delta;
339                 d->confidence++;
340         }
341 }
342
343 static bool powerclamp_adjust_controls(unsigned int target_ratio,
344                                 unsigned int guard, unsigned int win)
345 {
346         static u64 msr_last, tsc_last;
347         u64 msr_now, tsc_now;
348         u64 val64;
349
350         /* check result for the last window */
351         msr_now = pkg_state_counter();
352         tsc_now = rdtsc();
353
354         /* calculate pkg cstate vs tsc ratio */
355         if (!msr_last || !tsc_last)
356                 current_ratio = 1;
357         else if (tsc_now-tsc_last) {
358                 val64 = 100*(msr_now-msr_last);
359                 do_div(val64, (tsc_now-tsc_last));
360                 current_ratio = val64;
361         }
362
363         /* update record */
364         msr_last = msr_now;
365         tsc_last = tsc_now;
366
367         adjust_compensation(target_ratio, win);
368         /*
369          * too many external interrupts, set flag such
370          * that we can take measure later.
371          */
372         reduce_irq = atomic_read(&idle_wakeup_counter) >=
373                 2 * win * num_online_cpus();
374
375         atomic_set(&idle_wakeup_counter, 0);
376         /* if we are above target+guard, skip */
377         return set_target_ratio + guard <= current_ratio;
378 }
379
380 static void clamp_balancing_func(struct kthread_work *work)
381 {
382         struct powerclamp_worker_data *w_data;
383         int sleeptime;
384         unsigned long target_jiffies;
385         unsigned int compensated_ratio;
386         int interval; /* jiffies to sleep for each attempt */
387
388         w_data = container_of(work, struct powerclamp_worker_data,
389                               balancing_work);
390
391         /*
392          * make sure user selected ratio does not take effect until
393          * the next round. adjust target_ratio if user has changed
394          * target such that we can converge quickly.
395          */
396         w_data->target_ratio = READ_ONCE(set_target_ratio);
397         w_data->guard = 1 + w_data->target_ratio / 20;
398         w_data->window_size_now = window_size;
399         w_data->duration_jiffies = msecs_to_jiffies(duration);
400         w_data->count++;
401
402         /*
403          * systems may have different ability to enter package level
404          * c-states, thus we need to compensate the injected idle ratio
405          * to achieve the actual target reported by the HW.
406          */
407         compensated_ratio = w_data->target_ratio +
408                 get_compensation(w_data->target_ratio);
409         if (compensated_ratio <= 0)
410                 compensated_ratio = 1;
411         interval = w_data->duration_jiffies * 100 / compensated_ratio;
412
413         /* align idle time */
414         target_jiffies = roundup(jiffies, interval);
415         sleeptime = target_jiffies - jiffies;
416         if (sleeptime <= 0)
417                 sleeptime = 1;
418
419         if (clamping && w_data->clamping && cpu_online(w_data->cpu))
420                 kthread_queue_delayed_work(w_data->worker,
421                                            &w_data->idle_injection_work,
422                                            sleeptime);
423 }
424
425 static void clamp_idle_injection_func(struct kthread_work *work)
426 {
427         struct powerclamp_worker_data *w_data;
428
429         w_data = container_of(work, struct powerclamp_worker_data,
430                               idle_injection_work.work);
431
432         /*
433          * only elected controlling cpu can collect stats and update
434          * control parameters.
435          */
436         if (w_data->cpu == control_cpu &&
437             !(w_data->count % w_data->window_size_now)) {
438                 should_skip =
439                         powerclamp_adjust_controls(w_data->target_ratio,
440                                                    w_data->guard,
441                                                    w_data->window_size_now);
442                 smp_mb();
443         }
444
445         if (should_skip)
446                 goto balance;
447
448         play_idle(jiffies_to_msecs(w_data->duration_jiffies));
449
450 balance:
451         if (clamping && w_data->clamping && cpu_online(w_data->cpu))
452                 kthread_queue_work(w_data->worker, &w_data->balancing_work);
453 }
454
455 /*
456  * 1 HZ polling while clamping is active, useful for userspace
457  * to monitor actual idle ratio.
458  */
459 static void poll_pkg_cstate(struct work_struct *dummy);
460 static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
461 static void poll_pkg_cstate(struct work_struct *dummy)
462 {
463         static u64 msr_last;
464         static u64 tsc_last;
465
466         u64 msr_now;
467         u64 tsc_now;
468         u64 val64;
469
470         msr_now = pkg_state_counter();
471         tsc_now = rdtsc();
472
473         /* calculate pkg cstate vs tsc ratio */
474         if (!msr_last || !tsc_last)
475                 pkg_cstate_ratio_cur = 1;
476         else {
477                 if (tsc_now - tsc_last) {
478                         val64 = 100 * (msr_now - msr_last);
479                         do_div(val64, (tsc_now - tsc_last));
480                         pkg_cstate_ratio_cur = val64;
481                 }
482         }
483
484         /* update record */
485         msr_last = msr_now;
486         tsc_last = tsc_now;
487
488         if (true == clamping)
489                 schedule_delayed_work(&poll_pkg_cstate_work, HZ);
490 }
491
492 static void start_power_clamp_worker(unsigned long cpu)
493 {
494         struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
495         struct kthread_worker *worker;
496
497         worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu);
498         if (IS_ERR(worker))
499                 return;
500
501         w_data->worker = worker;
502         w_data->count = 0;
503         w_data->cpu = cpu;
504         w_data->clamping = true;
505         set_bit(cpu, cpu_clamping_mask);
506         sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
507         kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
508         kthread_init_delayed_work(&w_data->idle_injection_work,
509                                   clamp_idle_injection_func);
510         kthread_queue_work(w_data->worker, &w_data->balancing_work);
511 }
512
513 static void stop_power_clamp_worker(unsigned long cpu)
514 {
515         struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
516
517         if (!w_data->worker)
518                 return;
519
520         w_data->clamping = false;
521         /*
522          * Make sure that all works that get queued after this point see
523          * the clamping disabled. The counter part is not needed because
524          * there is an implicit memory barrier when the queued work
525          * is proceed.
526          */
527         smp_wmb();
528         kthread_cancel_work_sync(&w_data->balancing_work);
529         kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
530         /*
531          * The balancing work still might be queued here because
532          * the handling of the "clapming" variable, cancel, and queue
533          * operations are not synchronized via a lock. But it is not
534          * a big deal. The balancing work is fast and destroy kthread
535          * will wait for it.
536          */
537         clear_bit(w_data->cpu, cpu_clamping_mask);
538         kthread_destroy_worker(w_data->worker);
539
540         w_data->worker = NULL;
541 }
542
543 static int start_power_clamp(void)
544 {
545         unsigned long cpu;
546
547         set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
548         /* prevent cpu hotplug */
549         get_online_cpus();
550
551         /* prefer BSP */
552         control_cpu = 0;
553         if (!cpu_online(control_cpu))
554                 control_cpu = smp_processor_id();
555
556         clamping = true;
557         schedule_delayed_work(&poll_pkg_cstate_work, 0);
558
559         /* start one kthread worker per online cpu */
560         for_each_online_cpu(cpu) {
561                 start_power_clamp_worker(cpu);
562         }
563         put_online_cpus();
564
565         return 0;
566 }
567
568 static void end_power_clamp(void)
569 {
570         int i;
571
572         /*
573          * Block requeuing in all the kthread workers. They will flush and
574          * stop faster.
575          */
576         clamping = false;
577         if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
578                 for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
579                         pr_debug("clamping worker for cpu %d alive, destroy\n",
580                                  i);
581                         stop_power_clamp_worker(i);
582                 }
583         }
584 }
585
586 static int powerclamp_cpu_online(unsigned int cpu)
587 {
588         if (clamping == false)
589                 return 0;
590         start_power_clamp_worker(cpu);
591         /* prefer BSP as controlling CPU */
592         if (cpu == 0) {
593                 control_cpu = 0;
594                 smp_mb();
595         }
596         return 0;
597 }
598
599 static int powerclamp_cpu_predown(unsigned int cpu)
600 {
601         if (clamping == false)
602                 return 0;
603
604         stop_power_clamp_worker(cpu);
605         if (cpu != control_cpu)
606                 return 0;
607
608         control_cpu = cpumask_first(cpu_online_mask);
609         if (control_cpu == cpu)
610                 control_cpu = cpumask_next(cpu, cpu_online_mask);
611         smp_mb();
612         return 0;
613 }
614
615 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
616                                  unsigned long *state)
617 {
618         *state = MAX_TARGET_RATIO;
619
620         return 0;
621 }
622
623 static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
624                                  unsigned long *state)
625 {
626         if (true == clamping)
627                 *state = pkg_cstate_ratio_cur;
628         else
629                 /* to save power, do not poll idle ratio while not clamping */
630                 *state = -1; /* indicates invalid state */
631
632         return 0;
633 }
634
635 static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
636                                  unsigned long new_target_ratio)
637 {
638         int ret = 0;
639
640         new_target_ratio = clamp(new_target_ratio, 0UL,
641                                 (unsigned long) (MAX_TARGET_RATIO-1));
642         if (set_target_ratio == 0 && new_target_ratio > 0) {
643                 pr_info("Start idle injection to reduce power\n");
644                 set_target_ratio = new_target_ratio;
645                 ret = start_power_clamp();
646                 goto exit_set;
647         } else  if (set_target_ratio > 0 && new_target_ratio == 0) {
648                 pr_info("Stop forced idle injection\n");
649                 end_power_clamp();
650                 set_target_ratio = 0;
651         } else  /* adjust currently running */ {
652                 set_target_ratio = new_target_ratio;
653                 /* make new set_target_ratio visible to other cpus */
654                 smp_mb();
655         }
656
657 exit_set:
658         return ret;
659 }
660
661 /* bind to generic thermal layer as cooling device*/
662 static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
663         .get_max_state = powerclamp_get_max_state,
664         .get_cur_state = powerclamp_get_cur_state,
665         .set_cur_state = powerclamp_set_cur_state,
666 };
667
668 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
669         { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
670         {}
671 };
672 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
673
674 static int __init powerclamp_probe(void)
675 {
676
677         if (!x86_match_cpu(intel_powerclamp_ids)) {
678                 pr_err("CPU does not support MWAIT\n");
679                 return -ENODEV;
680         }
681
682         /* The goal for idle time alignment is to achieve package cstate. */
683         if (!has_pkg_state_counter()) {
684                 pr_info("No package C-state available\n");
685                 return -ENODEV;
686         }
687
688         /* find the deepest mwait value */
689         find_target_mwait();
690
691         return 0;
692 }
693
694 static int powerclamp_debug_show(struct seq_file *m, void *unused)
695 {
696         int i = 0;
697
698         seq_printf(m, "controlling cpu: %d\n", control_cpu);
699         seq_printf(m, "pct confidence steady dynamic (compensation)\n");
700         for (i = 0; i < MAX_TARGET_RATIO; i++) {
701                 seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
702                         i,
703                         cal_data[i].confidence,
704                         cal_data[i].steady_comp,
705                         cal_data[i].dynamic_comp);
706         }
707
708         return 0;
709 }
710
711 DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
712
713 static inline void powerclamp_create_debug_files(void)
714 {
715         debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
716         if (!debug_dir)
717                 return;
718
719         if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
720                                         cal_data, &powerclamp_debug_fops))
721                 goto file_error;
722
723         return;
724
725 file_error:
726         debugfs_remove_recursive(debug_dir);
727 }
728
729 static enum cpuhp_state hp_state;
730
731 static int __init powerclamp_init(void)
732 {
733         int retval;
734         int bitmap_size;
735
736         bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
737         cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
738         if (!cpu_clamping_mask)
739                 return -ENOMEM;
740
741         /* probe cpu features and ids here */
742         retval = powerclamp_probe();
743         if (retval)
744                 goto exit_free;
745
746         /* set default limit, maybe adjusted during runtime based on feedback */
747         window_size = 2;
748         retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
749                                            "thermal/intel_powerclamp:online",
750                                            powerclamp_cpu_online,
751                                            powerclamp_cpu_predown);
752         if (retval < 0)
753                 goto exit_free;
754
755         hp_state = retval;
756
757         worker_data = alloc_percpu(struct powerclamp_worker_data);
758         if (!worker_data) {
759                 retval = -ENOMEM;
760                 goto exit_unregister;
761         }
762
763         cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
764                                                 &powerclamp_cooling_ops);
765         if (IS_ERR(cooling_dev)) {
766                 retval = -ENODEV;
767                 goto exit_free_thread;
768         }
769
770         if (!duration)
771                 duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
772
773         powerclamp_create_debug_files();
774
775         return 0;
776
777 exit_free_thread:
778         free_percpu(worker_data);
779 exit_unregister:
780         cpuhp_remove_state_nocalls(hp_state);
781 exit_free:
782         kfree(cpu_clamping_mask);
783         return retval;
784 }
785 module_init(powerclamp_init);
786
787 static void __exit powerclamp_exit(void)
788 {
789         end_power_clamp();
790         cpuhp_remove_state_nocalls(hp_state);
791         free_percpu(worker_data);
792         thermal_cooling_device_unregister(cooling_dev);
793         kfree(cpu_clamping_mask);
794
795         cancel_delayed_work_sync(&poll_pkg_cstate_work);
796         debugfs_remove_recursive(debug_dir);
797 }
798 module_exit(powerclamp_exit);
799
800 MODULE_LICENSE("GPL");
801 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
802 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
803 MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");