arch/x86/kernel/cpu/aperfmperf.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf_sample {
        unsigned int    khz;            /* result of the last snapshot */
        atomic_t        scfpending;     /* set while a snapshot IPI is pending */
        ktime_t time;                   /* time of the last snapshot */
        u64     aperf;                  /* APERF count at the last snapshot */
        u64     mperf;                  /* MPERF count at the last snapshot */
};

static DEFINE_PER_CPU(struct aperfmperf_sample, samples);

#define APERFMPERF_CACHE_THRESHOLD_MS   10
#define APERFMPERF_REFRESH_DELAY_MS     10
#define APERFMPERF_STALE_THRESHOLD_MS   1000

/*
 * aperfmperf_snapshot_khz()
 * On the current CPU, snapshot APERF, MPERF and the current time,
 * compute kHz from the deltas and save the snapshot.
 * The "skip if we already did this within 10ms" check is done by
 * the caller, aperfmperf_snapshot_cpu().
 */
static void aperfmperf_snapshot_khz(void *dummy)
{
        u64 aperf, aperf_delta;
        u64 mperf, mperf_delta;
        struct aperfmperf_sample *s = this_cpu_ptr(&samples);
        unsigned long flags;

        local_irq_save(flags);
        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);
        local_irq_restore(flags);

        aperf_delta = aperf - s->aperf;
        mperf_delta = mperf - s->mperf;

        /*
         * There is no architectural guarantee that MPERF
         * increments faster than we can read it.
         */
        if (mperf_delta == 0)
                return;

        s->time = ktime_get();
        s->aperf = aperf;
        s->mperf = mperf;
        s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
        atomic_set_release(&s->scfpending, 0);
}

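/*
 * Refresh the cached sample for @cpu unless it is younger than
 * APERFMPERF_CACHE_THRESHOLD_MS.  The refresh is done by an IPI to
 * @cpu; with @wait set the caller blocks until the remote snapshot
 * completes, otherwise ->scfpending prevents piling up IPIs.
 *
 * Returns false if the previous sample is older than
 * APERFMPERF_STALE_THRESHOLD_MS, i.e. the cached khz value is stale.
 */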
static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
{
        s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
        struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);

        /* Don't bother re-computing within the cache threshold time. */
        if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
                return true;

        if (!atomic_xchg(&s->scfpending, 1) || wait)
                smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);

        /* Return false if the previous iteration was too long ago. */
        return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
}

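/*
 * Return the recent effective frequency of @cpu in kHz based on the
 * APERF/MPERF deltas, or 0 if APERF/MPERF is unavailable or @cpu is a
 * non-housekeeping or idle CPU.  Always waits for a fresh snapshot.
 */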
unsigned int aperfmperf_get_khz(int cpu)
{
        if (!cpu_khz)
                return 0;

        if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
                return 0;

        if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
                return 0;

        if (rcu_is_idle_cpu(cpu))
                return 0; /* Idle CPUs are completely uninteresting. */

        aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
        return per_cpu(samples.khz, cpu);
}

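/*
 * Kick off asynchronous snapshots on all online housekeeping CPUs so
 * that a subsequent sweep over the per-CPU khz values finds them fresh.
 * If any CPU's sample had gone stale, sleep briefly to give the IPIs
 * time to complete.
 */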
void arch_freq_prepare_all(void)
{
        ktime_t now = ktime_get();
        bool wait = false;
        int cpu;

        if (!cpu_khz)
                return;

        if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
                return;

        for_each_online_cpu(cpu) {
                if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
                        continue;
                if (rcu_is_idle_cpu(cpu))
                        continue; /* Idle CPUs are completely uninteresting. */
                if (!aperfmperf_snapshot_cpu(cpu, now, false))
                        wait = true;
        }

        if (wait)
                msleep(APERFMPERF_REFRESH_DELAY_MS);
}

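/*
 * Backs /sys/.../cpufreq/scaling_cur_freq (see the header comment).
 * Returns a fresh khz estimate for @cpu, or 0 if APERF/MPERF cannot be
 * used for it.  If the cached sample has gone stale, a new snapshot is
 * forced synchronously after a short delay.
 */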
unsigned int arch_freq_get_on_cpu(int cpu)
{
        struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);

        if (!cpu_khz)
                return 0;

        if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
                return 0;

        if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
                return 0;

        if (rcu_is_idle_cpu(cpu))
                return 0;

        if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
                return per_cpu(samples.khz, cpu);

        msleep(APERFMPERF_REFRESH_DELAY_MS);
        atomic_set(&s->scfpending, 1);
        smp_mb(); /* ->scfpending before smp_call_function_single(). */
        smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);

        return per_cpu(samples.khz, cpu);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller and
 * our P-state setting is little more than a request/hint, we need to observe
 * the effective frequency 'BusyMHz', i.e. the average frequency over a time
 * interval after discarding idle time. This is given by:
 *
 *   BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */

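/*
 * Illustrative example (numbers made up): with freq_base = 2.0 GHz and
 * a 4C turbo of 3.0 GHz, arch_turbo_freq_ratio = 3000/2000 * 1024 = 1536.
 * A tick observing delta_APERF / delta_MPERF = 1.25 (BusyMHz = 2.5 GHz)
 * then yields arch_freq_scale = 1.25 * 2^20 / 1536 ~= 853, i.e. roughly
 * 2.5/3.0 of SCHED_CAPACITY_SCALE.
 */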
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static DEFINE_PER_CPU(u64, arch_prev_aperf);
static DEFINE_PER_CPU(u64, arch_prev_mperf);
static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
        arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
                                        arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

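/* Report whether turbo has been disabled via MSR_IA32_MISC_ENABLE. */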
static bool turbo_disabled(void)
{
        u64 misc_en;
        int err;

        err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
        if (err)
                return false;

        return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

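/*
 * Atom Silvermont/Airmont: base ratio from MSR_ATOM_CORE_RATIOS, 1C
 * turbo ratio from MSR_ATOM_CORE_TURBO_RATIOS.  On CPUs without these
 * MSRs the rdmsrl_safe() calls fail and the caller falls through to
 * the next method.
 */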
static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        int err;

        err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 16) & 0x3F;     /* max P state */
        *turbo_freq = *turbo_freq & 0x3F;           /* 1C turbo    */

        return true;
}

#define X86_MATCH(model)                                        \
        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,            \
                INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
        X86_MATCH(XEON_PHI_KNL),
        X86_MATCH(XEON_PHI_KNM),
        {}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
        X86_MATCH(SKYLAKE_X),
        {}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
        X86_MATCH(ATOM_GOLDMONT),
        X86_MATCH(ATOM_GOLDMONT_D),
        X86_MATCH(ATOM_GOLDMONT_PLUS),
        {}
};

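/*
 * Knights Landing/Mill: MSR_TURBO_RATIO_LIMIT encodes a group 0 turbo
 * ratio plus per-group deltas.  Walk the groups and apply
 * @num_delta_fratio non-zero deltas to derive the turbo ratio.
 */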
static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
                                int num_delta_fratio)
{
        int fratio, delta_fratio, found;
        int err, i;
        u64 msr;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        fratio = (msr >> 8) & 0xFF;
        i = 16;
        found = 0;
        do {
                if (found >= num_delta_fratio) {
                        *turbo_freq = fratio;
                        return true;
                }

                delta_fratio = (msr >> (i + 5)) & 0x7;

                if (delta_fratio) {
                        found += 1;
                        fratio -= delta_fratio;
                }

                i += 8;
        } while (i < 64);

        return true;
}

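/*
 * Skylake-X (and Goldmont, called with @size == 1): MSR_TURBO_RATIO_LIMIT
 * holds per-group turbo ratios and MSR_TURBO_RATIO_LIMIT1 the matching
 * active-core counts.  Pick the ratio of the first group covering at
 * least @size active cores.
 */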
static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
        u64 ratios, counts;
        u32 group_size;
        int err, i;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
        if (err)
                return false;

        for (i = 0; i < 64; i += 8) {
                group_size = (counts >> i) & 0xFF;
                if (group_size >= size) {
                        *turbo_freq = (ratios >> i) & 0xFF;
                        return true;
                }
        }

        return false;
}

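/*
 * Generic Core path: base ratio from MSR_PLATFORM_INFO, 4C turbo ratio
 * from MSR_TURBO_RATIO_LIMIT, falling back to the 1C ratio on parts
 * with fewer than four cores.
 */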
static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        u64 msr;
        int err;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;    /* max P state */
        *turbo_freq = (msr >> 24) & 0xFF;         /* 4C turbo    */

        /* The CPU may have less than 4 cores */
        if (!*turbo_freq)
                *turbo_freq = msr & 0xFF;         /* 1C turbo    */

        return true;
}

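/*
 * Try the model specific decoders above in order and convert the
 * result into arch_turbo_freq_ratio, scaled by SCHED_CAPACITY_SCALE.
 */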
static bool intel_set_max_freq_ratio(void)
{
        u64 base_freq, turbo_freq;
        u64 turbo_ratio;

        if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
                goto out;

        if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
                goto out;

        if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
            knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
                goto out;

        if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
                goto out;

        if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
                goto out;

        return false;

out:
        /*
         * Some hypervisors advertise X86_FEATURE_APERFMPERF
         * but then fill all MSRs with zeroes.
         * Some CPUs have turbo boost but don't declare any turbo ratio
         * in MSR_TURBO_RATIO_LIMIT.
         */
        if (!base_freq || !turbo_freq) {
                pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
                return false;
        }

        turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
        if (!turbo_ratio) {
                pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
                return false;
        }

        arch_turbo_freq_ratio = turbo_ratio;
        arch_set_max_freq_ratio(turbo_disabled());

        return true;
}

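/*
 * Seed the per-CPU APERF/MPERF reference values used by
 * arch_scale_freq_tick().  Also run from the syscore resume hook so
 * that the first tick after resume does not use stale pre-suspend
 * values.
 */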
static void init_counter_refs(void)
{
        u64 aperf, mperf;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);

        this_cpu_write(arch_prev_aperf, aperf);
        this_cpu_write(arch_prev_mperf, mperf);
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
        .resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
        /* Bail out if registered already. */
        if (freq_invariance_syscore_ops.node.prev)
                return;

        register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

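/*
 * Set up frequency invariance.  On the boot CPU (!@secondary) compute
 * the max frequency ratio and enable the static key; secondary CPUs
 * only need their counter references initialized.  On AMD the ratio
 * comes from CPPC, hence the @cppc_ready gate.
 */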
void init_freq_invariance(bool secondary, bool cppc_ready)
{
        bool ret = false;

        if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
                return;

        if (secondary) {
                if (static_branch_likely(&arch_scale_freq_key)) {
                        init_counter_refs();
                }
                return;
        }

        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                ret = intel_set_max_freq_ratio();
        else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
                if (!cppc_ready) {
                        return;
                }
                ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio);
        }

        if (ret) {
                init_counter_refs();
                static_branch_enable(&arch_scale_freq_key);
                register_freq_invariance_syscore_ops();
                pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
        } else {
                pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
        }
}

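/*
 * If the tick time computation goes wrong, frequency invariance is
 * torn down from a workqueue: static_branch_disable() may sleep and
 * therefore cannot be called from the tick itself.
 */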
static void disable_freq_invariance_workfn(struct work_struct *work)
{
        static_branch_disable(&arch_scale_freq_key);
}

static DECLARE_WORK(disable_freq_invariance_work,
                    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;

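/*
 * Called from the scheduler tick.  Computes
 *
 *   arch_freq_scale = freq_curr / freq_max * SCHED_CAPACITY_SCALE
 *                   = (delta_APERF << 2 * SCHED_CAPACITY_SHIFT) /
 *                     (delta_MPERF * arch_max_freq_ratio)
 *
 * clamped to SCHED_CAPACITY_SCALE.  Any overflow or zero result
 * disables frequency invariance via disable_freq_invariance_work.
 */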
void arch_scale_freq_tick(void)
{
        u64 freq_scale;
        u64 aperf, mperf;
        u64 acnt, mcnt;

        if (!arch_scale_freq_invariant())
                return;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);

        acnt = aperf - this_cpu_read(arch_prev_aperf);
        mcnt = mperf - this_cpu_read(arch_prev_mperf);

        this_cpu_write(arch_prev_aperf, aperf);
        this_cpu_write(arch_prev_mperf, mperf);

        if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
                goto error;

        if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
                goto error;

        freq_scale = div64_u64(acnt, mcnt);
        if (!freq_scale)
                goto error;

        if (freq_scale > SCHED_CAPACITY_SCALE)
                freq_scale = SCHED_CAPACITY_SCALE;

        this_cpu_write(arch_freq_scale, freq_scale);
        return;

error:
        pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
        schedule_work(&disable_freq_invariance_work);
}
#endif /* CONFIG_X86_64 && CONFIG_SMP */