// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
struct aperfmperf_sample {
        unsigned int    khz;
        atomic_t        scfpending;
        ktime_t         time;
        u64             aperf;
        u64             mperf;
};

static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
#define APERFMPERF_CACHE_THRESHOLD_MS   10
#define APERFMPERF_REFRESH_DELAY_MS     10
#define APERFMPERF_STALE_THRESHOLD_MS   1000
/*
 * aperfmperf_snapshot_khz()
 * On the current CPU, snapshot APERF, MPERF and the snapshot time,
 * unless we already did it within APERFMPERF_CACHE_THRESHOLD_MS,
 * then calculate kHz and save the snapshot.
 */
static void aperfmperf_snapshot_khz(void *dummy)
{
        u64 aperf, aperf_delta;
        u64 mperf, mperf_delta;
        struct aperfmperf_sample *s = this_cpu_ptr(&samples);
        unsigned long flags;

        local_irq_save(flags);
        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);
        local_irq_restore(flags);

        aperf_delta = aperf - s->aperf;
        mperf_delta = mperf - s->mperf;

        /*
         * There is no architectural guarantee that MPERF
         * increments faster than we can read it; a zero delta
         * would divide by zero below, so bail out.
         */
        if (mperf_delta == 0)
                return;

        s->time = ktime_get();
        s->aperf = aperf;
        s->mperf = mperf;
        s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
        atomic_set_release(&s->scfpending, 0);
}
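/*
 * A worked example of the calculation above, with illustrative numbers only
 * (not taken from any particular part): if cpu_khz is 2000000 (a 2.0 GHz
 * base clock) and APERF advanced 1.5 times as much as MPERF since the last
 * snapshot, the cached value becomes 2000000 * 1.5 = 3000000 kHz, i.e. the
 * CPU averaged 3.0 GHz while it was not idle.
 */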
static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
{
        s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
        struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);

        /* Don't bother re-computing within the cache threshold time. */
        if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
                return true;

        if (!atomic_xchg(&s->scfpending, 1) || wait)
                smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);

        /* Return false if the previous iteration was too long ago. */
        return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
}
unsigned int aperfmperf_get_khz(int cpu)
{
        if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
                return 0;

        if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
                return 0;

        if (rcu_is_idle_cpu(cpu))
                return 0; /* Idle CPUs are completely uninteresting. */

        aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
        return per_cpu(samples.khz, cpu);
}
void arch_freq_prepare_all(void)
{
        ktime_t now = ktime_get();
        bool wait = false;
        int cpu;

        if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
                return;

        for_each_online_cpu(cpu) {
                if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
                        continue;
                if (rcu_is_idle_cpu(cpu))
                        continue; /* Idle CPUs are completely uninteresting. */
                if (!aperfmperf_snapshot_cpu(cpu, now, false))
                        wait = true;
        }

        if (wait)
                msleep(APERFMPERF_REFRESH_DELAY_MS);
}
unsigned int arch_freq_get_on_cpu(int cpu)
{
        struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);

        if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
                return 0;

        if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
                return 0;

        if (rcu_is_idle_cpu(cpu))
                return 0;

        if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
                return per_cpu(samples.khz, cpu);

        msleep(APERFMPERF_REFRESH_DELAY_MS);
        atomic_set(&s->scfpending, 1);
        smp_mb(); /* ->scfpending before smp_call_function_single(). */
        smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);

        return per_cpu(samples.khz, cpu);
}
#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by micro-controller and
 * our P-state setting is little more than a request/hint, we need to observe
 * the effective frequency 'BusyMHz', i.e. the average frequency over a time
 * interval after discarding idle time. This is given by:
 *
 *            BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */
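/*
 * A worked example of the ratio above, with illustrative numbers only (not
 * taken from any particular part): assume freq_base = 2.0 GHz and a 4C turbo
 * level of 3.0 GHz. The turbo ratio becomes 3000 * SCHED_CAPACITY_SCALE /
 * 2000 = 1536, and a tick interval that ran at an effective 2.5 GHz
 * (delta_APERF / delta_MPERF = 1.25) maps to freq_curr / freq_max =
 * 2500 / 3000, i.e. roughly 0.83 of freq_max.
 */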
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static DEFINE_PER_CPU(u64, arch_prev_aperf);
static DEFINE_PER_CPU(u64, arch_prev_mperf);
static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
void arch_set_max_freq_ratio(bool turbo_disabled)
{
        arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
                                               arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
static bool turbo_disabled(void)
{
        u64 misc_en;
        int err;

        err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
        if (err)
                return false;

        return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}
static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        int err;

        err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 16) & 0x3F;  /* max P state */
        *turbo_freq = *turbo_freq & 0x3F;        /* 1C turbo    */

        return true;
}
#define X86_MATCH(model)                                        \
        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,            \
                INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
        X86_MATCH(XEON_PHI_KNL),
        X86_MATCH(XEON_PHI_KNM),
        {}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
        X86_MATCH(SKYLAKE_X),
        {}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
        X86_MATCH(ATOM_GOLDMONT),
        X86_MATCH(ATOM_GOLDMONT_D),
        X86_MATCH(ATOM_GOLDMONT_PLUS),
        {}
};
static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
                                   int num_delta_fratio)
{
        int fratio, delta_fratio, found;
        int err, i;
        u64 msr;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;   /* max P state */

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        fratio = (msr >> 8) & 0xFF;
        i = 16;
        found = 0;
        do {
                if (found >= num_delta_fratio) {
                        *turbo_freq = fratio;
                        return true;
                }

                delta_fratio = (msr >> (i + 5)) & 0x7;
                if (delta_fratio) {
                        found += 1;
                        fratio -= delta_fratio;
                }

                i += 8;
        } while (i < 64);

        return true;
}
static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
        u64 ratios, counts;
        u32 group_size;
        int err, i;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;   /* max P state */

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
        if (err)
                return false;

        for (i = 0; i < 64; i += 8) {
                group_size = (counts >> i) & 0xFF;
                if (group_size >= size) {
                        *turbo_freq = (ratios >> i) & 0xFF;
                        return true;
                }
        }

        return false;
}
static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        u64 msr;
        int err;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;   /* max P state */
        *turbo_freq = (msr >> 24) & 0xFF;        /* 4C turbo    */

        /* The CPU may have fewer than 4 cores */
        if (!*turbo_freq)
                *turbo_freq = msr & 0xFF;        /* 1C turbo    */

        return true;
}
static bool intel_set_max_freq_ratio(void)
{
        u64 base_freq, turbo_freq;
        u64 turbo_ratio;

        if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
                goto out;

        if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
                goto out;

        if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
            knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
                goto out;

        if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
                goto out;

        if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
                goto out;

        return false;

out:
        /*
         * Some hypervisors advertise X86_FEATURE_APERFMPERF
         * but then fill all MSRs with zeroes.
         * Some CPUs have turbo boost but don't declare any turbo ratio
         * in MSR_TURBO_RATIO_LIMIT.
         */
        if (!base_freq || !turbo_freq) {
                pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
                return false;
        }

        turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
        if (!turbo_ratio) {
                pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
                return false;
        }

        arch_turbo_freq_ratio = turbo_ratio;
        arch_set_max_freq_ratio(turbo_disabled());

        return true;
}
static void init_counter_refs(void)
{
        u64 aperf, mperf;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);

        this_cpu_write(arch_prev_aperf, aperf);
        this_cpu_write(arch_prev_mperf, mperf);
}
#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
        .resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
        /* Bail out if registered already. */
        if (freq_invariance_syscore_ops.node.prev)
                return;

        register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif
void init_freq_invariance(bool secondary, bool cppc_ready)
{
        bool ret = false;

        if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
                return;

        if (secondary) {
                if (static_branch_likely(&arch_scale_freq_key))
                        init_counter_refs();
                return;
        }

        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                ret = intel_set_max_freq_ratio();
        else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
                if (!cppc_ready)
                        return;

                ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio);
        }

        if (ret) {
                init_counter_refs();
                static_branch_enable(&arch_scale_freq_key);
                register_freq_invariance_syscore_ops();
                pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
        } else {
                pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
        }
}
static void disable_freq_invariance_workfn(struct work_struct *work)
{
        static_branch_disable(&arch_scale_freq_key);
}

static DECLARE_WORK(disable_freq_invariance_work,
                    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
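/*
 * A sketch of the fixed-point math in arch_scale_freq_tick() below, with
 * illustrative numbers only: acnt is pre-scaled by 2^(2 * SCHED_CAPACITY_SHIFT)
 * and mcnt by arch_max_freq_ratio (itself in units of SCHED_CAPACITY_SCALE),
 * so the division effectively computes
 *
 *   freq_scale = delta_APERF / delta_MPERF * SCHED_CAPACITY_SCALE^2
 *                / arch_max_freq_ratio
 *
 * Using the numbers from the worked example near the top of this block
 * (delta_APERF / delta_MPERF = 1.25, arch_max_freq_ratio = 1536), this gives
 * freq_scale = 1.25 * 1048576 / 1536 = 853, i.e. about 83% of freq_max in
 * units of SCHED_CAPACITY_SCALE.
 */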
void arch_scale_freq_tick(void)
{
        u64 freq_scale;
        u64 aperf, mperf;
        u64 acnt, mcnt;

        if (!arch_scale_freq_invariant())
                return;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);

        acnt = aperf - this_cpu_read(arch_prev_aperf);
        mcnt = mperf - this_cpu_read(arch_prev_mperf);

        this_cpu_write(arch_prev_aperf, aperf);
        this_cpu_write(arch_prev_mperf, mperf);

        if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
                goto error;

        if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
                goto error;

        freq_scale = div64_u64(acnt, mcnt);

        /*
         * Clamp to unity: freq_curr can exceed freq_max when turbo levels
         * above the 4C ratio are active.
         */
        if (freq_scale > SCHED_CAPACITY_SCALE)
                freq_scale = SCHED_CAPACITY_SCALE;

        this_cpu_write(arch_freq_scale, freq_scale);
        return;

error:
        pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
        schedule_work(&disable_freq_invariance_work);
}
#endif /* CONFIG_X86_64 && CONFIG_SMP */