arch/x86/events/rapl.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Support Intel/AMD RAPL energy consumption counters
   4  * Copyright (C) 2013 Google, Inc., Stephane Eranian
   5  *
   6  * Intel RAPL interface is specified in the IA-32 Manual Vol3b
   7  * section 14.7.1 (September 2013)
   8  *
   9  * AMD RAPL interface for Fam17h is described in the public PPR:
  10  * https://bugzilla.kernel.org/show_bug.cgi?id=206537
  11  *
  12  * RAPL provides more controls than just reporting energy consumption;
  13  * however, here we only expose the 5 energy consumption free running
  14  * counters (pp0, pkg, dram, gpu, psys).
  15  *
  16  * Each of those counters increments in a power unit defined by the
  17  * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
  18  * but it can vary.
  19  *
  20  * Counter to rapl events mappings:
  21  *
  22  *  pp0 counter: consumption of all physical cores (power plane 0)
  23  *        event: rapl_energy_cores
  24  *    perf code: 0x1
  25  *
  26  *  pkg counter: consumption of the whole processor package
  27  *        event: rapl_energy_pkg
  28  *    perf code: 0x2
  29  *
  30  * dram counter: consumption of the dram domain (servers only)
  31  *        event: rapl_energy_dram
  32  *    perf code: 0x3
  33  *
  34  * gpu counter: consumption of the builtin-gpu domain (client only)
  35  *        event: rapl_energy_gpu
  36  *    perf code: 0x4
  37  *
  38  *  psys counter: consumption of the builtin-psys domain (client only)
  39  *        event: rapl_energy_psys
  40  *    perf code: 0x5
  41  *
  42  * We manage those counters as free running (read-only). They may be
  43  * used simultaneously by other tools, such as turbostat.
  44  *
  45  * The events only support system-wide mode counting. There is no
  46  * sampling support because it does not make sense and is not
  47  * supported by the RAPL hardware.
  48  *
  49  * Because we want to avoid floating-point operations in the kernel,
  50  * the events are all reported in fixed point arithmetic (32.32).
  51  * Tools must adjust the counts to convert them to Watts using
  52  * the duration of the measurement. Tools may use a function such as
  53  * ldexp(raw_count, -32);
  54  */
  55
  56 #define pr_fmt(fmt) "RAPL PMU: " fmt
  57
  58 #include <linux/module.h>
  59 #include <linux/slab.h>
  60 #include <linux/perf_event.h>
  61 #include <linux/nospec.h>
  62 #include <asm/cpu_device_id.h>
  63 #include <asm/intel-family.h>
  64 #include "perf_event.h"
  65 #include "probe.h"
  66
  67 MODULE_LICENSE("GPL");
  68
  69 /*
  70  * RAPL energy status counters
  71  */
/*
 * Zero-based index of each RAPL energy domain. The index is used for the
 * per-domain tables in this file and as the bit position in the counter
 * mask; the user-visible event code in attr->config is this index plus one.
 */
enum perf_rapl_events {
	PERF_RAPL_PP0 = 0,		/* all cores */
	PERF_RAPL_PKG,			/* entire package */
	PERF_RAPL_RAM,			/* DRAM */
	PERF_RAPL_PP1,			/* gpu */
	PERF_RAPL_PSYS,			/* psys */

	/* Number of domains; sizes the per-domain tables. */
	PERF_RAPL_MAX,
	NR_RAPL_DOMAINS = PERF_RAPL_MAX,
};
  82
/*
 * Human-readable domain names, indexed by enum perf_rapl_events.
 * Only used by rapl_advertise() at init time (hence __initconst).
 */
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
	"psys",
};
  90
/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
/* Shift used by rapl_event_update() to isolate the low 32 counter bits. */
#define RAPL_CNTR_WIDTH	32

/*
 * Define a read-only sysfs event attribute named @_name whose content is
 * the fixed string @str. @v is only used to build the C identifier
 * event_attr_<v>.
 */
#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,							\
	.event_str	= str,							\
};
 104
/*
 * Per-die RAPL state. One instance is allocated per logical die by
 * rapl_cpu_online() and stored in rapl_pmus->pmus[].
 */
struct rapl_pmu {
	raw_spinlock_t		lock;		/* held around n_active/active_list updates */
	int			n_active;	/* number of started events on this die */
	int			cpu;		/* CPU events of this die are bound to, -1 if none */
	struct list_head	active_list;	/* started events, linked via active_entry */
	struct pmu		*pmu;		/* points at the shared rapl_pmus->pmu */
	ktime_t			timer_interval;	/* period of the update timer */
	struct hrtimer		hrtimer;	/* periodically folds counters of active events */
};
 114
/*
 * Top-level container: the single struct pmu registered with perf plus an
 * array of per-die rapl_pmu pointers indexed by logical die id.
 */
struct rapl_pmus {
	struct pmu		pmu;
	unsigned int		maxdie;		/* number of slots in pmus[] */
	struct rapl_pmu		*pmus[] __counted_by(maxdie);
};
 120
/*
 * Per-model overrides of the energy unit read from the power unit MSR;
 * applied in rapl_check_hw_unit().
 */
enum rapl_unit_quirk {
	RAPL_UNIT_QUIRK_NONE,		/* use the MSR-reported unit for all domains */
	RAPL_UNIT_QUIRK_INTEL_HSW,	/* fixed unit for the DRAM domain */
	RAPL_UNIT_QUIRK_INTEL_SPR,	/* fixed unit for the Psys domain */
};
 126
/*
 * Static description of one CPU model family; selected through the
 * driver_data of rapl_model_match[].
 */
struct rapl_model {
	struct perf_msr	*rapl_msrs;		/* MSR table indexed by enum perf_rapl_events */
	unsigned long	events;			/* bitmask of domains the model supports */
	unsigned int	msr_power_unit;		/* MSR holding the energy unit */
	enum rapl_unit_quirk	unit_quirk;	/* per-domain unit override, if any */
};
 133
/* Per-domain hardware energy unit: one count is 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
/* Registered PMU plus the per-die state array; allocated in init_rapl_pmus() */
static struct rapl_pmus *rapl_pmus;
/* One designated event-collecting CPU per die; exported via the cpumask attribute */
static cpumask_t rapl_cpu_mask;
/* Bit i set: domain i (enum perf_rapl_events) passed perf_msr_probe() */
static unsigned int rapl_cntr_mask;
/* Period of the update timer in milliseconds, see rapl_check_hw_unit() */
static u64 rapl_timer_ms;
/* MSR table of the detected model */
static struct perf_msr *rapl_msrs;
 141
 142 static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
 143 {
 144         unsigned int dieid = topology_logical_die_id(cpu);
 145
 146         /*
 147          * The unsigned check also catches the '-1' return value for non
 148          * existent mappings in the topology map.
 149          */
 150         return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
 151 }
 152
 153 static inline u64 rapl_read_counter(struct perf_event *event)
 154 {
 155         u64 raw;
 156         rdmsrl(event->hw.event_base, raw);
 157         return raw;
 158 }
 159
 160 static inline u64 rapl_scale(u64 v, int cfg)
 161 {
 162         if (cfg > NR_RAPL_DOMAINS) {
 163                 pr_warn("Invalid domain %d, failed to scale data\n", cfg);
 164                 return v;
 165         }
 166         /*
 167          * scale delta to smallest unit (1/2^32)
 168          * users must then scale back: count * 1/(1e9*2^32) to get Joules
 169          * or use ldexp(count, -32).
 170          * Watts = Joules/Time delta
 171          */
 172         return v << (32 - rapl_hw_unit[cfg - 1]);
 173 }
 174
 175 static u64 rapl_event_update(struct perf_event *event)
 176 {
 177         struct hw_perf_event *hwc = &event->hw;
 178         u64 prev_raw_count, new_raw_count;
 179         s64 delta, sdelta;
 180         int shift = RAPL_CNTR_WIDTH;
 181
 182         prev_raw_count = local64_read(&hwc->prev_count);
 183         do {
 184                 rdmsrl(event->hw.event_base, new_raw_count);
 185         } while (!local64_try_cmpxchg(&hwc->prev_count,
 186                                       &prev_raw_count, new_raw_count));
 187
 188         /*
 189          * Now we have the new raw value and have updated the prev
 190          * timestamp already. We can now calculate the elapsed delta
 191          * (event-)time and add that to the generic event.
 192          *
 193          * Careful, not all hw sign-extends above the physical width
 194          * of the count.
 195          */
 196         delta = (new_raw_count << shift) - (prev_raw_count << shift);
 197         delta >>= shift;
 198
 199         sdelta = rapl_scale(delta, event->hw.config);
 200
 201         local64_add(sdelta, &event->count);
 202
 203         return new_raw_count;
 204 }
 205
 206 static void rapl_start_hrtimer(struct rapl_pmu *pmu)
 207 {
 208        hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
 209                      HRTIMER_MODE_REL_PINNED);
 210 }
 211
 212 static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
 213 {
 214         struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
 215         struct perf_event *event;
 216         unsigned long flags;
 217
 218         if (!pmu->n_active)
 219                 return HRTIMER_NORESTART;
 220
 221         raw_spin_lock_irqsave(&pmu->lock, flags);
 222
 223         list_for_each_entry(event, &pmu->active_list, active_entry)
 224                 rapl_event_update(event);
 225
 226         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 227
 228         hrtimer_forward_now(hrtimer, pmu->timer_interval);
 229
 230         return HRTIMER_RESTART;
 231 }
 232
 233 static void rapl_hrtimer_init(struct rapl_pmu *pmu)
 234 {
 235         struct hrtimer *hr = &pmu->hrtimer;
 236
 237         hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 238         hr->function = rapl_hrtimer_handle;
 239 }
 240
 241 static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
 242                                    struct perf_event *event)
 243 {
 244         if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
 245                 return;
 246
 247         event->hw.state = 0;
 248
 249         list_add_tail(&event->active_entry, &pmu->active_list);
 250
 251         local64_set(&event->hw.prev_count, rapl_read_counter(event));
 252
 253         pmu->n_active++;
 254         if (pmu->n_active == 1)
 255                 rapl_start_hrtimer(pmu);
 256 }
 257
 258 static void rapl_pmu_event_start(struct perf_event *event, int mode)
 259 {
 260         struct rapl_pmu *pmu = event->pmu_private;
 261         unsigned long flags;
 262
 263         raw_spin_lock_irqsave(&pmu->lock, flags);
 264         __rapl_pmu_event_start(pmu, event);
 265         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 266 }
 267
 268 static void rapl_pmu_event_stop(struct perf_event *event, int mode)
 269 {
 270         struct rapl_pmu *pmu = event->pmu_private;
 271         struct hw_perf_event *hwc = &event->hw;
 272         unsigned long flags;
 273
 274         raw_spin_lock_irqsave(&pmu->lock, flags);
 275
 276         /* mark event as deactivated and stopped */
 277         if (!(hwc->state & PERF_HES_STOPPED)) {
 278                 WARN_ON_ONCE(pmu->n_active <= 0);
 279                 pmu->n_active--;
 280                 if (pmu->n_active == 0)
 281                         hrtimer_cancel(&pmu->hrtimer);
 282
 283                 list_del(&event->active_entry);
 284
 285                 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
 286                 hwc->state |= PERF_HES_STOPPED;
 287         }
 288
 289         /* check if update of sw counter is necessary */
 290         if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
 291                 /*
 292                  * Drain the remaining delta count out of a event
 293                  * that we are disabling:
 294                  */
 295                 rapl_event_update(event);
 296                 hwc->state |= PERF_HES_UPTODATE;
 297         }
 298
 299         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 300 }
 301
 302 static int rapl_pmu_event_add(struct perf_event *event, int mode)
 303 {
 304         struct rapl_pmu *pmu = event->pmu_private;
 305         struct hw_perf_event *hwc = &event->hw;
 306         unsigned long flags;
 307
 308         raw_spin_lock_irqsave(&pmu->lock, flags);
 309
 310         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 311
 312         if (mode & PERF_EF_START)
 313                 __rapl_pmu_event_start(pmu, event);
 314
 315         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 316
 317         return 0;
 318 }
 319
/*
 * pmu::del callback: stop @event and fold the final delta into its count.
 * @flags is ignored; the stop is always done with PERF_EF_UPDATE.
 */
static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}
 324
 325 static int rapl_pmu_event_init(struct perf_event *event)
 326 {
 327         u64 cfg = event->attr.config & RAPL_EVENT_MASK;
 328         int bit, ret = 0;
 329         struct rapl_pmu *pmu;
 330
 331         /* only look at RAPL events */
 332         if (event->attr.type != rapl_pmus->pmu.type)
 333                 return -ENOENT;
 334
 335         /* check only supported bits are set */
 336         if (event->attr.config & ~RAPL_EVENT_MASK)
 337                 return -EINVAL;
 338
 339         if (event->cpu < 0)
 340                 return -EINVAL;
 341
 342         event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
 343
 344         if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
 345                 return -EINVAL;
 346
 347         cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
 348         bit = cfg - 1;
 349
 350         /* check event supported */
 351         if (!(rapl_cntr_mask & (1 << bit)))
 352                 return -EINVAL;
 353
 354         /* unsupported modes and filters */
 355         if (event->attr.sample_period) /* no sampling */
 356                 return -EINVAL;
 357
 358         /* must be done before validate_group */
 359         pmu = cpu_to_rapl_pmu(event->cpu);
 360         if (!pmu)
 361                 return -EINVAL;
 362         event->cpu = pmu->cpu;
 363         event->pmu_private = pmu;
 364         event->hw.event_base = rapl_msrs[bit].msr;
 365         event->hw.config = cfg;
 366         event->hw.idx = bit;
 367
 368         return ret;
 369 }
 370
/* pmu::read callback: bring @event->count up to date with the hardware. */
static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}
 375
 376 static ssize_t rapl_get_attr_cpumask(struct device *dev,
 377                                 struct device_attribute *attr, char *buf)
 378 {
 379         return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
 380 }
 381
/* Read-only "cpumask" attribute backed by rapl_get_attr_cpumask(). */
static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

/* Attributes placed directly in the PMU's sysfs directory (unnamed group). */
static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};
 392
/* Event selectors: the event code is the domain index plus one. */
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");

/* Unit string reported for each event. */
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");

/*
 * we compute in 0.23 nJ increments regardless of MSR
 * (the scale string below is 2^-32, matching rapl_scale())
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
 413
/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};

/* The event code occupies config bits 0-7, matching RAPL_EVENT_MASK. */
PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

/* Static sysfs groups; installed as pmu.attr_groups in init_rapl_pmus(). */
static const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};
 445
/*
 * One "events" attribute group per domain, each carrying the event
 * selector, its unit and its scale. The groups are referenced from the
 * MSR tables and from rapl_attr_update[].
 */

/* pp0 / cores domain */
static struct attribute *rapl_events_cores[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_cores_scale),
	NULL,
};

static struct attribute_group rapl_events_cores_group = {
	.name  = "events",
	.attrs = rapl_events_cores,
};

/* package domain */
static struct attribute *rapl_events_pkg[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_events_pkg_group = {
	.name  = "events",
	.attrs = rapl_events_pkg,
};

/* DRAM domain */
static struct attribute *rapl_events_ram[] = {
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_events_ram_group = {
	.name  = "events",
	.attrs = rapl_events_ram,
};

/* pp1 / gpu domain */
static struct attribute *rapl_events_gpu[] = {
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute_group rapl_events_gpu_group = {
	.name  = "events",
	.attrs = rapl_events_gpu,
};

/* psys domain */
static struct attribute *rapl_events_psys[] = {
	EVENT_PTR(rapl_psys),
	EVENT_PTR(rapl_psys_unit),
	EVENT_PTR(rapl_psys_scale),
	NULL,
};

static struct attribute_group rapl_events_psys_group = {
	.name  = "events",
	.attrs = rapl_events_psys,
};
 505
 506 static bool test_msr(int idx, void *data)
 507 {
 508         return test_bit(idx, (unsigned long *) data);
 509 }
 510
/* Only lower 32bits of the MSR represents the energy counter */
#define RAPL_MSR_MASK 0xFFFFFFFF

/*
 * Default Intel table, indexed by enum perf_rapl_events. Each entry gives
 * the energy status MSR, the sysfs events group of the domain and the
 * probe callback; the last two fields are consumed by perf_msr_probe()
 * (NOTE(review): presumably a "skip check" flag and the value mask, see
 * struct perf_msr in probe.h).
 */
static struct perf_msr intel_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
};

/*
 * Table used by model_spr. Identical to intel_rapl_msrs except that the
 * fourth field of the PSYS entry is true.
 */
static struct perf_msr intel_rapl_spr_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, true, RAPL_MSR_MASK },
};

/*
 * Force to PERF_RAPL_MAX size due to:
 * - perf_msr_probe(PERF_RAPL_MAX)
 * - want to use same event codes across both architectures
 *
 * Only the package domain has an MSR here; the other entries are
 * placeholders with MSR 0 and no probe callback.
 */
static struct perf_msr amd_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, NULL, false, 0 },
	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   NULL, false, 0 },
	[PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,   NULL, false, 0 },
	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  NULL, false, 0 },
};
 542
 543 static int rapl_cpu_offline(unsigned int cpu)
 544 {
 545         struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
 546         int target;
 547
 548         /* Check if exiting cpu is used for collecting rapl events */
 549         if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
 550                 return 0;
 551
 552         pmu->cpu = -1;
 553         /* Find a new cpu to collect rapl events */
 554         target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
 555
 556         /* Migrate rapl events to the new target */
 557         if (target < nr_cpu_ids) {
 558                 cpumask_set_cpu(target, &rapl_cpu_mask);
 559                 pmu->cpu = target;
 560                 perf_pmu_migrate_context(pmu->pmu, cpu, target);
 561         }
 562         return 0;
 563 }
 564
 565 static int rapl_cpu_online(unsigned int cpu)
 566 {
 567         struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
 568         int target;
 569
 570         if (!pmu) {
 571                 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
 572                 if (!pmu)
 573                         return -ENOMEM;
 574
 575                 raw_spin_lock_init(&pmu->lock);
 576                 INIT_LIST_HEAD(&pmu->active_list);
 577                 pmu->pmu = &rapl_pmus->pmu;
 578                 pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
 579                 rapl_hrtimer_init(pmu);
 580
 581                 rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
 582         }
 583
 584         /*
 585          * Check if there is an online cpu in the package which collects rapl
 586          * events already.
 587          */
 588         target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
 589         if (target < nr_cpu_ids)
 590                 return 0;
 591
 592         cpumask_set_cpu(cpu, &rapl_cpu_mask);
 593         pmu->cpu = cpu;
 594         return 0;
 595 }
 596
 597 static int rapl_check_hw_unit(struct rapl_model *rm)
 598 {
 599         u64 msr_rapl_power_unit_bits;
 600         int i;
 601
 602         /* protect rdmsrl() to handle virtualization */
 603         if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
 604                 return -1;
 605         for (i = 0; i < NR_RAPL_DOMAINS; i++)
 606                 rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
 607
 608         switch (rm->unit_quirk) {
 609         /*
 610          * DRAM domain on HSW server and KNL has fixed energy unit which can be
 611          * different than the unit from power unit MSR. See
 612          * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
 613          * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
 614          */
 615         case RAPL_UNIT_QUIRK_INTEL_HSW:
 616                 rapl_hw_unit[PERF_RAPL_RAM] = 16;
 617                 break;
 618         /* SPR uses a fixed energy unit for Psys domain. */
 619         case RAPL_UNIT_QUIRK_INTEL_SPR:
 620                 rapl_hw_unit[PERF_RAPL_PSYS] = 0;
 621                 break;
 622         default:
 623                 break;
 624         }
 625
 626
 627         /*
 628          * Calculate the timer rate:
 629          * Use reference of 200W for scaling the timeout to avoid counter
 630          * overflows. 200W = 200 Joules/sec
 631          * Divide interval by 2 to avoid lockstep (2 * 100)
 632          * if hw unit is 32, then we use 2 ms 1/200/2
 633          */
 634         rapl_timer_ms = 2;
 635         if (rapl_hw_unit[0] < 32) {
 636                 rapl_timer_ms = (1000 / (2 * 100));
 637                 rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
 638         }
 639         return 0;
 640 }
 641
 642 static void __init rapl_advertise(void)
 643 {
 644         int i;
 645
 646         pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
 647                 hweight32(rapl_cntr_mask), rapl_timer_ms);
 648
 649         for (i = 0; i < NR_RAPL_DOMAINS; i++) {
 650                 if (rapl_cntr_mask & (1 << i)) {
 651                         pr_info("hw unit of domain %s 2^-%d Joules\n",
 652                                 rapl_domain_names[i], rapl_hw_unit[i]);
 653                 }
 654         }
 655 }
 656
 657 static void cleanup_rapl_pmus(void)
 658 {
 659         int i;
 660
 661         for (i = 0; i < rapl_pmus->maxdie; i++)
 662                 kfree(rapl_pmus->pmus[i]);
 663         kfree(rapl_pmus);
 664 }
 665
/*
 * Per-domain "events" groups, installed as pmu.attr_update in
 * init_rapl_pmus(); they add the detected events to the initially empty
 * "events" group.
 */
static const struct attribute_group *rapl_attr_update[] = {
	&rapl_events_cores_group,
	&rapl_events_pkg_group,
	&rapl_events_ram_group,
	&rapl_events_gpu_group,
	&rapl_events_psys_group,
	NULL,
};
 674
 675 static int __init init_rapl_pmus(void)
 676 {
 677         int maxdie = topology_max_packages() * topology_max_die_per_package();
 678         size_t size;
 679
 680         size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
 681         rapl_pmus = kzalloc(size, GFP_KERNEL);
 682         if (!rapl_pmus)
 683                 return -ENOMEM;
 684
 685         rapl_pmus->maxdie               = maxdie;
 686         rapl_pmus->pmu.attr_groups      = rapl_attr_groups;
 687         rapl_pmus->pmu.attr_update      = rapl_attr_update;
 688         rapl_pmus->pmu.task_ctx_nr      = perf_invalid_context;
 689         rapl_pmus->pmu.event_init       = rapl_pmu_event_init;
 690         rapl_pmus->pmu.add              = rapl_pmu_event_add;
 691         rapl_pmus->pmu.del              = rapl_pmu_event_del;
 692         rapl_pmus->pmu.start            = rapl_pmu_event_start;
 693         rapl_pmus->pmu.stop             = rapl_pmu_event_stop;
 694         rapl_pmus->pmu.read             = rapl_pmu_event_read;
 695         rapl_pmus->pmu.module           = THIS_MODULE;
 696         rapl_pmus->pmu.capabilities     = PERF_PMU_CAP_NO_EXCLUDE;
 697         return 0;
 698 }
 699
/*
 * Model descriptions referenced from rapl_model_match[]. Each one lists
 * the domains to probe (.events), the MSR holding the energy unit, the
 * MSR table to use and, where needed, a unit quirk.
 */

/* cores, package and gpu */
static struct rapl_model model_snb = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package and DRAM */
static struct rapl_model model_snbep = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package, DRAM and gpu */
static struct rapl_model model_hsw = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package and DRAM, with the fixed DRAM unit quirk */
static struct rapl_model model_hsx = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* package and DRAM only, with the fixed DRAM unit quirk */
static struct rapl_model model_knl = {
	.events		= BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* all five domains */
static struct rapl_model model_skl = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1) |
			  BIT(PERF_RAPL_PSYS),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package, DRAM and psys, with the fixed Psys unit quirk */
static struct rapl_model model_spr = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PSYS),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_spr_msrs,
};

/* package domain only, using the AMD power unit MSR and MSR table */
static struct rapl_model model_amd_hygon = {
	.events		= BIT(PERF_RAPL_PKG),
	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
	.rapl_msrs      = amd_rapl_msrs,
};
 767
/*
 * CPU match table: maps a CPU feature or Intel family 6 model to the
 * rapl_model used for it. rapl_pmu_init() passes this table to
 * x86_match_cpu() and reads the model from the entry's driver_data.
 */
static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_MATCH_FEATURE(X86_FEATURE_RAPL,		&model_amd_hygon),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&model_snb),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&model_snbep),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&model_snb),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&model_snbep),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&model_knl),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&model_knl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&model_spr),
	X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X,	&model_spr),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L,	&model_skl),
	{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
 814
 815 static int __init rapl_pmu_init(void)
 816 {
 817         const struct x86_cpu_id *id;
 818         struct rapl_model *rm;
 819         int ret;
 820
 821         id = x86_match_cpu(rapl_model_match);
 822         if (!id)
 823                 return -ENODEV;
 824
 825         rm = (struct rapl_model *) id->driver_data;
 826
 827         rapl_msrs = rm->rapl_msrs;
 828
 829         rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
 830                                         false, (void *) &rm->events);
 831
 832         ret = rapl_check_hw_unit(rm);
 833         if (ret)
 834                 return ret;
 835
 836         ret = init_rapl_pmus();
 837         if (ret)
 838                 return ret;
 839
 840         /*
 841          * Install callbacks. Core will call them for each online cpu.
 842          */
 843         ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
 844                                 "perf/x86/rapl:online",
 845                                 rapl_cpu_online, rapl_cpu_offline);
 846         if (ret)
 847                 goto out;
 848
 849         ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
 850         if (ret)
 851                 goto out1;
 852
 853         rapl_advertise();
 854         return 0;
 855
 856 out1:
 857         cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
 858 out:
 859         pr_warn("Initialization failed (%d), disabled\n", ret);
 860         cleanup_rapl_pmus();
 861         return ret;
 862 }
 863 module_init(rapl_pmu_init);
 864
/*
 * Module exit: remove the hotplug state without invoking the offline
 * callbacks, unregister the PMU, then free the per-die state and the
 * container.
 */
static void __exit intel_rapl_exit(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
	perf_pmu_unregister(&rapl_pmus->pmu);
	cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);