Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 3f868fed91145308d8b36d47d58069407908582e..02f9e4f245bd0f63f91944453c0639c72c94d250 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -16,6 +16,7 @@
 #include <linux/bsearch.h>
 #include <linux/sort.h>
 #include <asm/perf_event.h>
+#include <asm/cpu_device_id.h>
 #include "x86.h"
 #include "cpuid.h"
 #include "lapic.h"
 /* This is enough to filter the vast majority of currently defined events. */
 #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
 
+struct x86_pmu_capability __read_mostly kvm_pmu_cap;
+EXPORT_SYMBOL_GPL(kvm_pmu_cap);
+
+static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
+       X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
+       {}
+};
+
 /* NOTE:
  * - Each perf counter is defined as "struct kvm_pmc";
  * - There are two types of perf counters: general purpose (gp) and fixed.
@@ -34,7 +44,9 @@
  *   However AMD doesn't support fixed-counters;
  * - There are three types of index to access perf counters (PMC):
  *     1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
- *        has MSR_K7_PERFCTRn.
+ *        has MSR_K7_PERFCTRn and, for families 15H and later,
+ *        MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
+ *        aliased to MSR_K7_PERFCTRn.
  *     2. MSR Index (named idx): This normally is used by RDPMC instruction.
  *        For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
 *        C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
@@ -46,7 +58,8 @@
  *        between pmc and perf counters is as the following:
  *        * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
  *                 [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
- *        * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
+ *        * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
+ *          and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
  */
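
As an aside on the third indexing scheme described above, here is a minimal sketch of how an RDPMC-style index could be decoded into a gp or fixed pmc. The helper name is hypothetical and the logic only loosely mirrors what the vendor rdpmc callbacks do; it assumes the Intel convention that bit 30 of the index selects the fixed-counter range and uses the counter arrays from struct kvm_pmu.

static struct kvm_pmc *example_rdpmc_idx_to_pmc(struct kvm_pmu *pmu,
						unsigned int idx)
{
	/* Intel encoding: bit 30 of the RDPMC index selects fixed counters. */
	bool fixed = idx & (1u << 30);
	unsigned int num = idx & ~(1u << 30);

	if (fixed)
		return num < pmu->nr_arch_fixed_counters ?
		       &pmu->fixed_counters[num] : NULL;

	return num < pmu->nr_arch_gp_counters ?
	       &pmu->gp_counters[num] : NULL;
}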
 
 static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
@@ -86,15 +99,22 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 {
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+       bool skip_pmi = false;
 
        /* Ignore counters that have been reprogrammed already. */
        if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
                return;
 
-       __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+       if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
+               /* Indicate PEBS overflow PMI to guest. */
+               skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
+                                             (unsigned long *)&pmu->global_status);
+       } else {
+               __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+       }
        kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
 
-       if (!pmc->intr)
+       if (!pmc->intr || skip_pmi)
                return;
 
        /*
@@ -124,6 +144,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
                                  u64 config, bool exclude_user,
                                  bool exclude_kernel, bool intr)
 {
+       struct kvm_pmu *pmu = pmc_to_pmu(pmc);
        struct perf_event *event;
        struct perf_event_attr attr = {
                .type = type,
@@ -135,9 +156,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
                .exclude_kernel = exclude_kernel,
                .config = config,
        };
-
-       if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
-               return;
+       bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
 
        attr.sample_period = get_sample_period(pmc, pmc->counter);
 
@@ -150,6 +169,25 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
                 */
                attr.sample_period = 0;
        }
+       if (pebs) {
+               /*
+        * A non-zero precision level turns an ordinary guest event into a
+        * guest PEBS event and triggers the host PEBS PMI handler, which
+        * then determines whether the PEBS overflow PMI comes from the host
+        * counters or the guest.
+                *
+                * For most PEBS hardware events, the difference in the software
+                * precision levels of guest and host PEBS events will not affect
+                * the accuracy of the PEBS profiling result, because the "event IP"
+                * in the PEBS record is calibrated on the guest side.
+                *
+                * On Icelake everything is fine. Other hardware (GLC+, TNT+) that
+                * could possibly care here is unsupported and needs changes.
+                */
+               attr.precise_ip = 1;
+               if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32)
+                       attr.precise_ip = 3;
+       }
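	/*
	 * Aside, not part of the patch: perf's precise_ip levels, paraphrased
	 * from include/uapi/linux/perf_event.h, are
	 *   0 - SAMPLE_IP can have arbitrary skid
	 *   1 - SAMPLE_IP must have constant skid
	 *   2 - SAMPLE_IP requested to have 0 skid
	 *   3 - SAMPLE_IP must have 0 skid
	 * pmc->idx == 32 above is INTEL_PMC_IDX_FIXED, i.e. fixed counter 0,
	 * presumably the only counter granted the maximum level on the Ice
	 * Lake server parts matched by vmx_icl_pebs_cpu.
	 */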
 
        event = perf_event_create_kernel_counter(&attr, -1, current,
                                                 kvm_perf_overflow, pmc);
@@ -163,7 +201,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
        pmc_to_pmu(pmc)->event_count++;
        clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
        pmc->is_paused = false;
-       pmc->intr = intr;
+       pmc->intr = intr || pebs;
 }
 
 static void pmc_pause_counter(struct kvm_pmc *pmc)
@@ -189,6 +227,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
                              get_sample_period(pmc, pmc->counter)))
                return false;
 
+       if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) &&
+           pmc->perf_event->attr.precise_ip)
+               return false;
+
        /* reuse perf_event to serve as pmc_reprogram_counter() does */
        perf_event_enable(pmc->perf_event);
        pmc->is_paused = false;
@@ -205,115 +247,83 @@ static int cmp_u64(const void *pa, const void *pb)
        return (a > b) - (a < b);
 }
 
-void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+static bool check_pmu_event_filter(struct kvm_pmc *pmc)
 {
-       u64 config;
-       u32 type = PERF_TYPE_RAW;
-       struct kvm *kvm = pmc->vcpu->kvm;
        struct kvm_pmu_event_filter *filter;
-       struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
+       struct kvm *kvm = pmc->vcpu->kvm;
        bool allow_event = true;
+       __u64 key;
+       int idx;
 
-       if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
-               printk_once("kvm pmu: pin control bit is ignored\n");
-
-       pmc->eventsel = eventsel;
-
-       pmc_pause_counter(pmc);
-
-       if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
-               return;
+       if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
+               return false;
 
        filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
-       if (filter) {
-               __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
+       if (!filter)
+               goto out;
 
+       if (pmc_is_gp(pmc)) {
+               key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB;
                if (bsearch(&key, filter->events, filter->nevents,
                            sizeof(__u64), cmp_u64))
                        allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
                else
                        allow_event = filter->action == KVM_PMU_EVENT_DENY;
+       } else {
+               idx = pmc->idx - INTEL_PMC_IDX_FIXED;
+               if (filter->action == KVM_PMU_EVENT_DENY &&
+                   test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
+                       allow_event = false;
+               if (filter->action == KVM_PMU_EVENT_ALLOW &&
+                   !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
+                       allow_event = false;
        }
-       if (!allow_event)
-               return;
-
-       if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
-                         ARCH_PERFMON_EVENTSEL_INV |
-                         ARCH_PERFMON_EVENTSEL_CMASK |
-                         HSW_IN_TX |
-                         HSW_IN_TX_CHECKPOINTED))) {
-               config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
-               if (config != PERF_COUNT_HW_MAX)
-                       type = PERF_TYPE_HARDWARE;
-       }
-
-       if (type == PERF_TYPE_RAW)
-               config = eventsel & pmu->raw_event_mask;
-
-       if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
-               return;
-
-       pmc_release_perf_event(pmc);
 
-       pmc->current_config = eventsel;
-       pmc_reprogram_counter(pmc, type, config,
-                             !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
-                             !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
-                             eventsel & ARCH_PERFMON_EVENTSEL_INT);
+out:
+       return allow_event;
 }
-EXPORT_SYMBOL_GPL(reprogram_gp_counter);
 
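check_pmu_event_filter() consumes a filter that userspace installs with the KVM_SET_PMU_EVENT_FILTER VM ioctl. As a rough, hypothetical sketch of that side of the interface (the event encoding below is a placeholder and error handling is minimal):

#include <linux/kvm.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

/* Install a deny-list PMU event filter on an existing KVM VM fd. */
static int example_set_pmu_filter(int vm_fd)
{
	/* Placeholder event key: event select | (unit mask << 8). */
	const __u64 events[] = { 0x003c };
	struct kvm_pmu_event_filter *filter;
	size_t sz = sizeof(*filter) + sizeof(events);
	int ret;

	filter = calloc(1, sz);	/* flags/pad must be zero */
	if (!filter)
		return -1;

	filter->action = KVM_PMU_EVENT_DENY;	/* deny only the listed events */
	filter->nevents = 1;	/* bounded by KVM_PMU_EVENT_FILTER_MAX_EVENTS */
	filter->fixed_counter_bitmap = 0;	/* a set bit would deny that fixed counter */
	memcpy(filter->events, events, sizeof(events));

	ret = ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, filter);
	free(filter);
	return ret;
}

With KVM_PMU_EVENT_ALLOW the meaning inverts: only listed events are programmable, and a fixed counter stays usable only if its bit is set in fixed_counter_bitmap, matching the gp and fixed branches of check_pmu_event_filter() above.
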
-void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
+void reprogram_counter(struct kvm_pmc *pmc)
 {
-       unsigned en_field = ctrl & 0x3;
-       bool pmi = ctrl & 0x8;
-       struct kvm_pmu_event_filter *filter;
-       struct kvm *kvm = pmc->vcpu->kvm;
+       struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+       u64 eventsel = pmc->eventsel;
+       u64 new_config = eventsel;
+       u8 fixed_ctr_ctrl;
 
        pmc_pause_counter(pmc);
 
-       if (!en_field || !pmc_is_enabled(pmc))
+       if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
                return;
 
-       filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
-       if (filter) {
-               if (filter->action == KVM_PMU_EVENT_DENY &&
-                   test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
-                       return;
-               if (filter->action == KVM_PMU_EVENT_ALLOW &&
-                   !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
-                       return;
-       }
-
-       if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
+       if (!check_pmu_event_filter(pmc))
                return;
 
-       pmc_release_perf_event(pmc);
-
-       pmc->current_config = (u64)ctrl;
-       pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
-                             static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc),
-                             !(en_field & 0x2), /* exclude user */
-                             !(en_field & 0x1), /* exclude kernel */
-                             pmi);
-}
-EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
+       if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+               printk_once("kvm pmu: pin control bit is ignored\n");
 
-void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
-{
-       struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx);
+       if (pmc_is_fixed(pmc)) {
+               fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+                                                 pmc->idx - INTEL_PMC_IDX_FIXED);
+               if (fixed_ctr_ctrl & 0x1)
+                       eventsel |= ARCH_PERFMON_EVENTSEL_OS;
+               if (fixed_ctr_ctrl & 0x2)
+                       eventsel |= ARCH_PERFMON_EVENTSEL_USR;
+               if (fixed_ctr_ctrl & 0x8)
+                       eventsel |= ARCH_PERFMON_EVENTSEL_INT;
+               new_config = (u64)fixed_ctr_ctrl;
+       }
 
-       if (!pmc)
+       if (pmc->current_config == new_config && pmc_resume_counter(pmc))
                return;
 
-       if (pmc_is_gp(pmc))
-               reprogram_gp_counter(pmc, pmc->eventsel);
-       else {
-               int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
-               u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
+       pmc_release_perf_event(pmc);
 
-               reprogram_fixed_counter(pmc, ctrl, idx);
-       }
+       pmc->current_config = new_config;
+       pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+                             (eventsel & pmu->raw_event_mask),
+                             !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+                             !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+                             eventsel & ARCH_PERFMON_EVENTSEL_INT);
 }
 EXPORT_SYMBOL_GPL(reprogram_counter);
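
For reference, the 0x1/0x2/0x8 masks applied to the fixed_ctr_ctrl nibble above follow the per-counter field layout of IA32_FIXED_CTR_CTRL (one 4-bit field per fixed counter, extracted by fixed_ctrl_field()); bit 2, AnyThread, has no eventsel equivalent and is not translated:

/*
 * IA32_FIXED_CTR_CTRL, per-counter nibble (Intel SDM):
 *   bit 0 (0x1) - count at CPL 0     -> ARCH_PERFMON_EVENTSEL_OS
 *   bit 1 (0x2) - count at CPL > 0   -> ARCH_PERFMON_EVENTSEL_USR
 *   bit 2 (0x4) - AnyThread          (not translated above)
 *   bit 3 (0x8) - PMI on overflow    -> ARCH_PERFMON_EVENTSEL_INT
 */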
 
@@ -329,8 +339,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
                        clear_bit(bit, pmu->reprogram_pmi);
                        continue;
                }
-
-               reprogram_counter(pmu, bit);
+               reprogram_counter(pmc);
        }
 
        /*
@@ -471,17 +480,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
        kvm_pmu_refresh(vcpu);
 }
 
-static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
-{
-       struct kvm_pmu *pmu = pmc_to_pmu(pmc);
-
-       if (pmc_is_fixed(pmc))
-               return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
-                       pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
-
-       return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
-}
-
 /* Release perf_events for vPMCs that have been unused for a full time slice.  */
 void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
 {
@@ -514,13 +512,12 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
 
 static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
 {
-       struct kvm_pmu *pmu = pmc_to_pmu(pmc);
        u64 prev_count;
 
        prev_count = pmc->counter;
        pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
 
-       reprogram_counter(pmu, pmc->idx);
+       reprogram_counter(pmc);
        if (pmc->counter < prev_count)
                __kvm_perf_overflow(pmc, false);
 }
@@ -528,13 +525,8 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
 static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
        unsigned int perf_hw_id)
 {
-       u64 old_eventsel = pmc->eventsel;
-       unsigned int config;
-
-       pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
-       config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
-       pmc->eventsel = old_eventsel;
-       return config == perf_hw_id;
+       return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
+               AMD64_RAW_EVENT_MASK_NB);
 }
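
A note on the mask used above: AMD64_RAW_EVENT_MASK_NB keeps only the event-identity fields of the selector, so the comparison ignores the enable/USR/OS/INT control bits; it is also the key check_pmu_event_filter() builds for gp counters.

/*
 * AMD64_RAW_EVENT_MASK_NB (arch/x86/include/asm/perf_event.h) is
 * AMD64_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK: the event-select
 * bits (including AMD's extended bits 35:32) plus the unit mask.
 */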
 
 static inline bool cpl_is_matched(struct kvm_pmc *pmc)