Merge tag 'kvm-x86-misc-6.7' of https://github.com/kvm-x86/linux into HEAD
author Paolo Bonzini <pbonzini@redhat.com>
Tue, 31 Oct 2023 14:15:15 +0000 (10:15 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
Tue, 31 Oct 2023 14:15:15 +0000 (10:15 -0400)
KVM x86 misc changes for 6.7:

 - Add CONFIG_KVM_MAX_NR_VCPUS to allow supporting up to 4096 vCPUs without
   forcing more common use cases to eat the extra memory overhead.

 - Add IBPB and SBPB virtualization support.

 - Fix a bug where restoring a vCPU snapshot that was taken within 1 second of
   creating the original vCPU would cause KVM to try to synchronize the vCPU's
   TSC and thus clobber the correct TSC being set by userspace.

 - Compute guest wall clock using a single TSC read to avoid generating an
   inaccurate time, e.g. if the vCPU is preempted between multiple TSC reads.

 - "Virtualize" HWCR.TscFreqSel to make Linux guests happy, which complain
    about a "Firmware Bug" if the bit isn't set for select F/M/S combos.

 - Don't apply side effects to Hyper-V's synthetic timer on writes from
   userspace to fix an issue where the auto-enable behavior can trigger
   spurious interrupts, i.e. do auto-enabling only for guest writes.

 - Remove an unnecessary kick of all vCPUs when synchronizing the dirty log
   without PML enabled.

 - Advertise "support" for non-serializing FS/GS base MSR writes as appropriate.

 - Use octal notation for file permissions throughout KVM x86.

 - Fix a handful of typos and warts.

15 files changed:
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/msr-index.h
arch/x86/kvm/Kconfig
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/hyperv.c
arch/x86/kvm/smm.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/kvm/xen.c
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c [new file with mode: 0644]

index 58cb9495e40f426323cd65569cd53145e7b2f9e7..4af140cf5719e6d75f5206d614002210bed782ca 100644 (file)
 
 /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
 #define X86_FEATURE_NO_NESTED_DATA_BP  (20*32+ 0) /* "" No Nested Data Breakpoints */
+#define X86_FEATURE_WRMSR_XX_BASE_NS   (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
 #define X86_FEATURE_LFENCE_RDTSC       (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
 #define X86_FEATURE_NULL_SEL_CLR_BASE  (20*32+ 6) /* "" Null Selector Clears Base */
 #define X86_FEATURE_AUTOIBRS           (20*32+ 8) /* "" Automatic IBRS */
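
For illustration, a guest could detect this enumeration directly; a minimal
kernel-context sketch (the helper name is illustrative; cpuid_eax() and BIT()
per the usual x86 headers):

static bool wrmsr_base_is_non_serializing(void)
{
	/* CPUID leaf 0x80000021 EAX bit 1, matching the define above. */
	return cpuid_eax(0x80000021) & BIT(1);
}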
index fb9f5fa96cc964d39041e2757773a21754dffc2c..db02305eb9e3d4932dfef07248af33709e0f66d5 100644 (file)
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
 
+/*
+ * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n; provide a dummy max if
+ * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS).
+ */
+#ifdef CONFIG_KVM_MAX_NR_VCPUS
+#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
+#else
 #define KVM_MAX_VCPUS 1024
+#endif
 
 /*
  * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
@@ -1275,7 +1283,6 @@ struct kvm_arch {
         */
        spinlock_t mmu_unsync_pages_lock;
 
-       struct list_head assigned_dev_head;
        struct iommu_domain *iommu_domain;
        bool iommu_noncoherent;
 #define __KVM_HAVE_ARCH_NONCOHERENT_DMA
@@ -1323,6 +1330,7 @@ struct kvm_arch {
        int nr_vcpus_matched_tsc;
 
        u32 default_tsc_khz;
+       bool user_set_tsc;
 
        seqcount_raw_spinlock_t pvclock_sc;
        bool use_master_clock;
@@ -1691,7 +1699,7 @@ struct kvm_x86_ops {
 
        void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
 
-       void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+       void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
 
        /*
         * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer.  A zero
index b37abb55e948b7665d67c95e830dec3e11bed41c..389f9594746ef58903c59d31a88c2d1e56bd1e25 100644 (file)
 #define MSR_AMD64_CPUID_FN_1           0xc0011004
 #define MSR_AMD64_LS_CFG               0xc0011020
 #define MSR_AMD64_DC_CFG               0xc0011022
+#define MSR_AMD64_TW_CFG               0xc0011023
 
 #define MSR_AMD64_DE_CFG               0xc0011029
 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT   1
index ed90f148140dfe093bed15a033b6e322a3cb5f2a..950c12868d304004ff56e7bc95f7c5f395766a33 100644 (file)
@@ -154,4 +154,15 @@ config KVM_PROVE_MMU
 config KVM_EXTERNAL_WRITE_TRACKING
        bool
 
+config KVM_MAX_NR_VCPUS
+       int "Maximum number of vCPUs per KVM guest"
+       depends on KVM
+       range 1024 4096
+       default 4096 if MAXSMP
+       default 1024
+       help
+         Set the maximum number of vCPUs per KVM guest. Larger values will increase
+         the memory footprint of each KVM guest, regardless of how many vCPUs are
+         created for a given VM.
+
 endif # VIRTUALIZATION
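
The compile-time limit is visible to userspace via KVM_CHECK_EXTENSION, so a
VMM can size its data structures without knowing the Kconfig value; a
hypothetical standalone probe:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);

	if (kvm < 0)
		return 1;

	/* KVM_CAP_MAX_VCPUS is the hard per-VM limit (KVM_MAX_VCPUS, i.e.
	 * CONFIG_KVM_MAX_NR_VCPUS with this series); KVM_CAP_NR_VCPUS is
	 * the recommended count. */
	printf("recommended: %d, max: %d\n",
	       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS),
	       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS));
	return 0;
}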
index 773132c3bf5af760827f8dd0ab9601de702744a5..6fb3249ae683be3cdb7b5916986482f4c34e3428 100644 (file)
@@ -753,11 +753,13 @@ void kvm_set_cpu_caps(void)
 
        kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
                F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
-               F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
+               F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ |
+               F(WRMSR_XX_BASE_NS)
        );
 
-       if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
-               kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+       kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
+       kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
+       kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
 
        kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
                F(PERFMON_V2)
index 284fa4704553da1345731927225084e7daeeba4b..0b90532b6e261430c7997e933f59f5531312d627 100644 (file)
@@ -174,7 +174,8 @@ static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
 static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
 {
        return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
-               guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+               guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) ||
+               guest_cpuid_has(vcpu, X86_FEATURE_SBPB));
 }
 
 static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
index 7c2dac6824e262e0e82f1ed1d7bd564598006713..238afd7335e46d7fcce8d8ad096136cf35bf70b1 100644 (file)
@@ -727,10 +727,12 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
 
        stimer_cleanup(stimer);
        stimer->count = count;
-       if (stimer->count == 0)
-               stimer->config.enable = 0;
-       else if (stimer->config.auto_enable)
-               stimer->config.enable = 1;
+       if (!host) {
+               if (stimer->count == 0)
+                       stimer->config.enable = 0;
+               else if (stimer->config.auto_enable)
+                       stimer->config.enable = 1;
+       }
 
        if (stimer->config.enable)
                stimer_mark_pending(stimer, false);
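
A hypothetical selftest fragment for the new semantics (assumes a vCPU with
Hyper-V enlightenments already configured; MSR and bit names per
hyperv-tlfs.h):

/* A host-initiated write to the count must no longer auto-enable. */
vcpu_set_msr(vcpu, HV_X64_MSR_STIMER0_CONFIG, HV_STIMER_AUTOENABLE);
vcpu_set_msr(vcpu, HV_X64_MSR_STIMER0_COUNT, 1000);
TEST_ASSERT(!(vcpu_get_msr(vcpu, HV_X64_MSR_STIMER0_CONFIG) &
	      HV_STIMER_ENABLE),
	    "Host write to STIMER0_COUNT must not set the enable bit");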
index b42111a24cc28de2b680b9b495d6f3cb256541ef..dc3d95fdca7d337ef4305123b8f439d2103c8b30 100644 (file)
@@ -324,7 +324,6 @@ void enter_smm(struct kvm_vcpu *vcpu)
 
        cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
        static_call(kvm_x86_set_cr0)(vcpu, cr0);
-       vcpu->arch.cr0 = cr0;
 
        static_call(kvm_x86_set_cr4)(vcpu, 0);
 
index beea99c8e8e05e3bd3427b3ca4c7245f5ef1a27b..eb234cdd370b647d3186e6fac3f8758af057c12c 100644 (file)
@@ -199,7 +199,7 @@ module_param_named(npt, npt_enabled, bool, 0444);
 
 /* allow nested virtualization in KVM/SVM */
 static int nested = true;
-module_param(nested, int, S_IRUGO);
+module_param(nested, int, 0444);
 
 /* enable/disable Next RIP Save */
 int nrips = true;
index 9bba5352582c35615b7c076fb54b4601b8db108e..c721a6785909d60d59a0788190ea1b62485d6ae0 100644 (file)
@@ -82,28 +82,28 @@ bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
 static bool __read_mostly enable_vnmi = 1;
-module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+module_param_named(vnmi, enable_vnmi, bool, 0444);
 
 bool __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
 
 bool __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
+module_param_named(ept, enable_ept, bool, 0444);
 
 bool __read_mostly enable_unrestricted_guest = 1;
 module_param_named(unrestricted_guest,
-                       enable_unrestricted_guest, bool, S_IRUGO);
+                       enable_unrestricted_guest, bool, 0444);
 
 bool __read_mostly enable_ept_ad_bits = 1;
-module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
 
 static bool __read_mostly emulate_invalid_guest_state = true;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+module_param(emulate_invalid_guest_state, bool, 0444);
 
 static bool __read_mostly fasteoi = 1;
-module_param(fasteoi, bool, S_IRUGO);
+module_param(fasteoi, bool, 0444);
 
-module_param(enable_apicv, bool, S_IRUGO);
+module_param(enable_apicv, bool, 0444);
 
 bool __read_mostly enable_ipiv = true;
 module_param(enable_ipiv, bool, 0444);
@@ -114,10 +114,10 @@ module_param(enable_ipiv, bool, 0444);
  * use VMX instructions.
  */
 static bool __read_mostly nested = 1;
-module_param(nested, bool, S_IRUGO);
+module_param(nested, bool, 0444);
 
 bool __read_mostly enable_pml = 1;
-module_param_named(pml, enable_pml, bool, S_IRUGO);
+module_param_named(pml, enable_pml, bool, 0444);
 
 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
index 41cce5031126a02602bcc9163d98eb9e58dd8630..4937aa50d40b069d0edf3730bc32d849b1945111 100644 (file)
@@ -145,21 +145,21 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
 EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
 
 static bool __read_mostly ignore_msrs = 0;
-module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_msrs, bool, 0644);
 
 bool __read_mostly report_ignored_msrs = true;
-module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(report_ignored_msrs, bool, 0644);
 EXPORT_SYMBOL_GPL(report_ignored_msrs);
 
 unsigned int min_timer_period_us = 200;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+module_param(min_timer_period_us, uint, 0644);
 
 static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+module_param(kvmclock_periodic_sync, bool, 0444);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
-module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+module_param(tsc_tolerance_ppm, uint, 0644);
 
 /*
  * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
@@ -168,13 +168,13 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
  * tuning, i.e. allows privileged userspace to set an exact advancement time.
  */
 static int __read_mostly lapic_timer_advance_ns = -1;
-module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
+module_param(lapic_timer_advance_ns, int, 0644);
 
 static bool __read_mostly vector_hashing = true;
-module_param(vector_hashing, bool, S_IRUGO);
+module_param(vector_hashing, bool, 0444);
 
 bool __read_mostly enable_vmware_backdoor = false;
-module_param(enable_vmware_backdoor, bool, S_IRUGO);
+module_param(enable_vmware_backdoor, bool, 0444);
 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
 
 /*
@@ -186,7 +186,7 @@ static int __read_mostly force_emulation_prefix;
 module_param(force_emulation_prefix, int, 0644);
 
 int __read_mostly pi_inject_timer = -1;
-module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+module_param(pi_inject_timer, bint, 0644);
 
 /* Enable/disable PMU virtualization */
 bool __read_mostly enable_pmu = true;
@@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_o
        if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
                return;
 
-       /*
-        * The guest calculates current wall clock time by adding
-        * system time (updated by kvm_guest_time_update below) to the
-        * wall clock specified here.  We do the reverse here.
-        */
-       wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+       wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
-       wc.nsec = do_div(wall_nsec, 1000000000);
+       wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
        wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
        wc.version = version;
 
@@ -2714,8 +2709,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
        kvm_track_tsc_matching(vcpu);
 }
 
-static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
 {
+       u64 data = user_value ? *user_value : 0;
        struct kvm *kvm = vcpu->kvm;
        u64 offset, ns, elapsed;
        unsigned long flags;
@@ -2730,25 +2726,37 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        if (vcpu->arch.virtual_tsc_khz) {
                if (data == 0) {
                        /*
-                        * detection of vcpu initialization -- need to sync
-                        * with other vCPUs. This particularly helps to keep
-                        * kvm_clock stable after CPU hotplug
+                        * Force synchronization when creating a vCPU, or when
+                        * userspace explicitly writes a zero value.
                         */
                        synchronizing = true;
-               } else {
+               } else if (kvm->arch.user_set_tsc) {
                        u64 tsc_exp = kvm->arch.last_tsc_write +
                                                nsec_to_cycles(vcpu, elapsed);
                        u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
                        /*
-                        * Special case: TSC write with a small delta (1 second)
-                        * of virtual cycle time against real time is
-                        * interpreted as an attempt to synchronize the CPU.
+                        * Here lies UAPI baggage: when a user-initiated TSC write has
+                        * a small delta (1 second) of virtual cycle time against the
+                        * previously set vCPU, we assume that they were intended to be
+                        * in sync and the delta was only due to the racy nature of the
+                        * legacy API.
+                        *
+                        * This trick falls down when restoring a guest which genuinely
+                        * has been running for less time than the 1 second of imprecision
+                        * which we allow for in the legacy API. In this case, the first
+                        * value written by userspace (on any vCPU) should not be subject
+                        * to this 'correction' to make it sync up with values that only
+                        * come from the kernel's default vCPU creation. Make the 1-second
+                        * slop hack only trigger if the user_set_tsc flag is already set.
                         */
                        synchronizing = data < tsc_exp + tsc_hz &&
                                        data + tsc_hz > tsc_exp;
                }
        }
 
+       if (user_value)
+               kvm->arch.user_set_tsc = true;
+
        /*
         * For a reliable TSC, we can match TSC offsets, and for an unstable
         * TSC, we add elapsed time in this computation.  We could let the
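
From the VMM side, restoring a snapshot is just a host-initiated KVM_SET_MSRS;
a sketch of the flow this fixes (vcpu_fd and saved_tsc are assumed to come
from the VMM's snapshot machinery; MSR_IA32_TSC is architecturally 0x10):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void restore_tsc(int vcpu_fd, uint64_t saved_tsc)
{
	struct {
		struct kvm_msrs hdr;
		struct kvm_msr_entry entry;
	} msrs = {};

	msrs.hdr.nmsrs = 1;
	msrs.entry.index = 0x00000010;	/* MSR_IA32_TSC */
	msrs.entry.data = saved_tsc;

	/* Host-initiated, so kvm_synchronize_tsc() sees a non-NULL
	 * user_value and sets user_set_tsc; the 1-second slop heuristic
	 * then only kicks in for subsequent user writes. */
	ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);
}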
@@ -3241,6 +3249,82 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        return 0;
 }
 
+/*
+ * The pvclock_wall_clock ABI tells the guest the wall clock time at
+ * which it started (i.e. its epoch, when its kvmclock was zero).
+ *
+ * In fact those clocks are subtly different; wall clock frequency is
+ * adjusted by NTP and has leap seconds, while the kvmclock is a
+ * simple function of the TSC without any such adjustment.
+ *
+ * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
+ * that and kvmclock, but even that would be subject to change over
+ * time.
+ *
+ * Attempt to calculate the epoch at a given moment using the *same*
+ * TSC reading via kvm_get_walltime_and_clockread() to obtain both
+ * wallclock and kvmclock times, and subtracting one from the other.
+ *
+ * Fall back to using their values at slightly different moments by
+ * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
+ */
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+       struct pvclock_vcpu_time_info hv_clock;
+       struct kvm_arch *ka = &kvm->arch;
+       unsigned long seq, local_tsc_khz;
+       struct timespec64 ts;
+       uint64_t host_tsc;
+
+       do {
+               seq = read_seqcount_begin(&ka->pvclock_sc);
+
+               local_tsc_khz = 0;
+               if (!ka->use_master_clock)
+                       break;
+
+               /*
+                * The TSC read and the call to get_cpu_tsc_khz() must happen
+                * on the same CPU.
+                */
+               get_cpu();
+
+               local_tsc_khz = get_cpu_tsc_khz();
+
+               if (local_tsc_khz &&
+                   !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+                       local_tsc_khz = 0; /* Fall back to old method */
+
+               put_cpu();
+
+               /*
+                * These values must be snapshotted within the seqcount loop.
+                * After that, it's just mathematics which can happen on any
+                * CPU at any time.
+                */
+               hv_clock.tsc_timestamp = ka->master_cycle_now;
+               hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+       } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+       /*
+        * If the conditions were right, and obtaining the wallclock+TSC was
+        * successful, calculate the KVM clock at the corresponding time and
+        * subtract one from the other to get the guest's epoch in nanoseconds
+        * since 1970-01-01.
+        */
+       if (local_tsc_khz) {
+               kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
+                                  &hv_clock.tsc_shift,
+                                  &hv_clock.tsc_to_system_mul);
+               return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+                       __pvclock_read_cycles(&hv_clock, host_tsc);
+       }
+#endif
+       return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+}
+
 /*
  * kvmclock updates which are isolated to a given vcpu, such as
  * vcpu->cpu migration, should not allow system_timestamp from
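
For reference, the guest-side consumer of this epoch (per the pvclock ABI)
simply adds the current kvmclock reading, which is why an epoch derived from
two different TSC reads skews the guest's wall clock; a sketch:

static uint64_t guest_wall_clock_ns(const struct pvclock_wall_clock *wc,
				    uint64_t kvmclock_ns)
{
	/* Wall time = epoch written by the host + current kvmclock. */
	return (uint64_t)wc->sec * NSEC_PER_SEC + wc->nsec + kvmclock_ns;
}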
@@ -3290,9 +3374,6 @@ static void kvmclock_sync_fn(struct work_struct *work)
                                           kvmclock_sync_work);
        struct kvm *kvm = container_of(ka, struct kvm, arch);
 
-       if (!kvmclock_periodic_sync)
-               return;
-
        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
                                        KVMCLOCK_SYNC_PERIOD);
@@ -3641,6 +3722,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_AMD64_PATCH_LOADER:
        case MSR_AMD64_BU_CFG2:
        case MSR_AMD64_DC_CFG:
+       case MSR_AMD64_TW_CFG:
        case MSR_F15H_EX_CFG:
                break;
 
@@ -3670,17 +3752,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.perf_capabilities = data;
                kvm_pmu_refresh(vcpu);
                break;
-       case MSR_IA32_PRED_CMD:
-               if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
-                       return 1;
+       case MSR_IA32_PRED_CMD: {
+               u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
+
+               if (!msr_info->host_initiated) {
+                       if (!guest_has_pred_cmd_msr(vcpu))
+                               return 1;
 
-               if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+                       if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+                           !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+                               reserved_bits |= PRED_CMD_IBPB;
+
+                       if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
+                               reserved_bits |= PRED_CMD_SBPB;
+               }
+
+               if (!boot_cpu_has(X86_FEATURE_IBPB))
+                       reserved_bits |= PRED_CMD_IBPB;
+
+               if (!boot_cpu_has(X86_FEATURE_SBPB))
+                       reserved_bits |= PRED_CMD_SBPB;
+
+               if (data & reserved_bits)
                        return 1;
+
                if (!data)
                        break;
 
-               wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+               wrmsrl(MSR_IA32_PRED_CMD, data);
                break;
+       }
        case MSR_IA32_FLUSH_CMD:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
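
Guest-side, IBPB usage is unchanged and SBPB is a cheaper variant that flushes
only branch type predictions; a kernel-context sketch (helper name
illustrative; PRED_CMD_SBPB is BIT(7) and PRED_CMD_IBPB is BIT(0) per
msr-index.h):

static void branch_prediction_barrier(void)
{
	/* Prefer the selective barrier when enumerated. */
	if (boot_cpu_has(X86_FEATURE_SBPB))
		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_SBPB);
	else if (boot_cpu_has(X86_FEATURE_IBPB))
		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
}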
@@ -3700,13 +3801,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
                data &= ~(u64)0x8;      /* ignore TLB cache disable */
 
-               /* Handle McStatusWrEn */
-               if (data == BIT_ULL(18)) {
-                       vcpu->arch.msr_hwcr = data;
-               } else if (data != 0) {
+               /*
+                * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+                * through at least v6.6 whine if TscFreqSel is clear,
+                * depending on F/M/S.)
+                */
+               if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
                        kvm_pr_unimpl_wrmsr(vcpu, msr, data);
                        return 1;
                }
+               vcpu->arch.msr_hwcr = data;
                break;
        case MSR_FAM10H_MMIO_CONF_BASE:
                if (data != 0) {
@@ -3777,7 +3881,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_TSC:
                if (msr_info->host_initiated) {
-                       kvm_synchronize_tsc(vcpu, data);
+                       kvm_synchronize_tsc(vcpu, &data);
                } else {
                        u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
                        adjust_tsc_offset_guest(vcpu, adj);
@@ -4065,6 +4169,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_AMD64_BU_CFG2:
        case MSR_IA32_PERF_CTL:
        case MSR_AMD64_DC_CFG:
+       case MSR_AMD64_TW_CFG:
        case MSR_F15H_EX_CFG:
        /*
         * Intel Sandy Bridge CPUs must support the RAPL (running average power
@@ -5547,6 +5652,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
                tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
                ns = get_kvmclock_base_ns();
 
+               kvm->arch.user_set_tsc = true;
                __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
                raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
@@ -6259,6 +6365,9 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
        struct kvm_vcpu *vcpu;
        unsigned long i;
 
+       if (!kvm_x86_ops.cpu_dirty_log_size)
+               return;
+
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_vcpu_kick(vcpu);
 }
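
This path runs under the dirty-log ioctls; a hypothetical VMM fragment for
context (vm_fd, slot_id, and bitmap are assumed to be set up by the VMM):

struct kvm_dirty_log log = {
	.slot = slot_id,		/* memslot being harvested */
	.dirty_bitmap = bitmap,		/* caller-allocated bitmap */
};

/* Syncing the log only needs to kick vCPUs when they buffer dirty state
 * in hardware (VMX PML); otherwise the kicks were pure overhead. */
ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);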
@@ -11532,7 +11641,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
 
        *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
        static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
-       vcpu->arch.cr0 = sregs->cr0;
 
        *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
        static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
@@ -11576,8 +11684,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        if (ret)
                return ret;
 
-       if (mmu_reset_needed)
+       if (mmu_reset_needed) {
                kvm_mmu_reset_context(vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+       }
 
        max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
@@ -11618,8 +11728,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
                mmu_reset_needed = 1;
                vcpu->arch.pdptrs_from_userspace = true;
        }
-       if (mmu_reset_needed)
+       if (mmu_reset_needed) {
                kvm_mmu_reset_context(vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+       }
        return 0;
 }
 
@@ -11970,7 +12082,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        if (mutex_lock_killable(&vcpu->mutex))
                return;
        vcpu_load(vcpu);
-       kvm_synchronize_tsc(vcpu, 0);
+       kvm_synchronize_tsc(vcpu, NULL);
        vcpu_put(vcpu);
 
        /* poll control enabled by default */
@@ -12326,7 +12438,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
                goto out_uninit_mmu;
 
        INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
-       INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
index 1e7be1f6ab299d78a76e76385db159dee679220b..5184fde1dc541a90ca150f16d71eb2bd9506b4af 100644 (file)
@@ -293,6 +293,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 u64 get_kvmclock_ns(struct kvm *kvm);
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
        gva_t addr, void *val, unsigned int bytes,
index 40edf4d1974c530336e9f9044fd3b18b18ea8de3..b946d9f280306724cf34091a65358fee6217639a 100644 (file)
@@ -59,7 +59,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
                 * This code mirrors kvm_write_wall_clock() except that it writes
                 * directly through the pfn cache and doesn't mark the page dirty.
                 */
-               wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+               wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
                /* It could be invalid again already, so we need to check */
                read_lock_irq(&gpc->lock);
@@ -98,7 +98,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
        wc_version = wc->version = (wc->version + 1) | 1;
        smp_wmb();
 
-       wc->nsec = do_div(wall_nsec,  1000000000);
+       wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
        wc->sec = (u32)wall_nsec;
        *wc_sec_hi = wall_nsec >> 32;
        smp_wmb();
index a3bb36fb3cfc55a423d31c10e86c1fe16b79ca0e..fb01c3f8d3da2a26c515ab8a1c3d0a86b54d0032 100644 (file)
@@ -66,6 +66,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
 TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test
 TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
+TEST_GEN_PROGS_x86_64 += x86_64/hwcr_msr_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_evmcs
diff --git a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c
new file mode 100644 (file)
index 0000000..df351ae
--- /dev/null
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023, Google LLC.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "vmx.h"
+
+void test_hwcr_bit(struct kvm_vcpu *vcpu, unsigned int bit)
+{
+       const uint64_t ignored = BIT_ULL(3) | BIT_ULL(6) | BIT_ULL(8);
+       const uint64_t valid = BIT_ULL(18) | BIT_ULL(24);
+       const uint64_t legal = ignored | valid;
+       uint64_t val = BIT_ULL(bit);
+       uint64_t actual;
+       int r;
+
+       r = _vcpu_set_msr(vcpu, MSR_K7_HWCR, val);
+       TEST_ASSERT(val & ~legal ? !r : r == 1,
+                   "Expected KVM_SET_MSRS(MSR_K7_HWCR) = 0x%lx to %s",
+                   val, val & ~legal ? "fail" : "succeed");
+
+       actual = vcpu_get_msr(vcpu, MSR_K7_HWCR);
+       TEST_ASSERT(actual == (val & valid),
+                   "Bit %u: unexpected HWCR 0x%lx; expected 0x%lx",
+                   bit, actual, (val & valid));
+
+       vcpu_set_msr(vcpu, MSR_K7_HWCR, 0);
+}
+
+int main(int argc, char *argv[])
+{
+       struct kvm_vm *vm;
+       struct kvm_vcpu *vcpu;
+       unsigned int bit;
+
+       vm = vm_create_with_one_vcpu(&vcpu, NULL);
+
+       for (bit = 0; bit < BITS_PER_LONG; bit++)
+               test_hwcr_bit(vcpu, bit);
+
+       kvm_vm_free(vm);
+}