Merge tag 'kvm-x86-misc-6.7' of https://github.com/kvm-x86/linux into HEAD
author Paolo Bonzini <pbonzini@redhat.com>
Tue, 31 Oct 2023 14:15:15 +0000 (10:15 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
Tue, 31 Oct 2023 14:15:15 +0000 (10:15 -0400)
KVM x86 misc changes for 6.7:

 - Add CONFIG_KVM_MAX_NR_VCPUS to allow supporting up to 4096 vCPUs without
   forcing more common use cases to eat the extra memory overhead.

 - Add IBPB and SBPB virtualization support.

 - Fix a bug where restoring a vCPU snapshot that was taken within 1 second of
   creating the original vCPU would cause KVM to try to synchronize the vCPU's
   TSC and thus clobber the correct TSC being set by userspace.

 - Compute guest wall clock using a single TSC read to avoid generating an
   inaccurate time, e.g. if the vCPU is preempted between multiple TSC reads.

 - "Virtualize" HWCR.TscFreqSel to make Linux guests happy, which complain
    about a "Firmware Bug" if the bit isn't set for select F/M/S combos.

 - Don't apply side effects to Hyper-V's synthetic timer on writes from
   userspace to fix an issue where the auto-enable behavior can trigger
   spurious interrupts, i.e. do auto-enabling only for guest writes.

 - Remove an unnecessary kick of all vCPUs when synchronizing the dirty log
   without PML enabled.

 - Advertise "support" for non-serializing FS/GS base MSR writes as appropriate.

 - Use octal notation for file permissions throughout KVM x86.

 - Fix a handful of typos and warts.

15 files changed:
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/msr-index.h
arch/x86/kvm/Kconfig
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/hyperv.c
arch/x86/kvm/smm.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/kvm/xen.c
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c [new file with mode: 0644]

index 58cb9495e40f426323cd65569cd53145e7b2f9e7..4af140cf5719e6d75f5206d614002210bed782ca 100644 (file)
 
 /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
 #define X86_FEATURE_NO_NESTED_DATA_BP  (20*32+ 0) /* "" No Nested Data Breakpoints */
+#define X86_FEATURE_WRMSR_XX_BASE_NS   (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
 #define X86_FEATURE_LFENCE_RDTSC       (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
 #define X86_FEATURE_NULL_SEL_CLR_BASE  (20*32+ 6) /* "" Null Selector Clears Base */
 #define X86_FEATURE_AUTOIBRS           (20*32+ 8) /* "" Automatic IBRS */
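
For illustration, a guest could detect this enumeration directly; a minimal
kernel-context sketch (the helper name is illustrative; cpuid_eax() and BIT()
per the usual x86 headers):

static bool wrmsr_base_is_non_serializing(void)
{
	/* CPUID leaf 0x80000021 EAX bit 1, matching the define above. */
	return cpuid_eax(0x80000021) & BIT(1);
}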
index fb9f5fa96cc964d39041e2757773a21754dffc2c..db02305eb9e3d4932dfef07248af33709e0f66d5 100644 (file)
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
 
+/*
+ * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n; provide a dummy max if
+ * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS).
+ */
+#ifdef CONFIG_KVM_MAX_NR_VCPUS
+#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
+#else
 #define KVM_MAX_VCPUS 1024
+#endif
 
 /*
  * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
@@ -1275,7 +1283,6 @@ struct kvm_arch {
         */
        spinlock_t mmu_unsync_pages_lock;
 
-       struct list_head assigned_dev_head;
        struct iommu_domain *iommu_domain;
        bool iommu_noncoherent;
 #define __KVM_HAVE_ARCH_NONCOHERENT_DMA
@@ -1323,6 +1330,7 @@ struct kvm_arch {
        int nr_vcpus_matched_tsc;
 
        u32 default_tsc_khz;
+       bool user_set_tsc;
 
        seqcount_raw_spinlock_t pvclock_sc;
        bool use_master_clock;
@@ -1691,7 +1699,7 @@ struct kvm_x86_ops {
 
        void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
 
-       void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+       void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
 
        /*
         * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer.  A zero
index b37abb55e948b7665d67c95e830dec3e11bed41c..389f9594746ef58903c59d31a88c2d1e56bd1e25 100644 (file)
 #define MSR_AMD64_CPUID_FN_1           0xc0011004
 #define MSR_AMD64_LS_CFG               0xc0011020
 #define MSR_AMD64_DC_CFG               0xc0011022
+#define MSR_AMD64_TW_CFG               0xc0011023
 
 #define MSR_AMD64_DE_CFG               0xc0011029
 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT   1
index ed90f148140dfe093bed15a033b6e322a3cb5f2a..950c12868d304004ff56e7bc95f7c5f395766a33 100644 (file)
@@ -154,4 +154,15 @@ config KVM_PROVE_MMU
 config KVM_EXTERNAL_WRITE_TRACKING
        bool
 
+config KVM_MAX_NR_VCPUS
+       int "Maximum number of vCPUs per KVM guest"
+       depends on KVM
+       range 1024 4096
+       default 4096 if MAXSMP
+       default 1024
+       help
+         Set the maximum number of vCPUs per KVM guest. Larger values will increase
+         the memory footprint of each KVM guest, regardless of how many vCPUs are
+         created for a given VM.
+
 endif # VIRTUALIZATION
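
The compile-time limit is visible to userspace via KVM_CHECK_EXTENSION, so a
VMM can size its data structures without knowing the Kconfig value; a
hypothetical standalone probe:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);

	if (kvm < 0)
		return 1;

	/* KVM_CAP_MAX_VCPUS is the hard per-VM limit (KVM_MAX_VCPUS, i.e.
	 * CONFIG_KVM_MAX_NR_VCPUS with this series); KVM_CAP_NR_VCPUS is
	 * the recommended count. */
	printf("recommended: %d, max: %d\n",
	       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS),
	       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS));
	return 0;
}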
index 773132c3bf5af760827f8dd0ab9601de702744a5..6fb3249ae683be3cdb7b5916986482f4c34e3428 100644 (file)
@@ -753,11 +753,13 @@ void kvm_set_cpu_caps(void)
 
        kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
                F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
-               F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
+               F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ |
+               F(WRMSR_XX_BASE_NS)
        );
 
-       if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
-               kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+       kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
+       kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
+       kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
 
        kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
                F(PERFMON_V2)
index 284fa4704553da1345731927225084e7daeeba4b..0b90532b6e261430c7997e933f59f5531312d627 100644 (file)
@@ -174,7 +174,8 @@ static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
 static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
 {
        return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
-               guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+               guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) ||
+               guest_cpuid_has(vcpu, X86_FEATURE_SBPB));
 }
 
 static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
index 7c2dac6824e262e0e82f1ed1d7bd564598006713..238afd7335e46d7fcce8d8ad096136cf35bf70b1 100644 (file)
@@ -727,10 +727,12 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
 
        stimer_cleanup(stimer);
        stimer->count = count;
-       if (stimer->count == 0)
-               stimer->config.enable = 0;
-       else if (stimer->config.auto_enable)
-               stimer->config.enable = 1;
+       if (!host) {
+               if (stimer->count == 0)
+                       stimer->config.enable = 0;
+               else if (stimer->config.auto_enable)
+                       stimer->config.enable = 1;
+       }
 
        if (stimer->config.enable)
                stimer_mark_pending(stimer, false);
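
A hypothetical selftest fragment for the new semantics (assumes a vCPU with
Hyper-V enlightenments already configured; MSR and bit names per
hyperv-tlfs.h):

/* A host-initiated write to the count must no longer auto-enable. */
vcpu_set_msr(vcpu, HV_X64_MSR_STIMER0_CONFIG, HV_STIMER_AUTOENABLE);
vcpu_set_msr(vcpu, HV_X64_MSR_STIMER0_COUNT, 1000);
TEST_ASSERT(!(vcpu_get_msr(vcpu, HV_X64_MSR_STIMER0_CONFIG) &
	      HV_STIMER_ENABLE),
	    "Host write to STIMER0_COUNT must not set the enable bit");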
index b42111a24cc28de2b680b9b495d6f3cb256541ef..dc3d95fdca7d337ef4305123b8f439d2103c8b30 100644 (file)
@@ -324,7 +324,6 @@ void enter_smm(struct kvm_vcpu *vcpu)
 
        cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
        static_call(kvm_x86_set_cr0)(vcpu, cr0);
-       vcpu->arch.cr0 = cr0;
 
        static_call(kvm_x86_set_cr4)(vcpu, 0);
 
index beea99c8e8e05e3bd3427b3ca4c7245f5ef1a27b..eb234cdd370b647d3186e6fac3f8758af057c12c 100644 (file)
@@ -199,7 +199,7 @@ module_param_named(npt, npt_enabled, bool, 0444);
 
 /* allow nested virtualization in KVM/SVM */
 static int nested = true;
-module_param(nested, int, S_IRUGO);
+module_param(nested, int, 0444);
 
 /* enable/disable Next RIP Save */
 int nrips = true;
index 9bba5352582c35615b7c076fb54b4601b8db108e..c721a6785909d60d59a0788190ea1b62485d6ae0 100644 (file)
@@ -82,28 +82,28 @@ bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
 static bool __read_mostly enable_vnmi = 1;
-module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+module_param_named(vnmi, enable_vnmi, bool, 0444);
 
 bool __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
 
 bool __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
+module_param_named(ept, enable_ept, bool, 0444);
 
 bool __read_mostly enable_unrestricted_guest = 1;
 module_param_named(unrestricted_guest,
-                       enable_unrestricted_guest, bool, S_IRUGO);
+                       enable_unrestricted_guest, bool, 0444);
 
 bool __read_mostly enable_ept_ad_bits = 1;
-module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
 
 static bool __read_mostly emulate_invalid_guest_state = true;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+module_param(emulate_invalid_guest_state, bool, 0444);
 
 static bool __read_mostly fasteoi = 1;
-module_param(fasteoi, bool, S_IRUGO);
+module_param(fasteoi, bool, 0444);
 
-module_param(enable_apicv, bool, S_IRUGO);
+module_param(enable_apicv, bool, 0444);
 
 bool __read_mostly enable_ipiv = true;
 module_param(enable_ipiv, bool, 0444);
@@ -114,10 +114,10 @@ module_param(enable_ipiv, bool, 0444);
  * use VMX instructions.
  */
 static bool __read_mostly nested = 1;
-module_param(nested, bool, S_IRUGO);
+module_param(nested, bool, 0444);
 
 bool __read_mostly enable_pml = 1;
-module_param_named(pml, enable_pml, bool, S_IRUGO);
+module_param_named(pml, enable_pml, bool, 0444);
 
 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
index 41cce5031126a02602bcc9163d98eb9e58dd8630..4937aa50d40b069d0edf3730bc32d849b1945111 100644 (file)
@@ -145,21 +145,21 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
 EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
 
 static bool __read_mostly ignore_msrs = 0;
-module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_msrs, bool, 0644);
 
 bool __read_mostly report_ignored_msrs = true;
-module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(report_ignored_msrs, bool, 0644);
 EXPORT_SYMBOL_GPL(report_ignored_msrs);
 
 unsigned int min_timer_period_us = 200;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+module_param(min_timer_period_us, uint, 0644);
 
 static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+module_param(kvmclock_periodic_sync, bool, 0444);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
-module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+module_param(tsc_tolerance_ppm, uint, 0644);
 
 /*
  * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
@@ -168,13 +168,13 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
  * tuning, i.e. allows privileged userspace to set an exact advancement time.
  */
 static int __read_mostly lapic_timer_advance_ns = -1;
-module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
+module_param(lapic_timer_advance_ns, int, 0644);
 
 static bool __read_mostly vector_hashing = true;
-module_param(vector_hashing, bool, S_IRUGO);
+module_param(vector_hashing, bool, 0444);
 
 bool __read_mostly enable_vmware_backdoor = false;
-module_param(enable_vmware_backdoor, bool, S_IRUGO);
+module_param(enable_vmware_backdoor, bool, 0444);
 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
 
 /*
@@ -186,7 +186,7 @@ static int __read_mostly force_emulation_prefix;
 module_param(force_emulation_prefix, int, 0644);
 
 int __read_mostly pi_inject_timer = -1;
-module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+module_param(pi_inject_timer, bint, 0644);
 
 /* Enable/disable PMU virtualization */
 bool __read_mostly enable_pmu = true;
@@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_o
        if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
                return;
 
-       /*
-        * The guest calculates current wall clock time by adding
-        * system time (updated by kvm_guest_time_update below) to the
-        * wall clock specified here.  We do the reverse here.
-        */
-       wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+       wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
-       wc.nsec = do_div(wall_nsec, 1000000000);
+       wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
        wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
        wc.version = version;
 
@@ -2714,8 +2709,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
        kvm_track_tsc_matching(vcpu);
 }
 
-static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
 {
+       u64 data = user_value ? *user_value : 0;
        struct kvm *kvm = vcpu->kvm;
        u64 offset, ns, elapsed;
        unsigned long flags;
@@ -2730,25 +2726,37 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        if (vcpu->arch.virtual_tsc_khz) {
                if (data == 0) {
                        /*
-                        * detection of vcpu initialization -- need to sync
-                        * with other vCPUs. This particularly helps to keep
-                        * kvm_clock stable after CPU hotplug
+                        * Force synchronization when creating a vCPU, or when
+                        * userspace explicitly writes a zero value.
                         */
                        synchronizing = true;
-               } else {
+               } else if (kvm->arch.user_set_tsc) {
                        u64 tsc_exp = kvm->arch.last_tsc_write +
                                                nsec_to_cycles(vcpu, elapsed);
                        u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
                        /*
-                        * Special case: TSC write with a small delta (1 second)
-                        * of virtual cycle time against real time is
-                        * interpreted as an attempt to synchronize the CPU.
+                        * Here lies UAPI baggage: when a user-initiated TSC write has
+                        * a small delta (1 second) of virtual cycle time against the
+                        * previously set vCPU, we assume that they were intended to be
+                        * in sync and the delta was only due to the racy nature of the
+                        * legacy API.
+                        *
+                        * This trick falls down when restoring a guest which genuinely
+                        * has been running for less time than the 1 second of imprecision
+                        * which we allow for in the legacy API. In this case, the first
+                        * value written by userspace (on any vCPU) should not be subject
+                        * to this 'correction' to make it sync up with values that only
+                        * come from the kernel's default vCPU creation. Make the 1-second
+                        * slop hack only trigger if the user_set_tsc flag is already set.
                         */
                        synchronizing = data < tsc_exp + tsc_hz &&
                                        data + tsc_hz > tsc_exp;
                }
        }
 
+       if (user_value)
+               kvm->arch.user_set_tsc = true;
+
        /*
         * For a reliable TSC, we can match TSC offsets, and for an unstable
         * TSC, we add elapsed time in this computation.  We could let the
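
From the VMM side, restoring a snapshot is just a host-initiated KVM_SET_MSRS;
a sketch of the flow this fixes (vcpu_fd and saved_tsc are assumed to come
from the VMM's snapshot machinery; MSR_IA32_TSC is architecturally 0x10):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void restore_tsc(int vcpu_fd, uint64_t saved_tsc)
{
	struct {
		struct kvm_msrs hdr;
		struct kvm_msr_entry entry;
	} msrs = {};

	msrs.hdr.nmsrs = 1;
	msrs.entry.index = 0x00000010;	/* MSR_IA32_TSC */
	msrs.entry.data = saved_tsc;

	/* Host-initiated, so kvm_synchronize_tsc() sees a non-NULL
	 * user_value and sets user_set_tsc; the 1-second slop heuristic
	 * then only kicks in for subsequent user writes. */
	ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);
}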
@@ -3241,6 +3249,82 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        return 0;
 }
 
+/*
+ * The pvclock_wall_clock ABI tells the guest the wall clock time at
+ * which it started (i.e. its epoch, when its kvmclock was zero).
+ *
+ * In fact those clocks are subtly different; wall clock frequency is
+ * adjusted by NTP and has leap seconds, while the kvmclock is a
+ * simple function of the TSC without any such adjustment.
+ *
+ * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
+ * that and kvmclock, but even that would be subject to change over
+ * time.
+ *
+ * Attempt to calculate the epoch at a given moment using the *same*
+ * TSC reading via kvm_get_walltime_and_clockread() to obtain both
+ * wallclock and kvmclock times, and subtracting one from the other.
+ *
+ * Fall back to using their values at slightly different moments by
+ * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
+ */
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+       struct pvclock_vcpu_time_info hv_clock;
+       struct kvm_arch *ka = &kvm->arch;
+       unsigned long seq, local_tsc_khz;
+       struct timespec64 ts;
+       uint64_t host_tsc;
+
+       do {
+               seq = read_seqcount_begin(&ka->pvclock_sc);
+
+               local_tsc_khz = 0;
+               if (!ka->use_master_clock)
+                       break;
+
+               /*
+                * The TSC read and the call to get_cpu_tsc_khz() must happen
+                * on the same CPU.
+                */
+               get_cpu();
+
+               local_tsc_khz = get_cpu_tsc_khz();
+
+               if (local_tsc_khz &&
+                   !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+                       local_tsc_khz = 0; /* Fall back to old method */
+
+               put_cpu();
+
+               /*
+                * These values must be snapshotted within the seqcount loop.
+                * After that, it's just mathematics which can happen on any
+                * CPU at any time.
+                */
+               hv_clock.tsc_timestamp = ka->master_cycle_now;
+               hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+       } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+       /*
+        * If the conditions were right, and obtaining the wallclock+TSC was
+        * successful, calculate the KVM clock at the corresponding time and
+        * subtract one from the other to get the guest's epoch in nanoseconds
+        * since 1970-01-01.
+        */
+       if (local_tsc_khz) {
+               kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
+                                  &hv_clock.tsc_shift,
+                                  &hv_clock.tsc_to_system_mul);
+               return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+                       __pvclock_read_cycles(&hv_clock, host_tsc);
+       }
+#endif
+       return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+}
+
 /*
  * kvmclock updates which are isolated to a given vcpu, such as
  * vcpu->cpu migration, should not allow system_timestamp from
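
For reference, the guest-side consumer of this epoch (per the pvclock ABI)
simply adds the current kvmclock reading, which is why an epoch derived from
two different TSC reads skews the guest's wall clock; a sketch:

static uint64_t guest_wall_clock_ns(const struct pvclock_wall_clock *wc,
				    uint64_t kvmclock_ns)
{
	/* Wall time = epoch written by the host + current kvmclock. */
	return (uint64_t)wc->sec * NSEC_PER_SEC + wc->nsec + kvmclock_ns;
}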
@@ -3290,9 +3374,6 @@ static void kvmclock_sync_fn(struct work_struct *work)
                                           kvmclock_sync_work);
        struct kvm *kvm = container_of(ka, struct kvm, arch);
 
-       if (!kvmclock_periodic_sync)
-               return;
-
        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
                                        KVMCLOCK_SYNC_PERIOD);
@@ -3641,6 +3722,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_AMD64_PATCH_LOADER:
        case MSR_AMD64_BU_CFG2:
        case MSR_AMD64_DC_CFG:
+       case MSR_AMD64_TW_CFG:
        case MSR_F15H_EX_CFG:
                break;
 
@@ -3670,17 +3752,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.perf_capabilities = data;
                kvm_pmu_refresh(vcpu);
                break;
-       case MSR_IA32_PRED_CMD:
-               if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
-                       return 1;
+       case MSR_IA32_PRED_CMD: {
+               u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
+
+               if (!msr_info->host_initiated) {
+                       if (!guest_has_pred_cmd_msr(vcpu))
+                               return 1;
 
-               if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+                       if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+                           !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+                               reserved_bits |= PRED_CMD_IBPB;
+
+                       if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
+                               reserved_bits |= PRED_CMD_SBPB;
+               }
+
+               if (!boot_cpu_has(X86_FEATURE_IBPB))
+                       reserved_bits |= PRED_CMD_IBPB;
+
+               if (!boot_cpu_has(X86_FEATURE_SBPB))
+                       reserved_bits |= PRED_CMD_SBPB;
+
+               if (data & reserved_bits)
                        return 1;
+
                if (!data)
                        break;
 
-               wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+               wrmsrl(MSR_IA32_PRED_CMD, data);
                break;
+       }
        case MSR_IA32_FLUSH_CMD:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
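
Guest-side, IBPB usage is unchanged and SBPB is a cheaper variant that flushes
only branch type predictions; a kernel-context sketch (helper name
illustrative; PRED_CMD_SBPB is BIT(7) and PRED_CMD_IBPB is BIT(0) per
msr-index.h):

static void branch_prediction_barrier(void)
{
	/* Prefer the selective barrier when enumerated. */
	if (boot_cpu_has(X86_FEATURE_SBPB))
		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_SBPB);
	else if (boot_cpu_has(X86_FEATURE_IBPB))
		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
}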
@@ -3700,13 +3801,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
                data &= ~(u64)0x8;      /* ignore TLB cache disable */
 
-               /* Handle McStatusWrEn */
-               if (data == BIT_ULL(18)) {
-                       vcpu->arch.msr_hwcr = data;
-               } else if (data != 0) {
+               /*
+                * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+                * through at least v6.6 whine if TscFreqSel is clear,
+                * depending on F/M/S.)
+                */
+               if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
                        kvm_pr_unimpl_wrmsr(vcpu, msr, data);
                        return 1;
                }
+               vcpu->arch.msr_hwcr = data;
                break;
        case MSR_FAM10H_MMIO_CONF_BASE:
                if (data != 0) {
@@ -3777,7 +3881,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_TSC:
                if (msr_info->host_initiated) {
-                       kvm_synchronize_tsc(vcpu, data);
+                       kvm_synchronize_tsc(vcpu, &data);
                } else {
                        u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
                        adjust_tsc_offset_guest(vcpu, adj);
@@ -4065,6 +4169,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_AMD64_BU_CFG2:
        case MSR_IA32_PERF_CTL:
        case MSR_AMD64_DC_CFG:
+       case MSR_AMD64_TW_CFG:
        case MSR_F15H_EX_CFG:
        /*
         * Intel Sandy Bridge CPUs must support the RAPL (running average power
@@ -5547,6 +5652,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
                tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
                ns = get_kvmclock_base_ns();
 
+               kvm->arch.user_set_tsc = true;
                __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
                raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
@@ -6259,6 +6365,9 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
        struct kvm_vcpu *vcpu;
        unsigned long i;
 
+       if (!kvm_x86_ops.cpu_dirty_log_size)
+               return;
+
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_vcpu_kick(vcpu);
 }
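
This path runs under the dirty-log ioctls; a hypothetical VMM fragment for
context (vm_fd, slot_id, and bitmap are assumed to be set up by the VMM):

struct kvm_dirty_log log = {
	.slot = slot_id,		/* memslot being harvested */
	.dirty_bitmap = bitmap,		/* caller-allocated bitmap */
};

/* Syncing the log only needs to kick vCPUs when they buffer dirty state
 * in hardware (VMX PML); otherwise the kicks were pure overhead. */
ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);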
@@ -11532,7 +11641,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
 
        *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
        static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
-       vcpu->arch.cr0 = sregs->cr0;
 
        *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
        static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
@@ -11576,8 +11684,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        if (ret)
                return ret;
 
-       if (mmu_reset_needed)
+       if (mmu_reset_needed) {
                kvm_mmu_reset_context(vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+       }
 
        max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
@@ -11618,8 +11728,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
                mmu_reset_needed = 1;
                vcpu->arch.pdptrs_from_userspace = true;
        }
-       if (mmu_reset_needed)
+       if (mmu_reset_needed) {
                kvm_mmu_reset_context(vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+       }
        return 0;
 }
 
@@ -11970,7 +12082,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        if (mutex_lock_killable(&vcpu->mutex))
                return;
        vcpu_load(vcpu);
-       kvm_synchronize_tsc(vcpu, 0);
+       kvm_synchronize_tsc(vcpu, NULL);
        vcpu_put(vcpu);
 
        /* poll control enabled by default */
@@ -12326,7 +12438,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
                goto out_uninit_mmu;
 
        INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
-       INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
index 1e7be1f6ab299d78a76e76385db159dee679220b..5184fde1dc541a90ca150f16d71eb2bd9506b4af 100644 (file)
@@ -293,6 +293,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 u64 get_kvmclock_ns(struct kvm *kvm);
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
        gva_t addr, void *val, unsigned int bytes,
index 40edf4d1974c530336e9f9044fd3b18b18ea8de3..b946d9f280306724cf34091a65358fee6217639a 100644 (file)
@@ -59,7 +59,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
                 * This code mirrors kvm_write_wall_clock() except that it writes
                 * directly through the pfn cache and doesn't mark the page dirty.
                 */
-               wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+               wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
                /* It could be invalid again already, so we need to check */
                read_lock_irq(&gpc->lock);
@@ -98,7 +98,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
        wc_version = wc->version = (wc->version + 1) | 1;
        smp_wmb();
 
-       wc->nsec = do_div(wall_nsec,  1000000000);
+       wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
        wc->sec = (u32)wall_nsec;
        *wc_sec_hi = wall_nsec >> 32;
        smp_wmb();
index a3bb36fb3cfc55a423d31c10e86c1fe16b79ca0e..fb01c3f8d3da2a26c515ab8a1c3d0a86b54d0032 100644 (file)
@@ -66,6 +66,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
 TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test
 TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
+TEST_GEN_PROGS_x86_64 += x86_64/hwcr_msr_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_evmcs
diff --git a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c
new file mode 100644 (file)
index 0000000..df351ae
--- /dev/null
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023, Google LLC.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "vmx.h"
+
+void test_hwcr_bit(struct kvm_vcpu *vcpu, unsigned int bit)
+{
+       const uint64_t ignored = BIT_ULL(3) | BIT_ULL(6) | BIT_ULL(8);
+       const uint64_t valid = BIT_ULL(18) | BIT_ULL(24);
+       const uint64_t legal = ignored | valid;
+       uint64_t val = BIT_ULL(bit);
+       uint64_t actual;
+       int r;
+
+       r = _vcpu_set_msr(vcpu, MSR_K7_HWCR, val);
+       TEST_ASSERT(val & ~legal ? !r : r == 1,
+                   "Expected KVM_SET_MSRS(MSR_K7_HWCR) = 0x%lx to %s",
+                   val, val & ~legal ? "fail" : "succeed");
+
+       actual = vcpu_get_msr(vcpu, MSR_K7_HWCR);
+       TEST_ASSERT(actual == (val & valid),
+                   "Bit %u: unexpected HWCR 0x%lx; expected 0x%lx",
+                   bit, actual, (val & valid));
+
+       vcpu_set_msr(vcpu, MSR_K7_HWCR, 0);
+}
+
+int main(int argc, char *argv[])
+{
+       struct kvm_vm *vm;
+       struct kvm_vcpu *vcpu;
+       unsigned int bit;
+
+       vm = vm_create_with_one_vcpu(&vcpu, NULL);
+
+       for (bit = 0; bit < BITS_PER_LONG; bit++)
+               test_hwcr_bit(vcpu, bit);
+
+       kvm_vm_free(vm);
+}