Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 8 Jun 2022 16:16:31 +0000 (09:16 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 8 Jun 2022 16:16:31 +0000 (09:16 -0700)
Pull KVM fixes from Paolo Bonzini:

 - syzkaller NULL pointer dereference

 - TDP MMU performance issue with disabling dirty logging

 - 5.14 regression with SVM TSC scaling

 - indefinite stall on applying live patches

 - unstable selftest

 - memory leak from wrong copy-and-paste

 - missed PV TLB flush when racing with emulation

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86: do not report a vCPU as preempted outside instruction boundaries
  KVM: x86: do not set st->preempted when going back to user space
  KVM: SVM: fix tsc scaling cache logic
  KVM: selftests: Make hyperv_clock selftest more stable
  KVM: x86/MMU: Zap non-leaf SPTEs when disabling dirty logging
  x86: drop bogus "cc" clobber from __try_cmpxchg_user_asm()
  KVM: x86/mmu: Check every prev_roots in __kvm_mmu_free_obsolete_roots()
  entry/kvm: Exit to user mode when TIF_NOTIFY_SIGNAL is set
  KVM: Don't null dereference ops->destroy

15 files changed:
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/uaccess.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/tdp_iter.c
arch/x86/kvm/mmu/tdp_iter.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/xen.h
kernel/entry/kvm.c
tools/testing/selftests/kvm/x86_64/hyperv_clock.c
virt/kvm/kvm_main.c

index 959d66b9be94d0230445faa642fc41e380bf5552..3a240a64ac68f4274f093b99b54f4a8914f9372d 100644 (file)
@@ -653,6 +653,7 @@ struct kvm_vcpu_arch {
        u64 ia32_misc_enable_msr;
        u64 smbase;
        u64 smi_count;
+       bool at_instruction_boundary;
        bool tpr_access_reporting;
        bool xsaves_enabled;
        bool xfd_no_write_intercept;
@@ -1300,6 +1301,8 @@ struct kvm_vcpu_stat {
        u64 nested_run;
        u64 directed_yield_attempted;
        u64 directed_yield_successful;
+       u64 preemption_reported;
+       u64 preemption_other;
        u64 guest_mode;
 };
 
index 35f222aa66bfcf39fa5824350fa93d4ff79ed22d..913e593a3b45fb5de1e16170a23320bb464a0ee6 100644 (file)
@@ -439,7 +439,7 @@ do {                                                                        \
                       [ptr] "+m" (*_ptr),                              \
                       [old] "+a" (__old)                               \
                     : [new] ltype (__new)                              \
-                    : "memory", "cc");                                 \
+                    : "memory");                                       \
        if (unlikely(__err))                                            \
                goto label;                                             \
        if (unlikely(!success))                                         \
index f4653688fa6db1d4bf0246e1f16188bf11612462..e826ee9138fa895dcc48ce5b7d784dd202c0c4bd 100644 (file)
@@ -5179,7 +5179,7 @@ static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
                roots_to_free |= KVM_MMU_ROOT_CURRENT;
 
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
-               if (is_obsolete_root(kvm, mmu->root.hpa))
+               if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
                        roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
        }
 
index 6d3b3e5a5533b174ab30a5cc4c7ef12f3d26fb72..ee4802d7b36cd308c83c6966db67993613c1d5fc 100644 (file)
@@ -145,6 +145,15 @@ static bool try_step_up(struct tdp_iter *iter)
        return true;
 }
 
+/*
+ * Step the iterator back up a level in the paging structure. Should only be
+ * used when the iterator is below the root level.
+ */
+void tdp_iter_step_up(struct tdp_iter *iter)
+{
+       WARN_ON(!try_step_up(iter));
+}
+
 /*
  * Step to the next SPTE in a pre-order traversal of the paging structure.
  * To get to the next SPTE, the iterator either steps down towards the goal
index f0af385c56e035e74ce9e0d01ed2f0d2ce4c5c20..adfca0cf94d3a0cb666a6e319817474f04b32362 100644 (file)
@@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
                    int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
 void tdp_iter_restart(struct tdp_iter *iter);
+void tdp_iter_step_up(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
index 841feaa48be5eb6425bc3c26a0af19210bfe1266..7b9265d6713101f3c1808a7eb84633a4267bbdb5 100644 (file)
@@ -1742,12 +1742,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
        gfn_t start = slot->base_gfn;
        gfn_t end = start + slot->npages;
        struct tdp_iter iter;
+       int max_mapping_level;
        kvm_pfn_t pfn;
 
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
-retry:
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;
 
@@ -1755,15 +1755,41 @@ retry:
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;
 
+               /*
+                * This is a leaf SPTE. Check if the PFN it maps can
+                * be mapped at a higher level.
+                */
                pfn = spte_to_pfn(iter.old_spte);
-               if (kvm_is_reserved_pfn(pfn) ||
-                   iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
-                                                           pfn, PG_LEVEL_NUM))
+
+               if (kvm_is_reserved_pfn(pfn))
                        continue;
 
+               max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
+                               iter.gfn, pfn, PG_LEVEL_NUM);
+
+               WARN_ON(max_mapping_level < iter.level);
+
+               /*
+                * If this page is already mapped at the highest
+                * viable level, there's nothing more to do.
+                */
+               if (max_mapping_level == iter.level)
+                       continue;
+
+               /*
+                * The page can be remapped at a higher level, so step
+                * up to zap the parent SPTE.
+                */
+               while (max_mapping_level > iter.level)
+                       tdp_iter_step_up(&iter);
+
                /* Note, a successful atomic zap also does a remote TLB flush. */
-               if (tdp_mmu_zap_spte_atomic(kvm, &iter))
-                       goto retry;
+               tdp_mmu_zap_spte_atomic(kvm, &iter);
+
+               /*
+                * If the atomic zap fails, the iter will recurse back into
+                * the same subtree to retry.
+                */
        }
 
        rcu_read_unlock();
index bed5e1692cef0209457697cd96dc401139a6167f..3361258640a27e15f3ab04f46e6daedbfb09079a 100644 (file)
@@ -982,7 +982,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
                WARN_ON(!svm->tsc_scaling_enabled);
                vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
-               svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
+               __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
        }
 
        svm->nested.ctl.nested_cr3 = 0;
@@ -1387,7 +1387,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
        vcpu->arch.tsc_scaling_ratio =
                kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
                                               svm->tsc_ratio_msr);
-       svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
+       __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
 }
 
 /* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
index 200045f71df04e522e883e34756a71eda29f5cf5..1dc02cdf69602e19934f9ac08f7cdcc2ce7c25ee 100644 (file)
@@ -465,11 +465,24 @@ static int has_svm(void)
        return 1;
 }
 
+void __svm_write_tsc_multiplier(u64 multiplier)
+{
+       preempt_disable();
+
+       if (multiplier == __this_cpu_read(current_tsc_ratio))
+               goto out;
+
+       wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
+       __this_cpu_write(current_tsc_ratio, multiplier);
+out:
+       preempt_enable();
+}
+
 static void svm_hardware_disable(void)
 {
        /* Make sure we clean up behind us */
        if (tsc_scaling)
-               wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
+               __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 
        cpu_svm_disable();
 
@@ -515,8 +528,7 @@ static int svm_hardware_enable(void)
                 * Set the default value, even if we don't use TSC scaling
                 * to avoid having stale value in the msr
                 */
-               wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
-               __this_cpu_write(current_tsc_ratio, SVM_TSC_RATIO_DEFAULT);
+               __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
        }
 
 
@@ -999,11 +1011,12 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
-void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
 {
-       wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
+       __svm_write_tsc_multiplier(multiplier);
 }
 
+
 /* Evaluate instruction intercepts that depend on guest CPUID features. */
 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
                                              struct vcpu_svm *svm)
@@ -1363,13 +1376,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
                sev_es_prepare_switch_to_guest(hostsa);
        }
 
-       if (tsc_scaling) {
-               u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
-               if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
-                       __this_cpu_write(current_tsc_ratio, tsc_ratio);
-                       wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
-               }
-       }
+       if (tsc_scaling)
+               __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
 
        if (likely(tsc_aux_uret_slot >= 0))
                kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
@@ -4255,6 +4263,8 @@ out:
 
 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
+       if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
+               vcpu->arch.at_instruction_boundary = true;
 }
 
 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
index 21c5460e947aaf5b54e32cdfd1a602336bce47cf..500348c1cb350871b0d996d3154a1a509662ad70 100644 (file)
@@ -590,7 +590,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
                               bool has_error_code, u32 error_code);
 int nested_svm_exit_special(struct vcpu_svm *svm);
 void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
-void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier);
+void __svm_write_tsc_multiplier(u64 multiplier);
 void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
                                       struct vmcb_control_area *control);
 void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
index a07e8cd753ec55660f9142f283a7be81926b2d7d..9bd86ecccdab56d455fdcc30e7fab6039b127eee 100644 (file)
@@ -6547,6 +6547,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
                return;
 
        handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+       vcpu->arch.at_instruction_boundary = true;
 }
 
 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
index e9473c7c73903a4fba783cb2c98bf04bdd63c1d7..03fbfbbec460fab308dc3a490bf20c22d119ba39 100644 (file)
@@ -296,6 +296,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, nested_run),
        STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
        STATS_DESC_COUNTER(VCPU, directed_yield_successful),
+       STATS_DESC_COUNTER(VCPU, preemption_reported),
+       STATS_DESC_COUNTER(VCPU, preemption_other),
        STATS_DESC_ICOUNTER(VCPU, guest_mode)
 };
 
@@ -4625,6 +4627,19 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        struct kvm_memslots *slots;
        static const u8 preempted = KVM_VCPU_PREEMPTED;
 
+       /*
+        * The vCPU can be marked preempted if and only if the VM-Exit was on
+        * an instruction boundary and will not trigger guest emulation of any
+        * kind (see vcpu_run).  Vendor specific code controls (conservatively)
+        * when this is true, for example allowing the vCPU to be marked
+        * preempted if and only if the VM-Exit was due to a host interrupt.
+        */
+       if (!vcpu->arch.at_instruction_boundary) {
+               vcpu->stat.preemption_other++;
+               return;
+       }
+
+       vcpu->stat.preemption_reported++;
        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                return;
 
@@ -4654,19 +4669,21 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
        int idx;
 
-       if (vcpu->preempted && !vcpu->arch.guest_state_protected)
-               vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+       if (vcpu->preempted) {
+               if (!vcpu->arch.guest_state_protected)
+                       vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
 
-       /*
-        * Take the srcu lock as memslots will be accessed to check the gfn
-        * cache generation against the memslots generation.
-        */
-       idx = srcu_read_lock(&vcpu->kvm->srcu);
-       if (kvm_xen_msr_enabled(vcpu->kvm))
-               kvm_xen_runstate_set_preempted(vcpu);
-       else
-               kvm_steal_time_set_preempted(vcpu);
-       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+               /*
+                * Take the srcu lock as memslots will be accessed to check the gfn
+                * cache generation against the memslots generation.
+                */
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
+               if (kvm_xen_msr_enabled(vcpu->kvm))
+                       kvm_xen_runstate_set_preempted(vcpu);
+               else
+                       kvm_steal_time_set_preempted(vcpu);
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
+       }
 
        static_call(kvm_x86_vcpu_put)(vcpu);
        vcpu->arch.last_host_tsc = rdtsc();
@@ -10422,6 +10439,13 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
        vcpu->arch.l1tf_flush_l1d = true;
 
        for (;;) {
+               /*
+                * If another guest vCPU requests a PV TLB flush in the middle
+                * of instruction emulation, the rest of the emulation could
+                * use a stale page translation. Assume that any code after
+                * this point can start executing an instruction.
+                */
+               vcpu->arch.at_instruction_boundary = false;
                if (kvm_vcpu_running(vcpu)) {
                        r = vcpu_enter_guest(vcpu);
                } else {
index ee5c4ae0755cddeec1db69e951bb14f28fc42c65..532a535a9e99f43c4cfadc1fbbb1192630225bf8 100644 (file)
@@ -159,8 +159,10 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
         * behalf of the vCPU. Only if the VMM does actually block
         * does it need to enter RUNSTATE_blocked.
         */
-       if (vcpu->preempted)
-               kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+       if (WARN_ON_ONCE(!vcpu->preempted))
+               return;
+
+       kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
 }
 
 /* 32-bit compatibility definitions, also used natively in 32-bit build */
index 9d09f489b60e0c9872466a0ae6bf0a9fc3c7e23a..2e0f75bcb7fd1c21084df8ff885aeec001a359ea 100644 (file)
@@ -9,12 +9,6 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
                int ret;
 
                if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
-                       clear_notify_signal();
-                       if (task_work_pending(current))
-                               task_work_run();
-               }
-
-               if (ti_work & _TIF_SIGPENDING) {
                        kvm_handle_signal_exit(vcpu);
                        return -EINTR;
                }
index e0b2bb1339b162cc1b6bd20deef8d158c44c109c..3330fb183c6800e346a4278555bc764f40292939 100644 (file)
@@ -44,7 +44,7 @@ static inline void nop_loop(void)
 {
        int i;
 
-       for (i = 0; i < 1000000; i++)
+       for (i = 0; i < 100000000; i++)
                asm volatile("nop");
 }
 
@@ -56,12 +56,14 @@ static inline void check_tsc_msr_rdtsc(void)
        tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY);
        GUEST_ASSERT(tsc_freq > 0);
 
-       /* First, check MSR-based clocksource */
+       /* For increased accuracy, take mean rdtsc() before and afrer rdmsr() */
        r1 = rdtsc();
        t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+       r1 = (r1 + rdtsc()) / 2;
        nop_loop();
        r2 = rdtsc();
        t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+       r2 = (r2 + rdtsc()) / 2;
 
        GUEST_ASSERT(r2 > r1 && t2 > t1);
 
@@ -181,12 +183,14 @@ static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm)
        tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY);
        TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero");
 
-       /* First, check MSR-based clocksource */
+       /* For increased accuracy, take mean rdtsc() before and afrer ioctl */
        r1 = rdtsc();
        t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT);
+       r1 = (r1 + rdtsc()) / 2;
        nop_loop();
        r2 = rdtsc();
        t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT);
+       r2 = (r2 + rdtsc()) / 2;
 
        TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2);
 
index 64ec2222a1968eb237e45270dd0494632862a637..44c47670447aa4cc250f255ae9e5ec31ad1df377 100644 (file)
@@ -4300,8 +4300,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
                kvm_put_kvm_no_destroy(kvm);
                mutex_lock(&kvm->lock);
                list_del(&dev->vm_node);
+               if (ops->release)
+                       ops->release(dev);
                mutex_unlock(&kvm->lock);
-               ops->destroy(dev);
+               if (ops->destroy)
+                       ops->destroy(dev);
                return ret;
        }