Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Feb 2021 18:00:12 +0000 (10:00 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Feb 2021 18:00:12 +0000 (10:00 -0800)
Pull more KVM updates from Paolo Bonzini:
 "x86:

   - take into account HVA before retrying on MMU notifier race

   - fixes for nested AMD guests without NPT

   - allow INVPCID in guest without PCID

   - disable PML in hardware when not in use

   - MMU code cleanups"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
  KVM: SVM: Fix nested VM-Exit on #GP interception handling
  KVM: vmx/pmu: Fix dummy check if lbr_desc->event is created
  KVM: x86/mmu: Consider the hva in mmu_notifier retry
  KVM: x86/mmu: Skip mmu_notifier check when handling MMIO page fault
  KVM: Documentation: rectify rst markup in KVM_GET_SUPPORTED_HV_CPUID
  KVM: nSVM: prepare guest save area while is_guest_mode is true
  KVM: x86/mmu: Remove a variety of unnecessary exports
  KVM: x86: Fold "write-protect large" use case into generic write-protect
  KVM: x86/mmu: Don't set dirty bits when disabling dirty logging w/ PML
  KVM: VMX: Dynamically enable/disable PML based on memslot dirty logging
  KVM: x86: Further clarify the logic and comments for toggling log dirty
  KVM: x86: Move MMU's PML logic to common code
  KVM: x86/mmu: Make dirty log size hook (PML) a value, not a function
  KVM: x86/mmu: Expand on the comment in kvm_vcpu_ad_need_write_protect()
  KVM: nVMX: Disable PML in hardware when running L2
  KVM: x86/mmu: Consult max mapping level when zapping collapsible SPTEs
  KVM: x86/mmu: Pass the memslot to the rmap callbacks
  KVM: x86/mmu: Split out max mapping level calculation to helper
  KVM: x86/mmu: Expand collapsible SPTE zap for TDP MMU to ZONE_DEVICE and HugeTLB pages
  KVM: nVMX: no need to undo inject_page_fault change on nested vmexit
  ...

24 files changed:
Documentation/virt/kvm/api.rst
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_mmu_radix.c
arch/x86/include/asm/kvm-x86-ops.h
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/cpuid.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/pmu_intel.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c
include/linux/kvm_host.h
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/hardware_disable_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/lib/x86_64/processor.c
virt/kvm/kvm_main.c

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 45fd862ac128a2ccfdfc9006c44ec6fb1998dea6..aed52b0fc16ec3ac48ba407ebfc67c4213f6c2fb 100644 (file)
@@ -4519,6 +4519,7 @@ KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature
 leaves (0x40000000, 0x40000001).
 
 Currently, the following list of CPUID leaves are returned:
+
  - HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS
  - HYPERV_CPUID_INTERFACE
  - HYPERV_CPUID_VERSION
@@ -4543,6 +4544,7 @@ userspace should not expect to get any particular value there.
 Note, vcpu version of KVM_GET_SUPPORTED_HV_CPUID is currently deprecated. Unlike
 system ioctl which exposes all supported feature bits unconditionally, vcpu
 version has the following quirks:
+
 - HYPERV_CPUID_NESTED_FEATURES leaf and HV_X64_ENLIGHTENED_VMCS_RECOMMENDED
   feature bit are only exposed when Enlightened VMCS was previously enabled
   on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index c77f2d4f44ca6d68abdd0c546f6b1dc89293b988..bb6773594cf8280fc275a6269d58f4da0070e689 100644 (file)
@@ -591,7 +591,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
        } else {
                /* Call KVM generic code to do the slow-path check */
                pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-                                          writing, &write_ok);
+                                          writing, &write_ok, NULL);
                if (is_error_noslot_pfn(pfn))
                        return -EFAULT;
                page = NULL;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index bb35490400e994f886acf45de5b3432012f4bd86..e603de7ade52ef301d1e04c3574ae8aef6515f90 100644 (file)
@@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 
                /* Call KVM generic code to do the slow-path check */
                pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-                                          writing, upgrade_p);
+                                          writing, upgrade_p, NULL);
                if (is_error_noslot_pfn(pfn))
                        return -EFAULT;
                page = NULL;
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 355a2ab8fc090ed6da86d3dd3ae32d4d8242bbb0..323641097f63aa697bcc6aefb621252cce61849c 100644 (file)
@@ -93,11 +93,7 @@ KVM_X86_OP(check_intercept)
 KVM_X86_OP(handle_exit_irqoff)
 KVM_X86_OP_NULL(request_immediate_exit)
 KVM_X86_OP(sched_in)
-KVM_X86_OP_NULL(slot_enable_log_dirty)
-KVM_X86_OP_NULL(slot_disable_log_dirty)
-KVM_X86_OP_NULL(flush_log_dirty)
-KVM_X86_OP_NULL(enable_log_dirty_pt_masked)
-KVM_X86_OP_NULL(cpu_dirty_log_size)
+KVM_X86_OP_NULL(update_cpu_dirty_logging)
 KVM_X86_OP_NULL(pre_block)
 KVM_X86_OP_NULL(post_block)
 KVM_X86_OP_NULL(vcpu_blocking)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 84499aad01a468820a961568ee58682d6726938b..0cf71ff2b2e5f6763c15742af685a83f82c1ebab 100644 (file)
@@ -89,6 +89,8 @@
        KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_APF_READY              KVM_ARCH_REQ(28)
 #define KVM_REQ_MSR_FILTER_CHANGED     KVM_ARCH_REQ(29)
+#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
+       KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 
 #define CR0_RESERVED_BITS                                               \
        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -1007,6 +1009,7 @@ struct kvm_arch {
        u32 bsp_vcpu_id;
 
        u64 disabled_quirks;
+       int cpu_dirty_logging_count;
 
        enum kvm_irqchip_mode irqchip_mode;
        u8 nr_reserved_ioapic_pins;
@@ -1271,30 +1274,11 @@ struct kvm_x86_ops {
        void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
 
        /*
-        * Arch-specific dirty logging hooks. These hooks are only supposed to
-        * be valid if the specific arch has hardware-accelerated dirty logging
-        * mechanism. Currently only for PML on VMX.
-        *
-        *  - slot_enable_log_dirty:
-        *      called when enabling log dirty mode for the slot.
-        *  - slot_disable_log_dirty:
-        *      called when disabling log dirty mode for the slot.
-        *      also called when slot is created with log dirty disabled.
-        *  - flush_log_dirty:
-        *      called before reporting dirty_bitmap to userspace.
-        *  - enable_log_dirty_pt_masked:
-        *      called when reenabling log dirty for the GFNs in the mask after
-        *      corresponding bits are cleared in slot->dirty_bitmap.
+        * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer.  A zero
+        * value indicates CPU dirty logging is unsupported or disabled.
         */
-       void (*slot_enable_log_dirty)(struct kvm *kvm,
-                                     struct kvm_memory_slot *slot);
-       void (*slot_disable_log_dirty)(struct kvm *kvm,
-                                      struct kvm_memory_slot *slot);
-       void (*flush_log_dirty)(struct kvm *kvm);
-       void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
-                                          struct kvm_memory_slot *slot,
-                                          gfn_t offset, unsigned long mask);
-       int (*cpu_dirty_log_size)(void);
+       int cpu_dirty_log_size;
+       void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
 
        /* pmu operations of sub-arch */
        const struct kvm_pmu_ops *pmu_ops;
@@ -1437,11 +1421,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
                                        struct kvm_memory_slot *memslot);
-void kvm_mmu_slot_set_dirty(struct kvm *kvm,
-                           struct kvm_memory_slot *memslot);
-void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
-                                  struct kvm_memory_slot *slot,
-                                  gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
@@ -1613,7 +1592,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 void kvm_update_dr7(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c8f2592ccc999780c4c97e7a0fa8c391d7e2466e..6bd2f8b830e49ff115ea9058bb6a2c51ebcdfaf5 100644 (file)
@@ -408,7 +408,7 @@ void kvm_set_cpu_caps(void)
 
        kvm_cpu_cap_mask(CPUID_7_0_EBX,
                F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
-               F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
+               F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
                F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
                F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
                F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e507568cd55d69d9ab50b6ed3f193f26f8e01ce5..d75524bc84234ecea48b0b6606238fead77c6e04 100644 (file)
@@ -1165,7 +1165,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
  *     - W bit on ad-disabled SPTEs.
  * Returns true iff any D or W bits were cleared.
  */
-static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                              struct kvm_memory_slot *slot)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1180,35 +1181,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
        return flush;
 }
 
-static bool spte_set_dirty(u64 *sptep)
-{
-       u64 spte = *sptep;
-
-       rmap_printk("spte %p %llx\n", sptep, *sptep);
-
-       /*
-        * Similar to the !kvm_x86_ops.slot_disable_log_dirty case,
-        * do not bother adding back write access to pages marked
-        * SPTE_AD_WRPROT_ONLY_MASK.
-        */
-       spte |= shadow_dirty_mask;
-
-       return mmu_spte_update(sptep, spte);
-}
-
-static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
-{
-       u64 *sptep;
-       struct rmap_iterator iter;
-       bool flush = false;
-
-       for_each_rmap_spte(rmap_head, &iter, sptep)
-               if (spte_ad_enabled(*sptep))
-                       flush |= spte_set_dirty(sptep);
-
-       return flush;
-}
-
 /**
  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
  * @kvm: kvm instance
@@ -1248,9 +1220,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
  *
  * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
  */
-void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
-                                    struct kvm_memory_slot *slot,
-                                    gfn_t gfn_offset, unsigned long mask)
+static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+                                        struct kvm_memory_slot *slot,
+                                        gfn_t gfn_offset, unsigned long mask)
 {
        struct kvm_rmap_head *rmap_head;
 
@@ -1260,13 +1232,12 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
        while (mask) {
                rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
                                          PG_LEVEL_4K, slot);
-               __rmap_clear_dirty(kvm, rmap_head);
+               __rmap_clear_dirty(kvm, rmap_head, slot);
 
                /* clear the first set bit */
                mask &= mask - 1;
        }
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
 
 /**
  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
@@ -1282,20 +1253,15 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                                struct kvm_memory_slot *slot,
                                gfn_t gfn_offset, unsigned long mask)
 {
-       if (kvm_x86_ops.enable_log_dirty_pt_masked)
-               static_call(kvm_x86_enable_log_dirty_pt_masked)(kvm, slot,
-                                                               gfn_offset,
-                                                               mask);
+       if (kvm_x86_ops.cpu_dirty_log_size)
+               kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
        else
                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
 int kvm_cpu_dirty_log_size(void)
 {
-       if (kvm_x86_ops.cpu_dirty_log_size)
-               return static_call(kvm_x86_cpu_dirty_log_size)();
-
-       return 0;
+       return kvm_x86_ops.cpu_dirty_log_size;
 }
 
 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
@@ -1325,7 +1291,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
        return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
 }
 
-static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                         struct kvm_memory_slot *slot)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1345,7 +1312,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                           struct kvm_memory_slot *slot, gfn_t gfn, int level,
                           unsigned long data)
 {
-       return kvm_zap_rmapp(kvm, rmap_head);
+       return kvm_zap_rmapp(kvm, rmap_head, slot);
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -2499,7 +2466,21 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 
        return r;
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
+
+static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       gpa_t gpa;
+       int r;
+
+       if (vcpu->arch.mmu->direct_map)
+               return 0;
+
+       gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
+
+       r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+       return r;
+}
 
 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
@@ -2753,11 +2734,18 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
        if (sp->role.level > PG_LEVEL_4K)
                return;
 
+       /*
+        * If addresses are being invalidated, skip prefetching to avoid
+        * accidentally prefetching those addresses.
+        */
+       if (unlikely(vcpu->kvm->mmu_notifier_count))
+               return;
+
        __direct_pte_prefetch(vcpu, sp, sptep);
 }
 
-static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
-                                 kvm_pfn_t pfn, struct kvm_memory_slot *slot)
+static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+                                 struct kvm_memory_slot *slot)
 {
        unsigned long hva;
        pte_t *pte;
@@ -2776,19 +2764,36 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
         */
        hva = __gfn_to_hva_memslot(slot, gfn);
 
-       pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
+       pte = lookup_address_in_mm(kvm->mm, hva, &level);
        if (unlikely(!pte))
                return PG_LEVEL_4K;
 
        return level;
 }
 
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
+                             gfn_t gfn, kvm_pfn_t pfn, int max_level)
+{
+       struct kvm_lpage_info *linfo;
+
+       max_level = min(max_level, max_huge_page_level);
+       for ( ; max_level > PG_LEVEL_4K; max_level--) {
+               linfo = lpage_info_slot(gfn, slot, max_level);
+               if (!linfo->disallow_lpage)
+                       break;
+       }
+
+       if (max_level == PG_LEVEL_4K)
+               return PG_LEVEL_4K;
+
+       return host_pfn_mapping_level(kvm, gfn, pfn, slot);
+}
+
 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
                            int max_level, kvm_pfn_t *pfnp,
                            bool huge_page_disallowed, int *req_level)
 {
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        kvm_pfn_t pfn = *pfnp;
        kvm_pfn_t mask;
        int level;
@@ -2805,17 +2810,7 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
        if (!slot)
                return PG_LEVEL_4K;
 
-       max_level = min(max_level, max_huge_page_level);
-       for ( ; max_level > PG_LEVEL_4K; max_level--) {
-               linfo = lpage_info_slot(gfn, slot, max_level);
-               if (!linfo->disallow_lpage)
-                       break;
-       }
-
-       if (max_level == PG_LEVEL_4K)
-               return PG_LEVEL_4K;
-
-       level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
+       level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
        if (level == PG_LEVEL_4K)
                return level;
 
@@ -3437,7 +3432,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
        write_unlock(&vcpu->kvm->mmu_lock);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
                                  u32 access, struct x86_exception *exception)
@@ -3653,8 +3647,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-                        gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
-                        bool *writable)
+                        gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
+                        bool write, bool *writable)
 {
        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        bool async;
@@ -3667,7 +3661,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
        }
 
        async = false;
-       *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
+       *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
+                                   write, writable, hva);
        if (!async)
                return false; /* *pfn has correct page already */
 
@@ -3681,7 +3676,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                        return true;
        }
 
-       *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
+       *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
+                                   write, writable, hva);
        return false;
 }
 
@@ -3694,6 +3690,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        gfn_t gfn = gpa >> PAGE_SHIFT;
        unsigned long mmu_seq;
        kvm_pfn_t pfn;
+       hva_t hva;
        int r;
 
        if (page_fault_handle_page_track(vcpu, error_code, gfn))
@@ -3712,7 +3709,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
 
-       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
+                        write, &map_writable))
                return RET_PF_RETRY;
 
        if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
@@ -3725,7 +3723,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        else
                write_lock(&vcpu->kvm->mmu_lock);
 
-       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+       if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
                goto out_unlock;
        r = make_mmu_pages_available(vcpu);
        if (r)
@@ -5003,22 +5001,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        write_unlock(&vcpu->kvm->mmu_lock);
 }
 
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
-       gpa_t gpa;
-       int r;
-
-       if (vcpu->arch.mmu->direct_map)
-               return 0;
-
-       gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
-       r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
-       return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
-
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
                       void *insn, int insn_len)
 {
@@ -5117,7 +5099,6 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                mmu->invlpg(vcpu, gva, root_hpa);
        }
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva);
 
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
@@ -5157,7 +5138,6 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
         * for them.
         */
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
 
 void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
                       int tdp_huge_page_level)
@@ -5182,7 +5162,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                                   struct kvm_memory_slot *slot);
 
 /* The caller should hold mmu-lock before calling this function. */
 static __always_inline bool
@@ -5196,7 +5177,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
        for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
                        end_gfn, &iterator) {
                if (iterator.rmap)
-                       flush |= fn(kvm, iterator.rmap);
+                       flush |= fn(kvm, iterator.rmap, memslot);
 
                if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
                        if (flush && lock_flush_tlb) {
@@ -5229,22 +5210,6 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        lock_flush_tlb);
 }
 
-static __always_inline bool
-slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                     slot_level_handler fn, bool lock_flush_tlb)
-{
-       return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
-                                KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
-}
-
-static __always_inline bool
-slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       slot_level_handler fn, bool lock_flush_tlb)
-{
-       return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1,
-                                KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
-}
-
 static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
                 slot_level_handler fn, bool lock_flush_tlb)
@@ -5485,7 +5450,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
-                                   struct kvm_rmap_head *rmap_head)
+                                   struct kvm_rmap_head *rmap_head,
+                                   struct kvm_memory_slot *slot)
 {
        return __rmap_write_protect(kvm, rmap_head, false);
 }
@@ -5519,7 +5485,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 }
 
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
-                                        struct kvm_rmap_head *rmap_head)
+                                        struct kvm_rmap_head *rmap_head,
+                                        struct kvm_memory_slot *slot)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -5540,8 +5507,8 @@ restart:
                 * mapping if the indirect sp has level = 1.
                 */
                if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
-                   (kvm_is_zone_device_pfn(pfn) ||
-                    PageCompound(pfn_to_page(pfn)))) {
+                   sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
+                                                              pfn, PG_LEVEL_NUM)) {
                        pte_list_remove(rmap_head, sptep);
 
                        if (kvm_available_flush_tlb_with_range())
@@ -5561,12 +5528,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot)
 {
        /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
+       struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
+
        write_lock(&kvm->mmu_lock);
-       slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
-                        kvm_mmu_zap_collapsible_spte, true);
+       slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
 
        if (is_tdp_mmu_enabled(kvm))
-               kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
+               kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
        write_unlock(&kvm->mmu_lock);
 }
 
@@ -5605,40 +5573,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
        if (flush)
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
-
-void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
-{
-       bool flush;
-
-       write_lock(&kvm->mmu_lock);
-       flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
-                                       false);
-       if (is_tdp_mmu_enabled(kvm))
-               flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
-       write_unlock(&kvm->mmu_lock);
-
-       if (flush)
-               kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
-
-void kvm_mmu_slot_set_dirty(struct kvm *kvm,
-                           struct kvm_memory_slot *memslot)
-{
-       bool flush;
-
-       write_lock(&kvm->mmu_lock);
-       flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
-       if (is_tdp_mmu_enabled(kvm))
-               flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
-       write_unlock(&kvm->mmu_lock);
-
-       if (flush)
-               kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
 void kvm_mmu_zap_all(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 9e38d3c5daad8c0a11f06322e1bf4248d1865d38..72b0928f2b2d96c54cb2d75bbff84494c150b627 100644 (file)
@@ -84,7 +84,10 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
         * When using the EPT page-modification log, the GPAs in the log
         * would come from L2 rather than L1.  Therefore, we need to rely
         * on write protection to record dirty pages.  This also bypasses
-        * PML, since writes now result in a vmexit.
+        * PML, since writes now result in a vmexit.  Note, this helper will
+        * tag SPTEs as needing write-protection even if PML is disabled or
+        * unsupported, but that's ok because the tag is consumed if and only
+        * if PML is enabled.  Omit the PML check to save a few uops.
         */
        return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
 }
@@ -138,6 +141,8 @@ enum {
 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
 #define SET_SPTE_SPURIOUS              BIT(2)
 
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
+                             gfn_t gfn, kvm_pfn_t pfn, int max_level);
 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
                            int max_level, kvm_pfn_t *pfnp,
                            bool huge_page_disallowed, int *req_level);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index d9f66cc459e84993b08752529d45c514f0435e8d..55d7b473ac447de3189071b89b89f823605a69b0 100644 (file)
@@ -601,6 +601,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
        if (sp->role.level > PG_LEVEL_4K)
                return;
 
+       /*
+        * If addresses are being invalidated, skip prefetching to avoid
+        * accidentally prefetching those addresses.
+        */
+       if (unlikely(vcpu->kvm->mmu_notifier_count))
+               return;
+
        if (sp->role.direct)
                return __direct_pte_prefetch(vcpu, sp, sptep);
 
@@ -790,6 +797,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
        struct guest_walker walker;
        int r;
        kvm_pfn_t pfn;
+       hva_t hva;
        unsigned long mmu_seq;
        bool map_writable, is_self_change_mapping;
        int max_level;
@@ -840,8 +848,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
 
-       if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
-                        &map_writable))
+       if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+                        write_fault, &map_writable))
                return RET_PF_RETRY;
 
        if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
@@ -869,7 +877,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 
        r = RET_PF_RETRY;
        write_lock(&vcpu->kvm->mmu_lock);
-       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+       if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
                goto out_unlock;
 
        kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 71e100a5670f91bc37126438929cf9fd36a15071..c926c6b899a106c704a68b25997209aff05ff3b1 100644 (file)
@@ -1268,68 +1268,16 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
        }
 }
 
-/*
- * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
- * only used for PML, and so will involve setting the dirty bit on each SPTE.
- * Returns true if an SPTE has been changed and the TLBs need to be flushed.
- */
-static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                               gfn_t start, gfn_t end)
-{
-       struct tdp_iter iter;
-       u64 new_spte;
-       bool spte_set = false;
-
-       rcu_read_lock();
-
-       tdp_root_for_each_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
-                       continue;
-
-               if (!is_shadow_present_pte(iter.old_spte) ||
-                   iter.old_spte & shadow_dirty_mask)
-                       continue;
-
-               new_spte = iter.old_spte | shadow_dirty_mask;
-
-               tdp_mmu_set_spte(kvm, &iter, new_spte);
-               spte_set = true;
-       }
-
-       rcu_read_unlock();
-       return spte_set;
-}
-
-/*
- * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
- * only used for PML, and so will involve setting the dirty bit on each SPTE.
- * Returns true if an SPTE has been changed and the TLBs need to be flushed.
- */
-bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-       struct kvm_mmu_page *root;
-       int root_as_id;
-       bool spte_set = false;
-
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
-
-               spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
-                               slot->base_gfn + slot->npages);
-       }
-       return spte_set;
-}
-
 /*
  * Clear leaf entries which could be replaced by large mappings, for
  * GFNs within the slot.
  */
 static void zap_collapsible_spte_range(struct kvm *kvm,
                                       struct kvm_mmu_page *root,
-                                      gfn_t start, gfn_t end)
+                                      struct kvm_memory_slot *slot)
 {
+       gfn_t start = slot->base_gfn;
+       gfn_t end = start + slot->npages;
        struct tdp_iter iter;
        kvm_pfn_t pfn;
        bool spte_set = false;
@@ -1348,7 +1296,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
 
                pfn = spte_to_pfn(iter.old_spte);
                if (kvm_is_reserved_pfn(pfn) ||
-                   !PageTransCompoundMap(pfn_to_page(pfn)))
+                   iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
+                                                           pfn, PG_LEVEL_NUM))
                        continue;
 
                tdp_mmu_set_spte(kvm, &iter, 0);
@@ -1366,7 +1315,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
  * be replaced by large mappings, for GFNs within the slot.
  */
 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                      const struct kvm_memory_slot *slot)
+                                      struct kvm_memory_slot *slot)
 {
        struct kvm_mmu_page *root;
        int root_as_id;
@@ -1376,8 +1325,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
                if (root_as_id != slot->as_id)
                        continue;
 
-               zap_collapsible_spte_range(kvm, root, slot->base_gfn,
-                                          slot->base_gfn + slot->npages);
+               zap_collapsible_spte_range(kvm, root, slot);
        }
 }
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index b4b65e3699b36dcfe1d6f6ea2990f83a33c93df1..3b761c111bff13bf9d09531ebeaf1b21962ed42a 100644 (file)
@@ -33,9 +33,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
                                       bool wrprot);
-bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                      const struct kvm_memory_slot *slot);
+                                      struct kvm_memory_slot *slot);
 
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index cc91738ab445cf8fd7d759458ba33d21eba03727..35891d9a1099b3f54d146e0abb26b89c74075b2e 100644 (file)
@@ -51,6 +51,23 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
        nested_svm_vmexit(svm);
 }
 
+static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       WARN_ON(!is_guest_mode(vcpu));
+
+       if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
+          !svm->nested.nested_run_pending) {
+               svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
+               svm->vmcb->control.exit_code_hi = 0;
+               svm->vmcb->control.exit_info_1 = fault->error_code;
+               svm->vmcb->control.exit_info_2 = fault->address;
+               nested_svm_vmexit(svm);
+       } else {
+               kvm_inject_page_fault(vcpu, fault);
+       }
+}
+
 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -436,16 +453,33 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
 {
        int ret;
 
+       trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
+                              vmcb12->save.rip,
+                              vmcb12->control.int_ctl,
+                              vmcb12->control.event_inj,
+                              vmcb12->control.nested_ctl);
+
+       trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
+                                   vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
+                                   vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
+                                   vmcb12->control.intercepts[INTERCEPT_WORD3],
+                                   vmcb12->control.intercepts[INTERCEPT_WORD4],
+                                   vmcb12->control.intercepts[INTERCEPT_WORD5]);
+
+
        svm->nested.vmcb12_gpa = vmcb12_gpa;
        load_nested_vmcb_control(svm, &vmcb12->control);
-       nested_prepare_vmcb_save(svm, vmcb12);
        nested_prepare_vmcb_control(svm);
+       nested_prepare_vmcb_save(svm, vmcb12);
 
        ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
                                  nested_npt_enabled(svm));
        if (ret)
                return ret;
 
+       if (!npt_enabled)
+               svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+
        svm_set_gif(svm, true);
 
        return 0;
@@ -489,18 +523,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
                goto out;
        }
 
-       trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
-                              vmcb12->save.rip,
-                              vmcb12->control.int_ctl,
-                              vmcb12->control.event_inj,
-                              vmcb12->control.nested_ctl);
-
-       trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
-                                   vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
-                                   vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
-                                   vmcb12->control.intercepts[INTERCEPT_WORD3],
-                                   vmcb12->control.intercepts[INTERCEPT_WORD4],
-                                   vmcb12->control.intercepts[INTERCEPT_WORD5]);
 
        /* Clear internal status */
        kvm_clear_exception_queue(&svm->vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index adb3619a3c1696e9ece09971f5c4b77b34ebfe11..c636021b066b0b1cf4b9fc36e95c6ce9d0c80b4f 100644 (file)
@@ -926,9 +926,6 @@ static __init void svm_set_cpu_caps(void)
        if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
            boot_cpu_has(X86_FEATURE_AMD_SSBD))
                kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
-
-       /* Enable INVPCID feature */
-       kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
 }
 
 static __init int svm_hardware_setup(void)
@@ -1103,12 +1100,12 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 static void svm_check_invpcid(struct vcpu_svm *svm)
 {
        /*
-        * Intercept INVPCID instruction only if shadow page table is
-        * enabled. Interception is not required with nested page table
-        * enabled.
+        * Intercept INVPCID if shadow paging is enabled to sync/free shadow
+        * roots, or if INVPCID is disabled in the guest to inject #UD.
         */
        if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
-               if (!npt_enabled)
+               if (!npt_enabled ||
+                   !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
                        svm_set_intercept(svm, INTERCEPT_INVPCID);
                else
                        svm_clr_intercept(svm, INTERCEPT_INVPCID);
@@ -2214,15 +2211,20 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
                [SVM_INSTR_VMSAVE] = vmsave_interception,
        };
        struct vcpu_svm *svm = to_svm(vcpu);
+       int ret;
 
        if (is_guest_mode(vcpu)) {
                svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
                svm->vmcb->control.exit_info_1 = 0;
                svm->vmcb->control.exit_info_2 = 0;
 
-               return nested_svm_vmexit(svm);
-       } else
-               return svm_instr_handlers[opcode](svm);
+               /* Returns '1' or -errno on failure, '0' on success. */
+               ret = nested_svm_vmexit(svm);
+               if (ret)
+                       return ret;
+               return 1;
+       }
+       return svm_instr_handlers[opcode](svm);
 }
 
 /*
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b2f0b5e9cd638e99b7ed8b7b4459e8530558d8ba..bcca0b80e0d040be4ce9a4f9a2ee05d77e0c8257 100644 (file)
@@ -2167,15 +2167,13 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
                vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
 
        /*
-        * The PML address never changes, so it is constant in vmcs02.
-        * Conceptually we want to copy the PML index from vmcs01 here,
-        * and then back to vmcs01 on nested vmexit.  But since we flush
-        * the log and reset GUEST_PML_INDEX on each vmexit, the PML
-        * index is also effectively constant in vmcs02.
+        * PML is emulated for L2, but never enabled in hardware as the MMU
+        * handles A/D emulation.  Disabling PML for L2 also avoids having to
+        * deal with filtering out L2 GPAs from the buffer.
         */
        if (enable_pml) {
-               vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
-               vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+               vmcs_write64(PML_ADDRESS, 0);
+               vmcs_write16(GUEST_PML_INDEX, -1);
        }
 
        if (cpu_has_vmx_encls_vmexit())
@@ -2210,7 +2208,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
 
 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 {
-       u32 exec_control, vmcs12_exec_ctrl;
+       u32 exec_control;
        u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
 
        if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
@@ -2284,11 +2282,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                                  SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                  SECONDARY_EXEC_ENABLE_VMFUNC);
                if (nested_cpu_has(vmcs12,
-                                  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
-                       vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
-                               ~SECONDARY_EXEC_ENABLE_PML;
-                       exec_control |= vmcs12_exec_ctrl;
-               }
+                                  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
+                       exec_control |= vmcs12->secondary_vm_exec_control;
+
+               /* PML is emulated and never enabled in hardware for L2. */
+               exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 
                /* VMCS shadowing for L2 is emulated for now */
                exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
@@ -4200,9 +4198,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
 
-       if (!enable_ept)
-               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
        nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
 
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
@@ -4495,6 +4490,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
                vmx_set_virtual_apic_mode(vcpu);
        }
 
+       if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
+               vmx->nested.update_vmcs01_cpu_dirty_logging = false;
+               vmx_update_cpu_dirty_logging(vcpu);
+       }
+
        /* Unpin physical memory we referred to in vmcs02 */
        if (vmx->nested.apic_access_page) {
                kvm_release_page_clean(vmx->nested.apic_access_page);
@@ -5793,7 +5793,10 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
        case EXIT_REASON_PREEMPTION_TIMER:
                return true;
        case EXIT_REASON_PML_FULL:
-               /* We emulate PML support to L1. */
+               /*
+                * PML is emulated for an L1 VMM and should never be enabled in
+                * vmcs02, always "handle" PML_FULL by exiting to userspace.
+                */
                return true;
        case EXIT_REASON_VMFUNC:
                /* VM functions are emulated through L2->L0 vmexits. */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index d1df618cb7deb47e535c5686b5860bc014405ab1..9efc1a6b86930ad5879faa487270c8ab459a2da6 100644 (file)
@@ -298,7 +298,7 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
        if (IS_ERR(event)) {
                pr_debug_ratelimited("%s: failed %ld\n",
                                        __func__, PTR_ERR(event));
-               return -ENOENT;
+               return PTR_ERR(event);
        }
        lbr_desc->event = event;
        pmu->event_count++;
@@ -320,7 +320,7 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
        if (!intel_pmu_is_valid_lbr_msr(vcpu, index))
                return false;
 
-       if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu))
+       if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0)
                goto dummy;
 
        /*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e0a3a9be654bac8f4de80d1e673c652e4180854c..50810d4714628c2f3978847e9068c6106529797a 100644 (file)
@@ -4277,7 +4277,12 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        */
        exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
 
-       if (!enable_pml)
+       /*
+        * PML is enabled/disabled when dirty logging of memslots changes, but
+        * it needs to be set here when dirty logging is already active, e.g.
+        * if this vCPU was created after dirty logging was enabled.
+        */
+       if (!vcpu->kvm->arch.cpu_dirty_logging_count)
                exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 
        if (cpu_has_vmx_xsaves()) {
@@ -4295,18 +4300,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        }
 
        vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
-
-       /*
-        * Expose INVPCID if and only if PCID is also exposed to the guest.
-        * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
-        * if CR4.PCIDE=0.  Enumerating CPUID.INVPCID=1 would lead to incorrect
-        * behavior from the guest perspective (it would expect #GP or #PF).
-        */
-       if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
-               guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
        vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
 
-
        vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
        vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
 
@@ -5776,24 +5771,6 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
        vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 }
 
-/*
- * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
- * Called before reporting dirty_bitmap to userspace.
- */
-static void kvm_flush_pml_buffers(struct kvm *kvm)
-{
-       int i;
-       struct kvm_vcpu *vcpu;
-       /*
-        * We only need to kick vcpu out of guest mode here, as PML buffer
-        * is flushed at beginning of all VMEXITs, and it's obvious that only
-        * vcpus running in guest are possible to have unflushed GPAs in PML
-        * buffer.
-        */
-       kvm_for_each_vcpu(i, vcpu, kvm)
-               kvm_vcpu_kick(vcpu);
-}
-
 static void vmx_dump_sel(char *name, uint32_t sel)
 {
        pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
@@ -5976,9 +5953,10 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
         * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
         * querying dirty_bitmap, we only need to kick all vcpus out of guest
         * mode as if vcpus is in root mode, the PML buffer must has been
-        * flushed already.
+        * flushed already.  Note, PML is never enabled in hardware while
+        * running L2.
         */
-       if (enable_pml)
+       if (enable_pml && !is_guest_mode(vcpu))
                vmx_flush_pml_buffer(vcpu);
 
        /*
@@ -5994,6 +5972,13 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
                return handle_invalid_guest_state(vcpu);
 
        if (is_guest_mode(vcpu)) {
+               /*
+                * PML is never enabled when running L2, bail immediately if a
+                * PML full exit occurs as something is horribly wrong.
+                */
+               if (exit_reason.basic == EXIT_REASON_PML_FULL)
+                       goto unexpected_vmexit;
+
                /*
                 * The host physical addresses of some pages of guest memory
                 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
@@ -6851,13 +6836,15 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
        if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
                kvm_machine_check();
 
+       if (likely(!vmx->exit_reason.failed_vmentry))
+               vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
        trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
 
        if (unlikely(vmx->exit_reason.failed_vmentry))
                return EXIT_FASTPATH_NONE;
 
        vmx->loaded_vmcs->launched = 1;
-       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
        vmx_recover_nmi_blocking(vmx);
        vmx_complete_interrupts(vmx);
@@ -7330,8 +7317,8 @@ static __init void vmx_set_cpu_caps(void)
        /* CPUID 0x7 */
        if (kvm_mpx_supported())
                kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
-       if (cpu_has_vmx_invpcid())
-               kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
+       if (!cpu_has_vmx_invpcid())
+               kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
        if (vmx_pt_mode_is_host_guest())
                kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 
@@ -7509,30 +7496,24 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
                shrink_ple_window(vcpu);
 }
 
-static void vmx_slot_enable_log_dirty(struct kvm *kvm,
-                                    struct kvm_memory_slot *slot)
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
 {
-       if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
-               kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
-       kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
-}
-
-static void vmx_slot_disable_log_dirty(struct kvm *kvm,
-                                      struct kvm_memory_slot *slot)
-{
-       kvm_mmu_slot_set_dirty(kvm, slot);
-}
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-static void vmx_flush_log_dirty(struct kvm *kvm)
-{
-       kvm_flush_pml_buffers(kvm);
-}
+       if (is_guest_mode(vcpu)) {
+               vmx->nested.update_vmcs01_cpu_dirty_logging = true;
+               return;
+       }
 
-static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
-                                          struct kvm_memory_slot *memslot,
-                                          gfn_t offset, unsigned long mask)
-{
-       kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
+       /*
+        * Note, cpu_dirty_logging_count can be changed concurrent with this
+        * code, but in that case another update request will be made and so
+        * the guest will never run with a stale PML value.
+        */
+       if (vcpu->kvm->arch.cpu_dirty_logging_count)
+               secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
+       else
+               secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
 }
 
 static int vmx_pre_block(struct kvm_vcpu *vcpu)
@@ -7642,11 +7623,6 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
        return supported & BIT(bit);
 }
 
-static int vmx_cpu_dirty_log_size(void)
-{
-       return enable_pml ? PML_ENTITY_NUM : 0;
-}
-
 static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .hardware_unsetup = hardware_unsetup,
 
@@ -7746,10 +7722,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
        .sched_in = vmx_sched_in,
 
-       .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
-       .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
-       .flush_log_dirty = vmx_flush_log_dirty,
-       .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+       .cpu_dirty_log_size = PML_ENTITY_NUM,
+       .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
 
        .pre_block = vmx_pre_block,
        .post_block = vmx_post_block,
@@ -7777,7 +7751,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
        .msr_filter_changed = vmx_msr_filter_changed,
        .complete_emulated_msr = kvm_complete_insn_gp,
-       .cpu_dirty_log_size = vmx_cpu_dirty_log_size,
 
        .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
 };
@@ -7894,13 +7867,8 @@ static __init int hardware_setup(void)
        if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
                enable_pml = 0;
 
-       if (!enable_pml) {
-               vmx_x86_ops.slot_enable_log_dirty = NULL;
-               vmx_x86_ops.slot_disable_log_dirty = NULL;
-               vmx_x86_ops.flush_log_dirty = NULL;
-               vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
-               vmx_x86_ops.cpu_dirty_log_size = NULL;
-       }
+       if (!enable_pml)
+               vmx_x86_ops.cpu_dirty_log_size = 0;
 
        if (!cpu_has_vmx_preemption_timer())
                enable_preemption_timer = false;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 12c53d05a902becd08fa8f07904cba623c32cbd0..89da5e1251f18c115c8436aca484b3656151a0cf 100644 (file)
@@ -165,6 +165,7 @@ struct nested_vmx {
 
        bool change_vmcs01_virtual_apic_mode;
        bool reload_vmcs01_apic_access_page;
+       bool update_vmcs01_cpu_dirty_logging;
 
        /*
         * Enlightened VMCS has been enabled. It does not mean that L1 has to
@@ -393,6 +394,7 @@ int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
 void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
        u32 msr, int type, bool value);
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
 
 static inline u8 vmx_get_rvi(void)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 884e5b3838c735db2e772c80638667fcdb390866..3712bb5245eb9a206b872bb69e476d172ef343ed 100644 (file)
@@ -5215,10 +5215,18 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 
 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
+
        /*
-        * Flush potentially hardware-cached dirty pages to dirty_bitmap.
+        * Flush all CPUs' dirty log buffers to the  dirty_bitmap.  Called
+        * before reporting dirty_bitmap to userspace.  KVM flushes the buffers
+        * on all VM-Exits, thus we only need to kick running vCPUs to force a
+        * VM-Exit.
         */
-       static_call_cond(kvm_x86_flush_log_dirty)(kvm);
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_vcpu_kick(vcpu);
 }
 
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@ -8980,6 +8988,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        kvm_check_async_pf_completion(vcpu);
                if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
                        static_call(kvm_x86_msr_filter_changed)(vcpu);
+
+               if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
+                       static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
        }
 
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
@@ -10748,75 +10759,96 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
        return 0;
 }
 
+
+static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
+{
+       struct kvm_arch *ka = &kvm->arch;
+
+       if (!kvm_x86_ops.cpu_dirty_log_size)
+               return;
+
+       if ((enable && ++ka->cpu_dirty_logging_count == 1) ||
+           (!enable && --ka->cpu_dirty_logging_count == 0))
+               kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
+
+       WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0);
+}
+
 static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                                     struct kvm_memory_slot *old,
                                     struct kvm_memory_slot *new,
                                     enum kvm_mr_change change)
 {
+       bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+
        /*
-        * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
-        * See comments below.
+        * Update CPU dirty logging if dirty logging is being toggled.  This
+        * applies to all operations.
         */
-       if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
-               return;
+       if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)
+               kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
 
        /*
-        * Dirty logging tracks sptes in 4k granularity, meaning that large
-        * sptes have to be split.  If live migration is successful, the guest
-        * in the source machine will be destroyed and large sptes will be
-        * created in the destination. However, if the guest continues to run
-        * in the source machine (for example if live migration fails), small
-        * sptes will remain around and cause bad performance.
-        *
-        * Scan sptes if dirty logging has been stopped, dropping those
-        * which can be collapsed into a single large-page spte.  Later
-        * page faults will create the large-page sptes.
+        * Nothing more to do for RO slots (which can't be dirtied and can't be
+        * made writable) or CREATE/MOVE/DELETE of a slot.
         *
-        * There is no need to do this in any of the following cases:
+        * For a memslot with dirty logging disabled:
         * CREATE:      No dirty mappings will already exist.
         * MOVE/DELETE: The old mappings will already have been cleaned up by
         *              kvm_arch_flush_shadow_memslot()
+        *
+        * For a memslot with dirty logging enabled:
+        * CREATE:      No shadow pages exist, thus nothing to write-protect
+        *              and no dirty bits to clear.
+        * MOVE/DELETE: The old mappings will already have been cleaned up by
+        *              kvm_arch_flush_shadow_memslot().
         */
-       if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
-           !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
-               kvm_mmu_zap_collapsible_sptes(kvm, new);
+       if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
+               return;
 
        /*
-        * Enable or disable dirty logging for the slot.
-        *
-        * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
-        * slot have been zapped so no dirty logging updates are needed for
-        * the old slot.
-        * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
-        * any mappings that might be created in it will consume the
-        * properties of the new slot and do not need to be updated here.
-        *
-        * When PML is enabled, the kvm_x86_ops dirty logging hooks are
-        * called to enable/disable dirty logging.
-        *
-        * When disabling dirty logging with PML enabled, the D-bit is set
-        * for sptes in the slot in order to prevent unnecessary GPA
-        * logging in the PML buffer (and potential PML buffer full VMEXIT).
-        * This guarantees leaving PML enabled for the guest's lifetime
-        * won't have any additional overhead from PML when the guest is
-        * running with dirty logging disabled.
-        *
-        * When enabling dirty logging, large sptes are write-protected
-        * so they can be split on first write.  New large sptes cannot
-        * be created for this slot until the end of the logging.
-        * See the comments in fast_page_fault().
-        * For small sptes, nothing is done if the dirty log is in the
-        * initial-all-set state.  Otherwise, depending on whether pml
-        * is enabled the D-bit or the W-bit will be cleared.
+        * READONLY and non-flags changes were filtered out above, and the only
+        * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
+        * logging isn't being toggled on or off.
         */
-       if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-               if (kvm_x86_ops.slot_enable_log_dirty) {
-                       static_call(kvm_x86_slot_enable_log_dirty)(kvm, new);
-               } else {
-                       int level =
-                               kvm_dirty_log_manual_protect_and_init_set(kvm) ?
-                               PG_LEVEL_2M : PG_LEVEL_4K;
+       if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)))
+               return;
+
+       if (!log_dirty_pages) {
+               /*
+                * Dirty logging tracks sptes in 4k granularity, meaning that
+                * large sptes have to be split.  If live migration succeeds,
+                * the guest in the source machine will be destroyed and large
+                * sptes will be created in the destination.  However, if the
+                * guest continues to run in the source machine (for example if
+                * live migration fails), small sptes will remain around and
+                * cause bad performance.
+                *
+                * Scan sptes if dirty logging has been stopped, dropping those
+                * which can be collapsed into a single large-page spte.  Later
+                * page faults will create the large-page sptes.
+                */
+               kvm_mmu_zap_collapsible_sptes(kvm, new);
+       } else {
+               /* By default, write-protect everything to log writes. */
+               int level = PG_LEVEL_4K;
+
+               if (kvm_x86_ops.cpu_dirty_log_size) {
+                       /*
+                        * Clear all dirty bits, unless pages are treated as
+                        * dirty from the get-go.
+                        */
+                       if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
+                               kvm_mmu_slot_leaf_clear_dirty(kvm, new);
 
+                       /*
+                        * Write-protect large pages on write so that dirty
+                        * logging happens at 4k granularity.  No need to
+                        * write-protect small SPTEs since write accesses are
+                        * logged by the CPU via dirty bits.
+                        */
+                       level = PG_LEVEL_2M;
+               } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
                        /*
                         * If we're with initial-all-set, we don't need
                         * to write protect any small page because
@@ -10825,10 +10857,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                         * so that the page split can happen lazily on
                         * the first write to the huge page.
                         */
-                       kvm_mmu_slot_remove_write_access(kvm, new, level);
+                       level = PG_LEVEL_2M;
                }
-       } else {
-               static_call_cond(kvm_x86_slot_disable_log_dirty)(kvm, new);
+               kvm_mmu_slot_remove_write_access(kvm, new, level);
        }
 }
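
Stripped of its comments, the enable-logging path above reduces to choosing a write-protection granularity: 4K when write faults are the only way to log writes, 2M and above when either PML or the manual-protect plus initial-all-set mode lets huge pages be split lazily on the first write (with PML additionally clearing existing dirty bits unless pages start out dirty). A stand-alone model of that choice, with invented names and simplified values:

#include <stdbool.h>
#include <stdio.h>

enum { PG_LEVEL_4K = 1, PG_LEVEL_2M = 2 };

/* Mirrors the level selection in the enable-logging branch above. */
static int wrprot_level(bool cpu_dirty_log, bool manual_protect_init_all_set)
{
        if (cpu_dirty_log || manual_protect_init_all_set)
                return PG_LEVEL_2M;     /* only huge pages must fault on write */

        return PG_LEVEL_4K;             /* every write must fault to be logged */
}

int main(void)
{
        printf("PML:             level %d\n", wrprot_level(true, false));
        printf("init-all-set:    level %d\n", wrprot_level(false, true));
        printf("legacy default:  level %d\n", wrprot_level(false, false));
        return 0;
}
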
 
index e126ebda36d0a0f84545e07b9ecf05c13927dbb5..1b65e7204344a55d195ec25c299453a68d8c046b 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/bug.h>
+#include <linux/minmax.h>
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/preempt.h>
@@ -506,6 +507,8 @@ struct kvm {
        struct mmu_notifier mmu_notifier;
        unsigned long mmu_notifier_seq;
        long mmu_notifier_count;
+       unsigned long mmu_notifier_range_start;
+       unsigned long mmu_notifier_range_end;
 #endif
        long tlbs_dirty;
        struct list_head devices;
@@ -733,7 +736,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
                               bool atomic, bool *async, bool write_fault,
-                              bool *writable);
+                              bool *writable, hva_t *hva);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn);
 void kvm_release_pfn_dirty(kvm_pfn_t pfn);
@@ -1207,6 +1210,26 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
                return 1;
        return 0;
 }
+
+static inline int mmu_notifier_retry_hva(struct kvm *kvm,
+                                        unsigned long mmu_seq,
+                                        unsigned long hva)
+{
+       lockdep_assert_held(&kvm->mmu_lock);
+       /*
+        * If mmu_notifier_count is non-zero, then the range maintained by
+        * kvm_mmu_notifier_invalidate_range_start contains all addresses that
+        * might be being invalidated. Note that it may include some false
+        * positives, due to shortcuts when handling concurrent invalidations.
+        */
+       if (unlikely(kvm->mmu_notifier_count) &&
+           hva >= kvm->mmu_notifier_range_start &&
+           hva < kvm->mmu_notifier_range_end)
+               return 1;
+       if (kvm->mmu_notifier_seq != mmu_seq)
+               return 1;
+       return 0;
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
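
mmu_notifier_retry_hva() above is meant to pair with the hva that __gfn_to_pfn_memslot() now reports (see the kvm_main.c hunks below): the fault path snapshots mmu_notifier_seq, resolves the pfn while recording the hva, then re-checks under mmu_lock and retries only if that specific hva might have been invalidated. A hedged sketch of such a consumer, modelled on but not copied from the x86 page-fault path (the function name and the -EAGAIN retry value are placeholders, and the lock flavour is arch-specific):

static int example_map_gfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
                           gfn_t gfn)
{
        unsigned long mmu_seq;
        kvm_pfn_t pfn;
        hva_t hva;

        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();      /* order the sequence read before the pfn lookup */

        pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, &hva);
        if (is_error_noslot_pfn(pfn))
                return -EFAULT;

        write_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) {
                /* Raced with an invalidation covering this hva: try again. */
                write_unlock(&vcpu->kvm->mmu_lock);
                kvm_release_pfn_clean(pfn);
                return -EAGAIN;
        }

        /* ... install the gfn -> pfn mapping here ... */

        write_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
        return 0;
}
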
index 3a84394829ea5e8c2b02533173d1dae255265cac..32b87cc77c8e05b0b2646a4af462875017b2ed20 100644 (file)
@@ -33,6 +33,7 @@
 /demand_paging_test
 /dirty_log_test
 /dirty_log_perf_test
+/hardware_disable_test
 /kvm_create_max_vcpus
 /memslot_modification_stress_test
 /set_memory_region_test
index 8c8eda429576e9d8b4cbbe0be8a9950f2f3aca4b..a6d61f451f884048e5b4af5418c008ca00dcf7b8 100644 (file)
@@ -67,6 +67,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
+TEST_GEN_PROGS_x86_64 += hardware_disable_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c
new file mode 100644 (file)
index 0000000..2f2eeb8
--- /dev/null
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This test is intended to reproduce a crash that happens when
+ * kvm_arch_hardware_disable is called and it attempts to unregister the user
+ * return notifiers.
+ */
+
+#define _GNU_SOURCE
+
+#include <fcntl.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/wait.h>
+
+#include <test_util.h>
+
+#include "kvm_util.h"
+
+#define VCPU_NUM 4
+#define SLEEPING_THREAD_NUM (1 << 4)
+#define FORK_NUM (1ULL << 9)
+#define DELAY_US_MAX 2000
+#define GUEST_CODE_PIO_PORT 4
+
+sem_t *sem;
+
+/* Arguments for the pthreads */
+struct payload {
+       struct kvm_vm *vm;
+       uint32_t index;
+};
+
+static void guest_code(void)
+{
+       for (;;)
+               ;  /* Some busy work */
+       printf("Should not be reached.\n");
+}
+
+static void *run_vcpu(void *arg)
+{
+       struct payload *payload = (struct payload *)arg;
+       struct kvm_run *state = vcpu_state(payload->vm, payload->index);
+
+       vcpu_run(payload->vm, payload->index);
+
+       TEST_ASSERT(false, "%s: exited with reason %d: %s\n",
+                   __func__, state->exit_reason,
+                   exit_reason_str(state->exit_reason));
+       pthread_exit(NULL);
+}
+
+static void *sleeping_thread(void *arg)
+{
+       int fd;
+
+       while (true) {
+               fd = open("/dev/null", O_RDWR);
+               close(fd);
+       }
+       TEST_ASSERT(false, "%s: exited\n", __func__);
+       pthread_exit(NULL);
+}
+
+static inline void check_create_thread(pthread_t *thread, pthread_attr_t *attr,
+                                      void *(*f)(void *), void *arg)
+{
+       int r;
+
+       r = pthread_create(thread, attr, f, arg);
+       TEST_ASSERT(r == 0, "%s: failed to create thread", __func__);
+}
+
+static inline void check_set_affinity(pthread_t thread, cpu_set_t *cpu_set)
+{
+       int r;
+
+       r = pthread_setaffinity_np(thread, sizeof(cpu_set_t), cpu_set);
+       TEST_ASSERT(r == 0, "%s: failed to set affinity", __func__);
+}
+
+static inline void check_join(pthread_t thread, void **retval)
+{
+       int r;
+
+       r = pthread_join(thread, retval);
+       TEST_ASSERT(r == 0, "%s: failed to join thread", __func__);
+}
+
+static void run_test(uint32_t run)
+{
+       struct kvm_vm *vm;
+       cpu_set_t cpu_set;
+       pthread_t threads[VCPU_NUM];
+       pthread_t throw_away;
+       struct payload payloads[VCPU_NUM];
+       void *b;
+       uint32_t i, j;
+
+       CPU_ZERO(&cpu_set);
+       for (i = 0; i < VCPU_NUM; i++)
+               CPU_SET(i, &cpu_set);
+
+       vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
+       kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+       vm_create_irqchip(vm);
+
+       fprintf(stderr, "%s: [%d] start vcpus\n", __func__, run);
+       for (i = 0; i < VCPU_NUM; ++i) {
+               vm_vcpu_add_default(vm, i, guest_code);
+               payloads[i].vm = vm;
+               payloads[i].index = i;
+
+               check_create_thread(&threads[i], NULL, run_vcpu,
+                                   (void *)&payloads[i]);
+               check_set_affinity(threads[i], &cpu_set);
+
+               for (j = 0; j < SLEEPING_THREAD_NUM; ++j) {
+                       check_create_thread(&throw_away, NULL, sleeping_thread,
+                                           (void *)NULL);
+                       check_set_affinity(throw_away, &cpu_set);
+               }
+       }
+       fprintf(stderr, "%s: [%d] all threads launched\n", __func__, run);
+       sem_post(sem);
+       for (i = 0; i < VCPU_NUM; ++i)
+               check_join(threads[i], &b);
+       /* Should not be reached */
+       TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run);
+}
+
+int main(int argc, char **argv)
+{
+       uint32_t i;
+       int s, r;
+       pid_t pid;
+
+       sem = sem_open("vm_sem", O_CREAT | O_EXCL, 0644, 0);
+       sem_unlink("vm_sem");
+
+       for (i = 0; i < FORK_NUM; ++i) {
+               pid = fork();
+               TEST_ASSERT(pid >= 0, "%s: unable to fork", __func__);
+               if (pid == 0)
+                       run_test(i); /* This function always exits */
+
+               fprintf(stderr, "%s: [%d] waiting semaphore\n", __func__, i);
+               sem_wait(sem);
+               r = (rand() % DELAY_US_MAX) + 1;
+               fprintf(stderr, "%s: [%d] waiting %dus\n", __func__, i, r);
+               usleep(r);
+               r = waitpid(pid, &s, WNOHANG);
+               TEST_ASSERT(r != pid,
+                           "%s: [%d] child exited unexpectedly status: [%d]",
+                           __func__, i, s);
+               fprintf(stderr, "%s: [%d] killing child\n", __func__, i);
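+               /*
+                * Kill the child while its vCPU threads are still running so
+                * VM teardown reaches kvm_arch_hardware_disable() and the user
+                * return notifier unregistration this test tries to crash.
+                */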
+               kill(pid, SIGKILL);
+       }
+
+       sem_close(sem);
+       exit(0);
+}
index de0c76177d02a74cedbc0fe318b18867682e6a17..a8906e60a1081ab3ef60005921a8469faab72d13 100644 (file)
@@ -720,7 +720,8 @@ struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
 {
        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        struct kvm_cpuid2 *cpuid;
-       int rc, max_ent;
+       int max_ent;
+       int rc = -1;
 
        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 
index 001b9de4e727cd39f5f3097443756656ab211aa5..383df23514b9390c467379ec9a739eda14a72bf2 100644 (file)
@@ -486,6 +486,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
         * count is also read inside the mmu_lock critical section.
         */
        kvm->mmu_notifier_count++;
+       if (likely(kvm->mmu_notifier_count == 1)) {
+               kvm->mmu_notifier_range_start = range->start;
+               kvm->mmu_notifier_range_end = range->end;
+       } else {
+               /*
+                * Fully tracking multiple concurrent ranges has diminishing
+                * returns. Keep things simple and just find the minimal range
+                * which includes the current and new ranges. As there won't be
+                * enough information to subtract a range after its invalidate
+                * completes, any ranges invalidated concurrently will
+                * accumulate and persist until all outstanding invalidates
+                * complete.
+                */
+               kvm->mmu_notifier_range_start =
+                       min(kvm->mmu_notifier_range_start, range->start);
+               kvm->mmu_notifier_range_end =
+                       max(kvm->mmu_notifier_range_end, range->end);
+       }
        need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
                                             range->flags);
        /* we've to flush the tlb before the pages can be freed */
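
To make "the minimal range which includes the current and new ranges" concrete, here is a hypothetical interleaving (addresses invented for illustration):

/*
 *   invalidate_range_start(0x1000, 0x3000)   count 0->1, range = [0x1000, 0x3000)
 *   invalidate_range_start(0x8000, 0x9000)   count 1->2, range = [0x1000, 0x9000)
 *   invalidate_range_end(...)                count 2->1, range unchanged
 *   invalidate_range_end(...)                count 1->0, range no longer consulted
 *
 * While the count is non-zero, a fault on hva 0x5000 is treated as "in range"
 * and retried even though neither invalidation covers it -- the accepted
 * false positive mentioned in the kvm_host.h comment above.
 */
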
@@ -2023,10 +2041,13 @@ exit:
 
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
                               bool atomic, bool *async, bool write_fault,
-                              bool *writable)
+                              bool *writable, hva_t *hva)
 {
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
+       if (hva)
+               *hva = addr;
+
        if (addr == KVM_HVA_ERR_RO_BAD) {
                if (writable)
                        *writable = false;
@@ -2054,19 +2075,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable)
 {
        return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
-                                   write_fault, writable);
+                                   write_fault, writable, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-       return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+       return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-       return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+       return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);