Merge tag 'kvm-arm-for-5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm...
author Paolo Bonzini <pbonzini@redhat.com>
Thu, 11 Jul 2019 13:14:16 +0000 (15:14 +0200)
committer Paolo Bonzini <pbonzini@redhat.com>
Thu, 11 Jul 2019 13:14:16 +0000 (15:14 +0200)
KVM/arm updates for 5.3

- Add support for chained PMU counters in guests
- Improve SError handling
- Handle Neoverse N1 erratum #1349291
- Allow side-channel mitigation status to be migrated
- Standardise most AArch64 system register accesses to msr_s/mrs_s
- Fix host MPIDR corruption on 32bit

63 files changed:
Documentation/virtual/index.rst [new file with mode: 0644]
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/cpuid.rst [new file with mode: 0644]
Documentation/virtual/kvm/cpuid.txt [deleted file]
Documentation/virtual/kvm/hypercalls.txt
Documentation/virtual/kvm/index.rst [new file with mode: 0644]
Documentation/virtual/kvm/locking.txt
Documentation/virtual/kvm/msr.txt
Documentation/virtual/paravirt_ops.rst [moved from Documentation/virtual/paravirt_ops.txt with 65% similarity]
arch/mips/kvm/mips.c
arch/powerpc/kvm/powerpc.c
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/kvm-s390.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/include/uapi/asm/kvm_para.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kernel/kvm.c
arch/x86/kvm/Kconfig
arch/x86/kvm/cpuid.c
arch/x86/kvm/emulate.c
arch/x86/kvm/irq.h
arch/x86/kvm/irq_comm.c
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmutrace.h
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/pmu.c
arch/x86/kvm/pmu.h
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx/evmcs.c
arch/x86/kvm/vmx/evmcs.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/nested.h
arch/x86/kvm/vmx/ops.h
arch/x86/kvm/vmx/vmcs.h
arch/x86/kvm/vmx/vmcs12.h
arch/x86/kvm/vmx/vmcs_shadow_fields.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/linux/kvm_host.h
include/uapi/linux/kvm.h
include/uapi/linux/kvm_para.h
tools/include/uapi/linux/kvm.h
tools/testing/selftests/kvm/dirty_log_test.c
tools/testing/selftests/kvm/include/aarch64/processor.h
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/lib/aarch64/processor.c
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/kvm_util_internal.h
tools/testing/selftests/kvm/lib/ucall.c
tools/testing/selftests/kvm/lib/x86_64/processor.c
tools/testing/selftests/kvm/x86_64/evmcs_test.c
tools/testing/selftests/kvm/x86_64/kvm_create_max_vcpus.c
tools/testing/selftests/kvm/x86_64/smm_test.c
tools/testing/selftests/kvm/x86_64/state_test.c
virt/kvm/arm/arm.c
virt/kvm/irqchip.c
virt/kvm/kvm_main.c

diff --git a/Documentation/virtual/index.rst b/Documentation/virtual/index.rst
new file mode 100644 (file)
index 0000000..062ffb5
--- /dev/null
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================
+Linux Virtualization Support
+============================
+
+.. toctree::
+   :maxdepth: 2
+
+   kvm/index
+   paravirt_ops
+
+.. only:: html and subproject
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 2a4531bb06bde708d1851072fa2023ccb11496b5..8a97a1a7eca2229961d084a9b83552b0d4a1b924 100644 (file)
@@ -4081,6 +4081,32 @@ KVM_ARM_VCPU_FINALIZE call.
 See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization
 using this ioctl.
 
+4.120 KVM_SET_PMU_EVENT_FILTER
+
+Capability: KVM_CAP_PMU_EVENT_FILTER
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_pmu_event_filter (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_pmu_event_filter {
+       __u32 action;
+       __u32 nevents;
+       __u64 events[0];
+};
+
+This ioctl restricts the set of PMU events that the guest can program.
+The argument holds a list of events which will be allowed or denied.
+The eventsel+umask of each event the guest attempts to program is compared
+against the events field to determine whether the guest should have access.
+This only affects general purpose counters; fixed purpose counters can
+be disabled by changing the perfmon CPUID leaf.
+
+Valid values for 'action':
+#define KVM_PMU_EVENT_ALLOW 0
+#define KVM_PMU_EVENT_DENY 1
+
+
 5. The kvm_run structure
 ------------------------
 
@@ -4909,6 +4935,8 @@ Valid bits in args[0] are
 
 #define KVM_X86_DISABLE_EXITS_MWAIT            (1 << 0)
 #define KVM_X86_DISABLE_EXITS_HLT              (1 << 1)
+#define KVM_X86_DISABLE_EXITS_PAUSE            (1 << 2)
+#define KVM_X86_DISABLE_EXITS_CSTATE           (1 << 3)
 
 Enabling this capability on a VM provides userspace with a way to no
 longer intercept some instructions for improved latency in some
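
As a usage illustration for the KVM_SET_PMU_EVENT_FILTER ioctl documented above, a
minimal userspace sketch might look as follows. The set_pmu_allow_list() helper and
the two event codes are hypothetical and only show the call shape, assuming the ioctl
number and structure definition come from <linux/kvm.h>:

#include <linux/kvm.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

/* Allow only two illustrative eventsel+umask codes on an existing VM fd. */
static int set_pmu_allow_list(int vm_fd)
{
        const __u64 allowed[] = { 0x003c, 0x00c0 };     /* placeholder event codes */
        struct kvm_pmu_event_filter *f;
        int r;

        f = calloc(1, sizeof(*f) + sizeof(allowed));
        if (!f)
                return -1;

        f->action  = KVM_PMU_EVENT_ALLOW;
        f->nevents = 2;
        memcpy(f->events, allowed, sizeof(allowed));

        r = ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, f);
        free(f);
        return r;
}
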
diff --git a/Documentation/virtual/kvm/cpuid.rst b/Documentation/virtual/kvm/cpuid.rst
new file mode 100644 (file)
index 0000000..01b081f
--- /dev/null
@@ -0,0 +1,107 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============
+KVM CPUID bits
+==============
+
+:Author: Glauber Costa <glommer@gmail.com>
+
+A guest running on a kvm host can check some of its features using
+cpuid. This is not always guaranteed to work, since userspace can
+mask-out some, or even all KVM-related cpuid features before launching
+a guest.
+
+KVM cpuid functions are:
+
+function: KVM_CPUID_SIGNATURE (0x40000000)
+
+returns::
+
+   eax = 0x40000001
+   ebx = 0x4b4d564b
+   ecx = 0x564b4d56
+   edx = 0x4d
+
+Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
+The value in eax corresponds to the maximum cpuid function present in this leaf,
+and will be updated if more functions are added in the future.
+Note also that old hosts set eax value to 0x0. This should
+be interpreted as if the value was 0x40000001.
+This function queries the presence of KVM cpuid leafs.
+
+function: define KVM_CPUID_FEATURES (0x40000001)
+
+returns::
+
+          ebx, ecx
+          eax = an OR'ed group of (1 << flag)
+
+where ``flag`` is defined as below:
+
+================================= =========== ================================
+flag                              value       meaning
+================================= =========== ================================
+KVM_FEATURE_CLOCKSOURCE           0           kvmclock available at msrs
+                                              0x11 and 0x12
+
+KVM_FEATURE_NOP_IO_DELAY          1           not necessary to perform delays
+                                              on PIO operations
+
+KVM_FEATURE_MMU_OP                2           deprecated
+
+KVM_FEATURE_CLOCKSOURCE2          3           kvmclock available at msrs
+                                              0x4b564d00 and 0x4b564d01
+
+KVM_FEATURE_ASYNC_PF              4           async pf can be enabled by
+                                              writing to msr 0x4b564d02
+
+KVM_FEATURE_STEAL_TIME            5           steal time can be enabled by
+                                              writing to msr 0x4b564d03
+
+KVM_FEATURE_PV_EOI                6           paravirtualized end of interrupt
+                                              handler can be enabled by
+                                              writing to msr 0x4b564d04
+
+KVM_FEATURE_PV_UNHALT             7           guest checks this feature bit
+                                              before enabling paravirtualized
+                                              spinlock support
+
+KVM_FEATURE_PV_TLB_FLUSH          9           guest checks this feature bit
+                                              before enabling paravirtualized
+                                              tlb flush
+
+KVM_FEATURE_ASYNC_PF_VMEXIT       10          paravirtualized async PF VM EXIT
+                                              can be enabled by setting bit 2
+                                              when writing to msr 0x4b564d02
+
+KVM_FEATURE_PV_SEND_IPI           11          guest checks this feature bit
+                                              before enabling paravirtualized
+                                              send IPIs
+
+KVM_FEATURE_PV_POLL_CONTROL       12          host-side polling on HLT can
+                                              be disabled by writing
+                                              to msr 0x4b564d05.
+
+KVM_FEATURE_PV_SCHED_YIELD        13          guest checks this feature bit
+                                              before using paravirtualized
+                                              sched yield.
+
+KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24          host will warn if no guest-side
+                                              per-cpu warps are expected in
+                                              kvmclock
+================================= =========== ================================
+
+::
+
+      edx = an OR'ed group of (1 << flag)
+
+Where ``flag`` here is defined as below:
+
+================== ============ =================================
+flag               value        meaning
+================== ============ =================================
+KVM_HINTS_REALTIME 0            guest checks this feature bit to
+                                determine that vCPUs are never
+                                preempted for an unlimited time
+                                allowing optimizations
+================== ============ =================================
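
To make the probing sequence above concrete, a guest could query these leaves roughly
as follows. This is only a sketch using GCC's <cpuid.h> helper; the kvm_cpuid_features()
name is made up for the example, and userspace may have masked the leaves out entirely:

#include <cpuid.h>
#include <stdbool.h>
#include <stdint.h>

#define KVM_CPUID_SIGNATURE 0x40000000
#define KVM_CPUID_FEATURES  0x40000001

/* Returns true and fills *features (eax of leaf 0x40000001) when running on KVM. */
static bool kvm_cpuid_features(uint32_t *features)
{
        uint32_t eax, ebx, ecx, edx;

        __cpuid(KVM_CPUID_SIGNATURE, eax, ebx, ecx, edx);
        if (ebx != 0x4b4d564b || ecx != 0x564b4d56 || edx != 0x4d)
                return false;           /* "KVMKVMKVM" signature not present */

        __cpuid(KVM_CPUID_FEATURES, eax, ebx, ecx, edx);
        *features = eax;                /* OR'ed group of (1 << KVM_FEATURE_*) */
        return true;
}
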
diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
deleted file mode 100644 (file)
index 97ca194..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-KVM CPUID bits
-Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
-=====================================================
-
-A guest running on a kvm host, can check some of its features using
-cpuid. This is not always guaranteed to work, since userspace can
-mask-out some, or even all KVM-related cpuid features before launching
-a guest.
-
-KVM cpuid functions are:
-
-function: KVM_CPUID_SIGNATURE (0x40000000)
-returns : eax = 0x40000001,
-          ebx = 0x4b4d564b,
-          ecx = 0x564b4d56,
-          edx = 0x4d.
-Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
-The value in eax corresponds to the maximum cpuid function present in this leaf,
-and will be updated if more functions are added in the future.
-Note also that old hosts set eax value to 0x0. This should
-be interpreted as if the value was 0x40000001.
-This function queries the presence of KVM cpuid leafs.
-
-
-function: define KVM_CPUID_FEATURES (0x40000001)
-returns : ebx, ecx
-          eax = an OR'ed group of (1 << flag), where each flags is:
-
-
-flag                               || value || meaning
-=============================================================================
-KVM_FEATURE_CLOCKSOURCE            ||     0 || kvmclock available at msrs
-                                   ||       || 0x11 and 0x12.
-------------------------------------------------------------------------------
-KVM_FEATURE_NOP_IO_DELAY           ||     1 || not necessary to perform delays
-                                   ||       || on PIO operations.
-------------------------------------------------------------------------------
-KVM_FEATURE_MMU_OP                 ||     2 || deprecated.
-------------------------------------------------------------------------------
-KVM_FEATURE_CLOCKSOURCE2           ||     3 || kvmclock available at msrs
-                                   ||       || 0x4b564d00 and 0x4b564d01
-------------------------------------------------------------------------------
-KVM_FEATURE_ASYNC_PF               ||     4 || async pf can be enabled by
-                                   ||       || writing to msr 0x4b564d02
-------------------------------------------------------------------------------
-KVM_FEATURE_STEAL_TIME             ||     5 || steal time can be enabled by
-                                   ||       || writing to msr 0x4b564d03.
-------------------------------------------------------------------------------
-KVM_FEATURE_PV_EOI                 ||     6 || paravirtualized end of interrupt
-                                   ||       || handler can be enabled by writing
-                                   ||       || to msr 0x4b564d04.
-------------------------------------------------------------------------------
-KVM_FEATURE_PV_UNHALT              ||     7 || guest checks this feature bit
-                                   ||       || before enabling paravirtualized
-                                   ||       || spinlock support.
-------------------------------------------------------------------------------
-KVM_FEATURE_PV_TLB_FLUSH           ||     9 || guest checks this feature bit
-                                   ||       || before enabling paravirtualized
-                                   ||       || tlb flush.
-------------------------------------------------------------------------------
-KVM_FEATURE_ASYNC_PF_VMEXIT        ||    10 || paravirtualized async PF VM exit
-                                   ||       || can be enabled by setting bit 2
-                                   ||       || when writing to msr 0x4b564d02
-------------------------------------------------------------------------------
-KVM_FEATURE_PV_SEND_IPI            ||    11 || guest checks this feature bit
-                                   ||       || before using paravirtualized
-                                   ||       || send IPIs.
-------------------------------------------------------------------------------
-KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
-                                   ||       || per-cpu warps are expected in
-                                   ||       || kvmclock.
-------------------------------------------------------------------------------
-
-          edx = an OR'ed group of (1 << flag), where each flags is:
-
-
-flag                               || value || meaning
-==================================================================================
-KVM_HINTS_REALTIME                 ||     0 || guest checks this feature bit to
-                                   ||       || determine that vCPUs are never
-                                   ||       || preempted for an unlimited time,
-                                   ||       || allowing optimizations
-----------------------------------------------------------------------------------
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index da24c138c8d131bea63aeb652af3d6fc373514ef..da210651f71486dde5e27bcbee3ab62aefa25080 100644 (file)
@@ -141,3 +141,14 @@ a0 corresponds to the APIC ID in the third argument (a2), bit 1
 corresponds to the APIC ID a2+1, and so on.
 
 Returns the number of CPUs to which the IPIs were delivered successfully.
+
+7. KVM_HC_SCHED_YIELD
+------------------------
+Architecture: x86
+Status: active
+Purpose: Hypercall used to yield if the IPI target vCPU is preempted
+
+a0: destination APIC ID
+
+Usage example: When sending a call-function IPI-many to vCPUs, yield if
+any of the IPI target vCPUs was preempted.
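
The guest-side call shape is small; the sketch below condenses the arch/x86/kernel/kvm.c
hunk that appears later in this merge (the yield_if_preempted() wrapper is illustrative
only):

#include <linux/kvm_para.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <asm/smp.h>

/* Yield to the host if the vCPU we are about to IPI is not currently running. */
static void yield_if_preempted(int cpu)
{
        if (vcpu_is_preempted(cpu))
                kvm_hypercall1(KVM_HC_SCHED_YIELD,
                               per_cpu(x86_cpu_to_apicid, cpu));
}
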
diff --git a/Documentation/virtual/kvm/index.rst b/Documentation/virtual/kvm/index.rst
new file mode 100644 (file)
index 0000000..0b206a0
--- /dev/null
@@ -0,0 +1,11 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===
+KVM
+===
+
+.. toctree::
+   :maxdepth: 2
+
+   amd-memory-encryption
+   cpuid
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 1bb8bcaf8497703f7cdd61538ca1374f0e8ac622..635cd6eaf71495e081de44774e489d622323fcf4 100644 (file)
@@ -15,8 +15,6 @@ The acquisition orders for mutexes are as follows:
 
 On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock.
 
-For spinlocks, kvm_lock is taken outside kvm->mmu_lock.
-
 Everything else is a leaf: no other lock is taken inside the critical
 sections.
 
@@ -169,7 +167,7 @@ which time it will be set using the Dirty tracking mechanism described above.
 ------------
 
 Name:          kvm_lock
-Type:          spinlock_t
+Type:          mutex
 Arch:          any
 Protects:      - vm_list
 
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index f3f0d57ced8e1827fe8e8e6dc49808b18c9253fb..df1f4338b3caf3e466e783fec42b767cef1ad72c 100644 (file)
@@ -273,3 +273,12 @@ MSR_KVM_EOI_EN: 0x4b564d04
        guest must both read the least significant bit in the memory area and
        clear it using a single CPU instruction, such as test and clear, or
        compare and exchange.
+
+MSR_KVM_POLL_CONTROL: 0x4b564d05
+       Control host-side polling.
+
+       data: Bit 0 enables (1) or disables (0) host-side HLT polling logic.
+
+       KVM guests can request the host not to poll on HLT, for example if
+       they are performing polling themselves.
+
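
For illustration, a guest that does its own idle polling could turn host-side polling
off along these lines (a sketch; the kvm_disable_host_haltpoll() name is not from this
series):

#include <linux/kvm_para.h>
#include <asm/msr.h>

/* Clear bit 0 of MSR_KVM_POLL_CONTROL to ask the host not to poll on HLT. */
static void kvm_disable_host_haltpoll(void)
{
        if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
                wrmsrl(MSR_KVM_POLL_CONTROL, 0);
}
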
diff --git a/Documentation/virtual/paravirt_ops.txt b/Documentation/virtual/paravirt_ops.rst
similarity index 65%
rename from Documentation/virtual/paravirt_ops.txt
rename to Documentation/virtual/paravirt_ops.rst
index d4881c00e3396aa9da584f575c5e69f244949dc1..6b789d27cead44d22d1b6a4b299b651ad84b1c4f 100644 (file)
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
 Paravirt_ops
 ============
 
@@ -18,15 +21,15 @@ at boot time.
 pv_ops operations are classified into three categories:
 
 - simple indirect call
-  These operations correspond to high level functionality where it is
-  known that the overhead of indirect call isn't very important.
+   These operations correspond to high level functionality where it is
+   known that the overhead of indirect call isn't very important.
 
 - indirect call which allows optimization with binary patch
-  Usually these operations correspond to low level critical instructions. They
-  are called frequently and are performance critical. The overhead is
-  very important.
+   Usually these operations correspond to low level critical instructions. They
+   are called frequently and are performance critical. The overhead is
+   very important.
 
 - a set of macros for hand written assembly code
-  Hand written assembly codes (.S files) also need paravirtualization
-  because they include sensitive instructions or some of code paths in
-  them are very performance critical.
+   Hand written assembly codes (.S files) also need paravirtualization
+   because they include sensitive instructions or some of code paths in
+   them are very performance critical.
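
To picture the first category, a "simple indirect call" boils down to a structure of
function pointers that a hypervisor port can overwrite at boot. The sketch below is
purely illustrative and does not mirror the kernel's real pv_ops layout:

/* Illustrative only: one op, defaulting to the native implementation. */
struct my_pv_ops {
        void (*cpu_halt)(void);
};

static void native_cpu_halt(void)
{
        asm volatile("hlt");
}

static struct my_pv_ops my_pv_ops = {
        .cpu_halt = native_cpu_halt,    /* a guest port replaces this at boot */
};

static inline void pv_halt(void)
{
        my_pv_ops.cpu_halt();           /* plain indirect call, no patching */
}
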
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 0369f26ab96d6aeb934d7f6e084e9789419fd03e..2cfe839f0b3a776898595c480d66807d161910f4 100644 (file)
@@ -123,9 +123,9 @@ int kvm_arch_hardware_setup(void)
        return 0;
 }
 
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
 {
-       *(int *)rtn = 0;
+       return 0;
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6d704ad2472b4e9eca7c5b6e65109bbede70a8a0..0dba7eb24f92072616460ae7cee6f90205025917 100644 (file)
@@ -414,9 +414,9 @@ int kvm_arch_hardware_setup(void)
        return 0;
 }
 
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
 {
-       *(int *)rtn = kvmppc_core_check_processor_compat();
+       return kvmppc_core_check_processor_compat();
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 2b00a3ebee08627f89653ec8073cc1f2c7854b84..da5825a3c16bf420392af97b8ec9478722a516ad 100644 (file)
@@ -905,7 +905,6 @@ extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
 extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
 
 static inline void kvm_arch_hardware_disable(void) {}
-static inline void kvm_arch_check_processor_compat(void *rtn) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 28ebd647784c93b784bf625dc5063f02c5df3c30..0fef9192f6acdf5e5bb5d5ba043947fa4dc7495d 100644 (file)
@@ -227,6 +227,11 @@ int kvm_arch_hardware_enable(void)
        return 0;
 }
 
+int kvm_arch_check_processor_compat(void)
+{
+       return 0;
+}
+
 static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
                              unsigned long end);
 
@@ -2418,13 +2423,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
        if (!kvm->arch.sca)
                goto out_err;
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        sca_offset += 16;
        if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
                sca_offset = 0;
        kvm->arch.sca = (struct bsca_block *)
                        ((char *) kvm->arch.sca + sca_offset);
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        sprintf(debug_name, "kvm-%u", current->pid);
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 26d1eb83f72a1791393514721ef3916fcc5087d1..0cc5b611a113fe9f1942a36a5c96a192155471b3 100644 (file)
@@ -686,6 +686,7 @@ struct kvm_vcpu_arch {
        u32 virtual_tsc_mult;
        u32 virtual_tsc_khz;
        s64 ia32_tsc_adjust_msr;
+       u64 msr_ia32_power_ctl;
        u64 tsc_scaling_ratio;
 
        atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
@@ -752,6 +753,8 @@ struct kvm_vcpu_arch {
                struct gfn_to_hva_cache data;
        } pv_eoi;
 
+       u64 msr_kvm_poll_control;
+
        /*
         * Indicate whether the access faults on its page table in guest
         * which is set when fix page fault and used to detect unhandeable
@@ -879,6 +882,7 @@ struct kvm_arch {
        bool mwait_in_guest;
        bool hlt_in_guest;
        bool pause_in_guest;
+       bool cstate_in_guest;
 
        unsigned long irq_sources_bitmap;
        s64 kvmclock_offset;
@@ -926,6 +930,8 @@ struct kvm_arch {
 
        bool guest_can_read_msr_platform_info;
        bool exception_payload_enabled;
+
+       struct kvm_pmu_event_filter *pmu_event_filter;
 };
 
 struct kvm_vm_stat {
@@ -996,7 +1002,7 @@ struct kvm_x86_ops {
        int (*disabled_by_bios)(void);             /* __init */
        int (*hardware_enable)(void);
        void (*hardware_disable)(void);
-       void (*check_processor_compatibility)(void *rtn);
+       int (*check_processor_compatibility)(void);/* __init */
        int (*hardware_setup)(void);               /* __init */
        void (*hardware_unsetup)(void);            /* __exit */
        bool (*cpu_has_accelerated_tpr)(void);
@@ -1110,7 +1116,7 @@ struct kvm_x86_ops {
        int (*check_intercept)(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage);
-       void (*handle_external_intr)(struct kvm_vcpu *vcpu);
+       void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
        bool (*mpx_supported)(void);
        bool (*xsaves_supported)(void);
        bool (*umip_emulated)(void);
@@ -1529,7 +1535,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
                    unsigned long ipi_bitmap_high, u32 min,
                    unsigned long icr, int op_64_bit);
 
-u64 kvm_get_arch_capabilities(void);
 void kvm_define_shared_msr(unsigned index, u32 msr);
 int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d6ab5b4d15e543800a7a7524517b495fa6305074..e901b0ab116ff81dc8a77237f68afb9fe5948d35 100644 (file)
@@ -378,10 +378,11 @@ struct kvm_sync_regs {
        struct kvm_vcpu_events events;
 };
 
-#define KVM_X86_QUIRK_LINT0_REENABLED  (1 << 0)
-#define KVM_X86_QUIRK_CD_NW_CLEARED    (1 << 1)
-#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE  (1 << 2)
-#define KVM_X86_QUIRK_OUT_7E_INC_RIP   (1 << 3)
+#define KVM_X86_QUIRK_LINT0_REENABLED     (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED       (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE     (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP      (1 << 3)
+#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
 
 #define KVM_STATE_NESTED_FORMAT_VMX    0
 #define KVM_STATE_NESTED_FORMAT_SVM    1       /* unused */
@@ -432,4 +433,14 @@ struct kvm_nested_state {
        } data;
 };
 
+/* for KVM_CAP_PMU_EVENT_FILTER */
+struct kvm_pmu_event_filter {
+       __u32 action;
+       __u32 nevents;
+       __u64 events[0];
+};
+
+#define KVM_PMU_EVENT_ALLOW 0
+#define KVM_PMU_EVENT_DENY 1
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 19980ec1a316e8dc80a513948d5a5dff7b48fbda..2a8e0b6b9805a4c5f84ca4b2bf0c0e4ae74fcc02 100644 (file)
@@ -29,6 +29,8 @@
 #define KVM_FEATURE_PV_TLB_FLUSH       9
 #define KVM_FEATURE_ASYNC_PF_VMEXIT    10
 #define KVM_FEATURE_PV_SEND_IPI        11
+#define KVM_FEATURE_POLL_CONTROL       12
+#define KVM_FEATURE_PV_SCHED_YIELD     13
 
 #define KVM_HINTS_REALTIME      0
 
@@ -47,6 +49,7 @@
 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02
 #define MSR_KVM_STEAL_TIME  0x4b564d03
 #define MSR_KVM_PV_EOI_EN      0x4b564d04
+#define MSR_KVM_POLL_CONTROL   0x4b564d05
 
 struct kvm_steal_time {
        __u64 steal;
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d213ec5c3766db0dd5176c951b13e5f3c1514cfb..f0b0c90dd398246eb2882050d69c6b53ccca11af 100644 (file)
 
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL       2
-#define VMX_ABORT_VMCS_CORRUPTED             3
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5169b8cc35bb2d99c322c3e607d9813e083659fc..82caf01b63dd964148bbef9c8a5e2d920b802d4c 100644 (file)
@@ -527,6 +527,21 @@ static void kvm_setup_pv_ipi(void)
        pr_info("KVM setup pv IPIs\n");
 }
 
+static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
+{
+       int cpu;
+
+       native_send_call_func_ipi(mask);
+
+       /* Make sure other vCPUs get a chance to run if they need to. */
+       for_each_cpu(cpu, mask) {
+               if (vcpu_is_preempted(cpu)) {
+                       kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
+                       break;
+               }
+       }
+}
+
 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
 {
        native_smp_prepare_cpus(max_cpus);
@@ -638,6 +653,12 @@ static void __init kvm_guest_init(void)
 #ifdef CONFIG_SMP
        smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+       if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
+           !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+           kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+               smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
+               pr_info("KVM setup pv sched yield\n");
+       }
        if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
                                      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
                pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index fc042419e670b10199808251d8f7310454c283e2..840e12583b85bac9dd46d0628833e43a12f073f0 100644 (file)
@@ -41,6 +41,7 @@ config KVM
        select PERF_EVENTS
        select HAVE_KVM_MSI
        select HAVE_KVM_CPU_RELAX_INTERCEPT
+       select HAVE_KVM_NO_POLL
        select KVM_GENERIC_DIRTYLOG_READ_PROTECT
        select KVM_VFIO
        select SRCU
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 4992e7c99588f390b6e7faf4badbbd6b20aef9ef..ead6812103063a4433aaacd201912c2c62395d1d 100644 (file)
@@ -134,6 +134,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
                (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
                best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
 
+       if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
+               best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+               if (best) {
+                       if (vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT)
+                               best->ecx |= F(MWAIT);
+                       else
+                               best->ecx &= ~F(MWAIT);
+               }
+       }
+
        /* Update physical-address width */
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
        kvm_mmu_reset_context(vcpu);
@@ -276,19 +286,38 @@ static void cpuid_mask(u32 *word, int wordnum)
        *word &= boot_cpu_data.x86_capability[wordnum];
 }
 
-static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
                           u32 index)
 {
        entry->function = function;
        entry->index = index;
+       entry->flags = 0;
+
        cpuid_count(entry->function, entry->index,
                    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
-       entry->flags = 0;
+
+       switch (function) {
+       case 2:
+               entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+               break;
+       case 4:
+       case 7:
+       case 0xb:
+       case 0xd:
+       case 0x14:
+       case 0x8000001d:
+               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+               break;
+       }
 }
 
-static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
-                                  u32 func, u32 index, int *nent, int maxnent)
+static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry,
+                                   u32 func, int *nent, int maxnent)
 {
+       entry->function = func;
+       entry->index = 0;
+       entry->flags = 0;
+
        switch (func) {
        case 0:
                entry->eax = 7;
@@ -300,21 +329,83 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
                break;
        case 7:
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-               if (index == 0)
-                       entry->ecx = F(RDPID);
+               entry->eax = 0;
+               entry->ecx = F(RDPID);
                ++*nent;
        default:
                break;
        }
 
-       entry->function = func;
-       entry->index = index;
-
        return 0;
 }
 
-static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-                                u32 index, int *nent, int maxnent)
+static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
+{
+       unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
+       unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
+       unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
+       unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
+       unsigned f_la57;
+
+       /* cpuid 7.0.ebx */
+       const u32 kvm_cpuid_7_0_ebx_x86_features =
+               F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+               F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
+               F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
+               F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+               F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
+
+       /* cpuid 7.0.ecx*/
+       const u32 kvm_cpuid_7_0_ecx_x86_features =
+               F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
+               F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+               F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
+               F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
+
+       /* cpuid 7.0.edx*/
+       const u32 kvm_cpuid_7_0_edx_x86_features =
+               F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+               F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
+               F(MD_CLEAR);
+
+       switch (index) {
+       case 0:
+               entry->eax = 0;
+               entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
+               cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
+               /* TSC_ADJUST is emulated */
+               entry->ebx |= F(TSC_ADJUST);
+
+               entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+               f_la57 = entry->ecx & F(LA57);
+               cpuid_mask(&entry->ecx, CPUID_7_ECX);
+               /* Set LA57 based on hardware capability. */
+               entry->ecx |= f_la57;
+               entry->ecx |= f_umip;
+               /* PKU is not yet implemented for shadow paging. */
+               if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+                       entry->ecx &= ~F(PKU);
+
+               entry->edx &= kvm_cpuid_7_0_edx_x86_features;
+               cpuid_mask(&entry->edx, CPUID_7_EDX);
+               /*
+                * We emulate ARCH_CAPABILITIES in software even
+                * if the host doesn't support it.
+                */
+               entry->edx |= F(ARCH_CAPABILITIES);
+               break;
+       default:
+               WARN_ON_ONCE(1);
+               entry->eax = 0;
+               entry->ebx = 0;
+               entry->ecx = 0;
+               entry->edx = 0;
+               break;
+       }
+}
+
+static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
+                                 int *nent, int maxnent)
 {
        int r;
        unsigned f_nx = is_efer_nx() ? F(NX) : 0;
@@ -327,12 +418,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        unsigned f_lm = 0;
 #endif
        unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
-       unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
-       unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
        unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
-       unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
        unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
-       unsigned f_la57 = 0;
 
        /* cpuid 1.edx */
        const u32 kvm_cpuid_1_edx_x86_features =
@@ -377,7 +464,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        /* cpuid 0x80000008.ebx */
        const u32 kvm_cpuid_8000_0008_ebx_x86_features =
                F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
-               F(AMD_SSB_NO) | F(AMD_STIBP);
+               F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON);
 
        /* cpuid 0xC0000001.edx */
        const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -385,31 +472,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
                F(PMM) | F(PMM_EN);
 
-       /* cpuid 7.0.ebx */
-       const u32 kvm_cpuid_7_0_ebx_x86_features =
-               F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
-               F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
-               F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
-               F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
-               F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
-
        /* cpuid 0xD.1.eax */
        const u32 kvm_cpuid_D_1_eax_x86_features =
                F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
 
-       /* cpuid 7.0.ecx*/
-       const u32 kvm_cpuid_7_0_ecx_x86_features =
-               F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
-               F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
-               F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-               F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
-
-       /* cpuid 7.0.edx*/
-       const u32 kvm_cpuid_7_0_edx_x86_features =
-               F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
-               F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
-               F(MD_CLEAR);
-
        /* all calls to cpuid_count() should be made on the same cpu */
        get_cpu();
 
@@ -418,12 +484,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        if (*nent >= maxnent)
                goto out;
 
-       do_cpuid_1_ent(entry, function, index);
+       do_host_cpuid(entry, function, 0);
        ++*nent;
 
        switch (function) {
        case 0:
-               entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd));
+               /* Limited to the highest leaf implemented in KVM. */
+               entry->eax = min(entry->eax, 0x1fU);
                break;
        case 1:
                entry->edx &= kvm_cpuid_1_edx_x86_features;
@@ -441,14 +508,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        case 2: {
                int t, times = entry->eax & 0xff;
 
-               entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
                entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
                for (t = 1; t < times; ++t) {
                        if (*nent >= maxnent)
                                goto out;
 
-                       do_cpuid_1_ent(&entry[t], function, 0);
-                       entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+                       do_host_cpuid(&entry[t], function, 0);
                        ++*nent;
                }
                break;
@@ -458,7 +523,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        case 0x8000001d: {
                int i, cache_type;
 
-               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                /* read more entries until cache_type is zero */
                for (i = 1; ; ++i) {
                        if (*nent >= maxnent)
@@ -467,9 +531,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                        cache_type = entry[i - 1].eax & 0x1f;
                        if (!cache_type)
                                break;
-                       do_cpuid_1_ent(&entry[i], function, i);
-                       entry[i].flags |=
-                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                       do_host_cpuid(&entry[i], function, i);
                        ++*nent;
                }
                break;
@@ -480,36 +542,21 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->ecx = 0;
                entry->edx = 0;
                break;
+       /* function 7 has additional index. */
        case 7: {
-               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-               /* Mask ebx against host capability word 9 */
-               if (index == 0) {
-                       entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
-                       cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
-                       // TSC_ADJUST is emulated
-                       entry->ebx |= F(TSC_ADJUST);
-                       entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
-                       f_la57 = entry->ecx & F(LA57);
-                       cpuid_mask(&entry->ecx, CPUID_7_ECX);
-                       /* Set LA57 based on hardware capability. */
-                       entry->ecx |= f_la57;
-                       entry->ecx |= f_umip;
-                       /* PKU is not yet implemented for shadow paging. */
-                       if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
-                               entry->ecx &= ~F(PKU);
-                       entry->edx &= kvm_cpuid_7_0_edx_x86_features;
-                       cpuid_mask(&entry->edx, CPUID_7_EDX);
-                       /*
-                        * We emulate ARCH_CAPABILITIES in software even
-                        * if the host doesn't support it.
-                        */
-                       entry->edx |= F(ARCH_CAPABILITIES);
-               } else {
-                       entry->ebx = 0;
-                       entry->ecx = 0;
-                       entry->edx = 0;
+               int i;
+
+               for (i = 0; ; ) {
+                       do_cpuid_7_mask(&entry[i], i);
+                       if (i == entry->eax)
+                               break;
+                       if (*nent >= maxnent)
+                               goto out;
+
+                       ++i;
+                       do_host_cpuid(&entry[i], function, i);
+                       ++*nent;
                }
-               entry->eax = 0;
                break;
        }
        case 9:
@@ -543,11 +590,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->edx = edx.full;
                break;
        }
-       /* function 0xb has additional index. */
+       /*
+        * Per Intel's SDM, the 0x1f is a superset of 0xb,
+        * thus they can be handled by common code.
+        */
+       case 0x1f:
        case 0xb: {
                int i, level_type;
 
-               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                /* read more entries until level_type is zero */
                for (i = 1; ; ++i) {
                        if (*nent >= maxnent)
@@ -556,9 +606,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                        level_type = entry[i - 1].ecx & 0xff00;
                        if (!level_type)
                                break;
-                       do_cpuid_1_ent(&entry[i], function, i);
-                       entry[i].flags |=
-                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                       do_host_cpuid(&entry[i], function, i);
                        ++*nent;
                }
                break;
@@ -571,7 +619,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->ebx = xstate_required_size(supported, false);
                entry->ecx = entry->ebx;
                entry->edx &= supported >> 32;
-               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                if (!supported)
                        break;
 
@@ -580,7 +627,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                        if (*nent >= maxnent)
                                goto out;
 
-                       do_cpuid_1_ent(&entry[i], function, idx);
+                       do_host_cpuid(&entry[i], function, idx);
                        if (idx == 1) {
                                entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
                                cpuid_mask(&entry[i].eax, CPUID_D_1_EAX);
@@ -597,8 +644,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                        }
                        entry[i].ecx = 0;
                        entry[i].edx = 0;
-                       entry[i].flags |=
-                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                        ++*nent;
                        ++i;
                }
@@ -611,12 +656,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                if (!f_intel_pt)
                        break;
 
-               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                for (t = 1; t <= times; ++t) {
                        if (*nent >= maxnent)
                                goto out;
-                       do_cpuid_1_ent(&entry[t], function, t);
-                       entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                       do_host_cpuid(&entry[t], function, t);
                        ++*nent;
                }
                break;
@@ -640,7 +683,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                             (1 << KVM_FEATURE_PV_UNHALT) |
                             (1 << KVM_FEATURE_PV_TLB_FLUSH) |
                             (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
-                            (1 << KVM_FEATURE_PV_SEND_IPI);
+                            (1 << KVM_FEATURE_PV_SEND_IPI) |
+                            (1 << KVM_FEATURE_POLL_CONTROL) |
+                            (1 << KVM_FEATURE_PV_SCHED_YIELD);
 
                if (sched_info_on())
                        entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
@@ -730,21 +775,19 @@ out:
        return r;
 }
 
-static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
-                       u32 idx, int *nent, int maxnent, unsigned int type)
+static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func,
+                        int *nent, int maxnent, unsigned int type)
 {
        if (type == KVM_GET_EMULATED_CPUID)
-               return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
+               return __do_cpuid_func_emulated(entry, func, nent, maxnent);
 
-       return __do_cpuid_ent(entry, func, idx, nent, maxnent);
+       return __do_cpuid_func(entry, func, nent, maxnent);
 }
 
 #undef F
 
 struct kvm_cpuid_param {
        u32 func;
-       u32 idx;
-       bool has_leaf_count;
        bool (*qualifier)(const struct kvm_cpuid_param *param);
 };
 
@@ -788,11 +831,10 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
        int limit, nent = 0, r = -E2BIG, i;
        u32 func;
        static const struct kvm_cpuid_param param[] = {
-               { .func = 0, .has_leaf_count = true },
-               { .func = 0x80000000, .has_leaf_count = true },
-               { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
+               { .func = 0 },
+               { .func = 0x80000000 },
+               { .func = 0xC0000000, .qualifier = is_centaur_cpu },
                { .func = KVM_CPUID_SIGNATURE },
-               { .func = KVM_CPUID_FEATURES },
        };
 
        if (cpuid->nent < 1)
@@ -816,19 +858,16 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
                if (ent->qualifier && !ent->qualifier(ent))
                        continue;
 
-               r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
-                               &nent, cpuid->nent, type);
+               r = do_cpuid_func(&cpuid_entries[nent], ent->func,
+                                 &nent, cpuid->nent, type);
 
                if (r)
                        goto out_free;
 
-               if (!ent->has_leaf_count)
-                       continue;
-
                limit = cpuid_entries[nent - 1].eax;
                for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
-                       r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
-                                    &nent, cpuid->nent, type);
+                       r = do_cpuid_func(&cpuid_entries[nent], func,
+                                         &nent, cpuid->nent, type);
 
                if (r)
                        goto out_free;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4a387a23542484ac2d4ddabbc05d98f3ace6da42..8e409ad448f90aaccb2da83467b37ebc7a2c45a5 100644 (file)
@@ -4258,7 +4258,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
                ulong dr6;
 
                ctxt->ops->get_dr(ctxt, 6, &dr6);
-               dr6 &= ~15;
+               dr6 &= ~DR_TRAP_BITS;
                dr6 |= DR6_BD | DR6_RTM;
                ctxt->ops->set_dr(ctxt, 6, dr6);
                return emulate_db(ctxt);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index d6519a3aa959669a9adebf756152c35c2ede95ae..7c6233d37c6443cc096eedf20f11beeae380bd2d 100644 (file)
@@ -102,7 +102,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
        return mode != KVM_IRQCHIP_NONE;
 }
 
-bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 924b3bd5a7b7196148e861eb91cab9b768179415..8ecd48d31800a19236e9db8a7db284f65e4a29ad 100644 (file)
@@ -75,7 +75,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
                        if (r < 0)
                                r = 0;
                        r += kvm_apic_set_irq(vcpu, irq, dest_map);
-               } else if (kvm_lapic_enabled(vcpu)) {
+               } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
                        if (!kvm_vector_hashing_enabled()) {
                                if (!lowest)
                                        lowest = vcpu;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a21c440ff356ca1535375fdfe611716a843d5c49..42da7eb846aa2929cfc5de4c8d7321aa4a8fdec1 100644 (file)
@@ -69,6 +69,7 @@
 #define X2APIC_BROADCAST               0xFFFFFFFFul
 
 #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
+#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
 /* step-by-step approximation to mitigate fluctuation */
 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
 
@@ -85,11 +86,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
                apic_test_vector(vector, apic->regs + APIC_IRR);
 }
 
-static inline void apic_clear_vector(int vec, void *bitmap)
-{
-       clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
 static inline int __apic_test_and_set_vector(int vec, void *bitmap)
 {
        return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -443,12 +439,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
 
        if (unlikely(vcpu->arch.apicv_active)) {
                /* need to update RVI */
-               apic_clear_vector(vec, apic->regs + APIC_IRR);
+               kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
                kvm_x86_ops->hwapic_irr_update(vcpu,
                                apic_find_highest_irr(apic));
        } else {
                apic->irr_pending = false;
-               apic_clear_vector(vec, apic->regs + APIC_IRR);
+               kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
                if (apic_search_irr(apic) != -1)
                        apic->irr_pending = true;
        }
@@ -1053,9 +1049,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 
                if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
                        if (trig_mode)
-                               kvm_lapic_set_vector(vector, apic->regs + APIC_TMR);
+                               kvm_lapic_set_vector(vector,
+                                                    apic->regs + APIC_TMR);
                        else
-                               apic_clear_vector(vector, apic->regs + APIC_TMR);
+                               kvm_lapic_clear_vector(vector,
+                                                      apic->regs + APIC_TMR);
                }
 
                if (vcpu->arch.apicv_active)
@@ -1313,21 +1311,45 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
        return container_of(dev, struct kvm_lapic, dev);
 }
 
+#define APIC_REG_MASK(reg)     (1ull << ((reg) >> 4))
+#define APIC_REGS_MASK(first, count) \
+       (APIC_REG_MASK(first) * ((1ull << (count)) - 1))
+
 int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
                void *data)
 {
        unsigned char alignment = offset & 0xf;
        u32 result;
        /* this bitmask has a bit cleared for each reserved register */
-       static const u64 rmask = 0x43ff01ffffffe70cULL;
-
-       if ((alignment + len) > 4) {
-               apic_debug("KVM_APIC_READ: alignment error %x %d\n",
-                          offset, len);
-               return 1;
-       }
+       u64 valid_reg_mask =
+               APIC_REG_MASK(APIC_ID) |
+               APIC_REG_MASK(APIC_LVR) |
+               APIC_REG_MASK(APIC_TASKPRI) |
+               APIC_REG_MASK(APIC_PROCPRI) |
+               APIC_REG_MASK(APIC_LDR) |
+               APIC_REG_MASK(APIC_DFR) |
+               APIC_REG_MASK(APIC_SPIV) |
+               APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
+               APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
+               APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
+               APIC_REG_MASK(APIC_ESR) |
+               APIC_REG_MASK(APIC_ICR) |
+               APIC_REG_MASK(APIC_ICR2) |
+               APIC_REG_MASK(APIC_LVTT) |
+               APIC_REG_MASK(APIC_LVTTHMR) |
+               APIC_REG_MASK(APIC_LVTPC) |
+               APIC_REG_MASK(APIC_LVT0) |
+               APIC_REG_MASK(APIC_LVT1) |
+               APIC_REG_MASK(APIC_LVTERR) |
+               APIC_REG_MASK(APIC_TMICT) |
+               APIC_REG_MASK(APIC_TMCCT) |
+               APIC_REG_MASK(APIC_TDCR);
+
+       /* ARBPRI is not valid on x2APIC */
+       if (!apic_x2apic_mode(apic))
+               valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
 
-       if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
+       if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) {
                apic_debug("KVM_APIC_READ: read reserved register %x\n",
                           offset);
                return 1;
@@ -1499,11 +1521,40 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
        }
 }
 
-void wait_lapic_expire(struct kvm_vcpu *vcpu)
+static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
+                                             s64 advance_expire_delta)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
        u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
-       u64 guest_tsc, tsc_deadline, ns;
+       u64 ns;
+
+       /* too early */
+       if (advance_expire_delta < 0) {
+               ns = -advance_expire_delta * 1000000ULL;
+               do_div(ns, vcpu->arch.virtual_tsc_khz);
+               timer_advance_ns -= min((u32)ns,
+                       timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+       } else {
+       /* too late */
+               ns = advance_expire_delta * 1000000ULL;
+               do_div(ns, vcpu->arch.virtual_tsc_khz);
+               timer_advance_ns += min((u32)ns,
+                       timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+       }
+
+       if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
+               apic->lapic_timer.timer_advance_adjust_done = true;
+       if (unlikely(timer_advance_ns > 5000)) {
+               timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
+               apic->lapic_timer.timer_advance_adjust_done = false;
+       }
+       apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+}
+
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       u64 guest_tsc, tsc_deadline;
 
        if (apic->lapic_timer.expired_tscdeadline == 0)
                return;
@@ -1514,34 +1565,15 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
        tsc_deadline = apic->lapic_timer.expired_tscdeadline;
        apic->lapic_timer.expired_tscdeadline = 0;
        guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-       trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+       apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
 
        if (guest_tsc < tsc_deadline)
                __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
 
-       if (!apic->lapic_timer.timer_advance_adjust_done) {
-               /* too early */
-               if (guest_tsc < tsc_deadline) {
-                       ns = (tsc_deadline - guest_tsc) * 1000000ULL;
-                       do_div(ns, vcpu->arch.virtual_tsc_khz);
-                       timer_advance_ns -= min((u32)ns,
-                               timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
-               } else {
-               /* too late */
-                       ns = (guest_tsc - tsc_deadline) * 1000000ULL;
-                       do_div(ns, vcpu->arch.virtual_tsc_khz);
-                       timer_advance_ns += min((u32)ns,
-                               timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
-               }
-               if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
-                       apic->lapic_timer.timer_advance_adjust_done = true;
-               if (unlikely(timer_advance_ns > 5000)) {
-                       timer_advance_ns = 0;
-                       apic->lapic_timer.timer_advance_adjust_done = true;
-               }
-               apic->lapic_timer.timer_advance_ns = timer_advance_ns;
-       }
+       if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+               adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
 }
+EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
 
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
 {
@@ -2014,7 +2046,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
                apic_debug("%s: offset 0x%x with length 0x%x, and value is "
                           "0x%x\n", __func__, offset, len, val);
 
-       kvm_lapic_reg_write(apic, offset & 0xff0, val);
+       kvm_lapic_reg_write(apic, offset, val);
 
        return 0;
 }
@@ -2311,7 +2343,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
                     HRTIMER_MODE_ABS_PINNED);
        apic->lapic_timer.timer.function = apic_timer_fn;
        if (timer_advance_ns == -1) {
-               apic->lapic_timer.timer_advance_ns = 1000;
+               apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
                apic->lapic_timer.timer_advance_adjust_done = false;
        } else {
                apic->lapic_timer.timer_advance_ns = timer_advance_ns;
@@ -2321,7 +2353,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
 
        /*
         * APIC is created enabled. This will prevent kvm_lapic_set_base from
-        * thinking that APIC satet has changed.
+        * thinking that APIC state has changed.
         */
        vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
        static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
@@ -2330,6 +2362,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
        return 0;
 nomem_free_apic:
        kfree(apic);
+       vcpu->arch.apic = NULL;
 nomem:
        return -ENOMEM;
 }
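
For reference, the auto-tuning above converts a missed deadline measured in guest TSC cycles into nanoseconds of timer advance as delta * 10^6 / virtual_tsc_khz, exactly as in the open-coded branch being removed. A minimal, hedged sketch of just that arithmetic (the cycle count and TSC frequency are made-up illustration values, not taken from the patch):

/*
 * Hedged sketch of the cycles-to-nanoseconds conversion used by the LAPIC
 * timer-advance auto-tuning; it mirrors the "ns = delta * 1000000ULL;
 * do_div(ns, vcpu->arch.virtual_tsc_khz)" logic refactored above.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t delta_cycles = 4096;       /* example: timer fired 4096 TSC cycles off target */
        uint64_t virtual_tsc_khz = 2496000; /* example: ~2.5 GHz guest TSC */
        uint64_t ns = delta_cycles * 1000000ULL / virtual_tsc_khz;

        printf("%" PRIu64 " cycles is ~%" PRIu64 " ns of advance error\n",
               delta_cycles, ns);
        return 0;
}
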
index d6d049ba304526be2974b2c4228b3a70420adbe7..36747174e4a8ba7b19d5fb58ccfae2974f8e4795 100644 (file)
@@ -32,6 +32,7 @@ struct kvm_timer {
        u64 tscdeadline;
        u64 expired_tscdeadline;
        u32 timer_advance_ns;
+       s64 advance_expire_delta;
        atomic_t pending;                       /* accumulated triggered timers */
        bool hv_timer_in_use;
        bool timer_advance_adjust_done;
@@ -129,6 +130,11 @@ void kvm_lapic_exit(void);
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)
 
+static inline void kvm_lapic_clear_vector(int vec, void *bitmap)
+{
+       clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
 static inline void kvm_lapic_set_vector(int vec, void *bitmap)
 {
        set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -219,7 +225,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
-void wait_lapic_expire(struct kvm_vcpu *vcpu);
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
 
 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu);
index 98f6e4f88b04cef8fd6896d3f81b4ce8c3a76395..15d2c06d7fece21bf72bcdd64334b78712b46569 100644 (file)
@@ -140,9 +140,6 @@ module_param(dbg, bool, 0644);
 
 #include <trace/events/kvm.h>
 
-#define CREATE_TRACE_POINTS
-#include "mmutrace.h"
-
 #define SPTE_HOST_WRITEABLE    (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE     (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 
@@ -259,11 +256,20 @@ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
  */
 static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
 
+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+static u8 __read_mostly shadow_phys_bits;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
+static bool is_executable_pte(u64 spte);
 static union kvm_mmu_page_role
 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
 
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
 
 static inline bool kvm_available_flush_tlb_with_range(void)
 {
@@ -468,6 +474,21 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
+static u8 kvm_get_shadow_phys_bits(void)
+{
+       /*
+        * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
+        * in CPU detection code, but MKTME treats those reduced bits as
+        * 'keyID' thus they are not reserved bits. Therefore for MKTME
+        * we should still return physical address bits reported by CPUID.
+        */
+       if (!boot_cpu_has(X86_FEATURE_TME) ||
+           WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
+               return boot_cpu_data.x86_phys_bits;
+
+       return cpuid_eax(0x80000008) & 0xff;
+}
+
 static void kvm_mmu_reset_all_pte_masks(void)
 {
        u8 low_phys_bits;
@@ -481,6 +502,8 @@ static void kvm_mmu_reset_all_pte_masks(void)
        shadow_present_mask = 0;
        shadow_acc_track_mask = 0;
 
+       shadow_phys_bits = kvm_get_shadow_phys_bits();
+
        /*
         * If the CPU has 46 or less physical address bits, then set an
         * appropriate mask to guard against L1TF attacks. Otherwise, it is
@@ -1073,10 +1096,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 
 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 {
-       if (sp->role.direct)
-               BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
-       else
+       if (!sp->role.direct) {
                sp->gfns[index] = gfn;
+               return;
+       }
+
+       if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
+               pr_err_ratelimited("gfn mismatch under direct page %llx "
+                                  "(expected %llx, got %llx)\n",
+                                  sp->gfn,
+                                  kvm_mmu_page_get_gfn(sp, index), gfn);
 }
 
 /*
@@ -3055,10 +3084,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
                ret = RET_PF_EMULATE;
 
        pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-       pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
-                is_large_pte(*sptep)? "2MB" : "4kB",
-                *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
-                *sptep, sptep);
+       trace_kvm_mmu_set_spte(level, gfn, sptep);
        if (!was_rmapped && is_large_pte(*sptep))
                ++vcpu->kvm->stat.lpages;
 
@@ -3070,8 +3096,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
                }
        }
 
-       kvm_release_pfn_clean(pfn);
-
        return ret;
 }
 
@@ -3106,9 +3130,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
        if (ret <= 0)
                return -1;
 
-       for (i = 0; i < ret; i++, gfn++, start++)
+       for (i = 0; i < ret; i++, gfn++, start++) {
                mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
                             page_to_pfn(pages[i]), true, true);
+               put_page(pages[i]);
+       }
 
        return 0;
 }
@@ -3156,40 +3182,40 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
        __direct_pte_prefetch(vcpu, sp, sptep);
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
-                       int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+                       int map_writable, int level, kvm_pfn_t pfn,
+                       bool prefault)
 {
-       struct kvm_shadow_walk_iterator iterator;
+       struct kvm_shadow_walk_iterator it;
        struct kvm_mmu_page *sp;
-       int emulate = 0;
-       gfn_t pseudo_gfn;
+       int ret;
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       gfn_t base_gfn = gfn;
 
        if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
-               return 0;
+               return RET_PF_RETRY;
 
-       for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
-               if (iterator.level == level) {
-                       emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
-                                              write, level, gfn, pfn, prefault,
-                                              map_writable);
-                       direct_pte_prefetch(vcpu, iterator.sptep);
-                       ++vcpu->stat.pf_fixed;
+       trace_kvm_mmu_spte_requested(gpa, level, pfn);
+       for_each_shadow_entry(vcpu, gpa, it) {
+               base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+               if (it.level == level)
                        break;
-               }
 
-               drop_large_spte(vcpu, iterator.sptep);
-               if (!is_shadow_present_pte(*iterator.sptep)) {
-                       u64 base_addr = iterator.addr;
+               drop_large_spte(vcpu, it.sptep);
+               if (!is_shadow_present_pte(*it.sptep)) {
+                       sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+                                             it.level - 1, true, ACC_ALL);
 
-                       base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
-                       pseudo_gfn = base_addr >> PAGE_SHIFT;
-                       sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
-                                             iterator.level - 1, 1, ACC_ALL);
-
-                       link_shadow_page(vcpu, iterator.sptep, sp);
+                       link_shadow_page(vcpu, it.sptep, sp);
                }
        }
-       return emulate;
+
+       ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
+                          write, level, base_gfn, pfn, prefault,
+                          map_writable);
+       direct_pte_prefetch(vcpu, it.sptep);
+       ++vcpu->stat.pf_fixed;
+       return ret;
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -3216,11 +3242,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
 }
 
 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
-                                       gfn_t *gfnp, kvm_pfn_t *pfnp,
+                                       gfn_t gfn, kvm_pfn_t *pfnp,
                                        int *levelp)
 {
        kvm_pfn_t pfn = *pfnp;
-       gfn_t gfn = *gfnp;
        int level = *levelp;
 
        /*
@@ -3247,8 +3272,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
                mask = KVM_PAGES_PER_HPAGE(level) - 1;
                VM_BUG_ON((gfn & mask) != (pfn & mask));
                if (pfn & mask) {
-                       gfn &= ~mask;
-                       *gfnp = gfn;
                        kvm_release_pfn_clean(pfn);
                        pfn &= ~mask;
                        kvm_get_pfn(pfn);
@@ -3505,22 +3528,19 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
        if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
                return r;
 
+       r = RET_PF_RETRY;
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
        if (make_mmu_pages_available(vcpu) < 0)
                goto out_unlock;
        if (likely(!force_pt_level))
-               transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-       r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-       spin_unlock(&vcpu->kvm->mmu_lock);
-
-       return r;
-
+               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+       r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
 out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
-       return RET_PF_RETRY;
+       return r;
 }
 
 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
@@ -4015,19 +4035,6 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
        return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
-bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
-{
-       if (unlikely(!lapic_in_kernel(vcpu) ||
-                    kvm_event_needs_reinjection(vcpu) ||
-                    vcpu->arch.exception.pending))
-               return false;
-
-       if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
-               return false;
-
-       return kvm_x86_ops->interrupt_allowed(vcpu);
-}
-
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                         gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
 {
@@ -4147,22 +4154,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
                return r;
 
+       r = RET_PF_RETRY;
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
        if (make_mmu_pages_available(vcpu) < 0)
                goto out_unlock;
        if (likely(!force_pt_level))
-               transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-       r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-       spin_unlock(&vcpu->kvm->mmu_lock);
-
-       return r;
-
+               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+       r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
 out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
-       return RET_PF_RETRY;
+       return r;
 }
 
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4494,7 +4498,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
         */
        shadow_zero_check = &context->shadow_zero_check;
        __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
-                               boot_cpu_data.x86_phys_bits,
+                               shadow_phys_bits,
                                context->shadow_root_level, uses_nx,
                                guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
                                is_pse(vcpu), true);
@@ -4531,13 +4535,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
 
        if (boot_cpu_is_amd())
                __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
-                                       boot_cpu_data.x86_phys_bits,
+                                       shadow_phys_bits,
                                        context->shadow_root_level, false,
                                        boot_cpu_has(X86_FEATURE_GBPAGES),
                                        true, true);
        else
                __reset_rsvds_bits_mask_ept(shadow_zero_check,
-                                           boot_cpu_data.x86_phys_bits,
+                                           shadow_phys_bits,
                                            false);
 
        if (!shadow_me_mask)
@@ -4558,7 +4562,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
                                struct kvm_mmu *context, bool execonly)
 {
        __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
-                                   boot_cpu_data.x86_phys_bits, execonly);
+                                   shadow_phys_bits, execonly);
 }
 
 #define BYTE_MASK(access) \
@@ -5935,7 +5939,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
        int nr_to_scan = sc->nr_to_scan;
        unsigned long freed = 0;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
 
        list_for_each_entry(kvm, &vm_list, vm_list) {
                int idx;
@@ -5977,7 +5981,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
                break;
        }
 
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
        return freed;
 }
 
@@ -5999,6 +6003,34 @@ static void mmu_destroy_caches(void)
        kmem_cache_destroy(mmu_page_header_cache);
 }
 
+static void kvm_set_mmio_spte_mask(void)
+{
+       u64 mask;
+
+       /*
+        * Set the reserved bits and the present bit of a paging-structure
+        * entry to generate a page fault with PFERR.RSV = 1.
+        */
+
+       /*
+        * Mask the uppermost physical address bit, which would be reserved as
+        * long as the supported physical address width is less than 52.
+        */
+       mask = 1ull << 51;
+
+       /* Set the present bit. */
+       mask |= 1ull;
+
+       /*
+        * If the reserved bit is not supported, clear the present bit to
+        * disable the mmio page fault.
+        */
+       if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
+               mask &= ~1ull;
+
+       kvm_mmu_set_mmio_spte_mask(mask, mask);
+}
+
 int kvm_mmu_module_init(void)
 {
        int ret = -ENOMEM;
@@ -6015,6 +6047,8 @@ int kvm_mmu_module_init(void)
 
        kvm_mmu_reset_all_pte_masks();
 
+       kvm_set_mmio_spte_mask();
+
        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
                                            sizeof(struct pte_list_desc),
                                            0, SLAB_ACCOUNT, NULL);
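
For reference, a userspace-only sketch of the two computations introduced above: reading the physical address width straight from CPUID leaf 0x80000008 (the value kvm_get_shadow_phys_bits() falls back to when MKTME repurposes bits of boot_cpu_data.x86_phys_bits) and deriving the MMIO SPTE mask the way kvm_set_mmio_spte_mask() does. This illustrates the arithmetic only and is not kernel API:

/*
 * Hedged illustration: MAXPHYADDR comes from CPUID 0x80000008 EAX[7:0];
 * the MMIO SPTE mask is "reserved bit 51 + present bit", with the present
 * bit dropped when no reserved bit is available (52 physical bits).
 */
#include <cpuid.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        uint64_t mask;

        if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
                return 1;

        mask = (1ULL << 51) | 1ULL;          /* reserved bit + present bit */
        if ((eax & 0xff) == 52)
                mask &= ~1ULL;               /* no reserved bit: MMIO faulting disabled */

        printf("phys bits %u, mmio spte mask %#" PRIx64 "\n", eax & 0xff, mask);
        return 0;
}
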
index dd30dccd2ad5e250aef10e889e150011fece3468..d8001b4bca054a82655b60b5621a6e017711e0b0 100644 (file)
@@ -301,6 +301,65 @@ TRACE_EVENT(
                  __entry->kvm_gen == __entry->spte_gen
        )
 );
+
+TRACE_EVENT(
+       kvm_mmu_set_spte,
+       TP_PROTO(int level, gfn_t gfn, u64 *sptep),
+       TP_ARGS(level, gfn, sptep),
+
+       TP_STRUCT__entry(
+               __field(u64, gfn)
+               __field(u64, spte)
+               __field(u64, sptep)
+               __field(u8, level)
+               /* These depend on page entry type, so compute them now.  */
+               __field(bool, r)
+               __field(bool, x)
+               __field(u8, u)
+       ),
+
+       TP_fast_assign(
+               __entry->gfn = gfn;
+               __entry->spte = *sptep;
+               __entry->sptep = virt_to_phys(sptep);
+               __entry->level = level;
+               __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
+               __entry->x = is_executable_pte(__entry->spte);
+               __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+       ),
+
+       TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+                 __entry->gfn, __entry->spte,
+                 __entry->r ? "r" : "-",
+                 __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
+                 __entry->x ? "x" : "-",
+                 __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+                 __entry->level, __entry->sptep
+       )
+);
+
+TRACE_EVENT(
+       kvm_mmu_spte_requested,
+       TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
+       TP_ARGS(addr, level, pfn),
+
+       TP_STRUCT__entry(
+               __field(u64, gfn)
+               __field(u64, pfn)
+               __field(u8, level)
+       ),
+
+       TP_fast_assign(
+               __entry->gfn = addr >> PAGE_SHIFT;
+               __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+               __entry->level = level;
+       ),
+
+       TP_printk("gfn %llx pfn %llx level %d",
+                 __entry->gfn, __entry->pfn, __entry->level
+       )
+);
+
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
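
The two tracepoints above replace the verbose pgprintk() in mmu_set_spte(); a hedged userspace sketch of enabling them, assuming tracefs is mounted at the usual /sys/kernel/debug/tracing location and the events land in the kvmmmu group:

/*
 * Hedged sketch only: write "1" to the per-event enable files for the new
 * kvmmmu tracepoints.  The paths are assumptions about the local mount point.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int enable_event(const char *path)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, "1", 1) != 1) {
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(void)
{
        const char *base = "/sys/kernel/debug/tracing/events/kvmmmu";
        char path[256];

        snprintf(path, sizeof(path), "%s/kvm_mmu_set_spte/enable", base);
        if (enable_event(path))
                perror(path);
        snprintf(path, sizeof(path), "%s/kvm_mmu_spte_requested/enable", base);
        if (enable_event(path))
                perror(path);
        return 0;
}
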
index d583bcd119fc7544c95bd463a96a28622bac368e..7d5cdb3af59435c4dd7fb1287c7f8873c563d24e 100644 (file)
@@ -540,6 +540,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
        mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
                     true, true);
 
+       kvm_release_pfn_clean(pfn);
        return true;
 }
 
@@ -619,6 +620,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
        struct kvm_shadow_walk_iterator it;
        unsigned direct_access, access = gw->pt_access;
        int top_level, ret;
+       gfn_t base_gfn;
 
        direct_access = gw->pte_access;
 
@@ -663,35 +665,34 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                        link_shadow_page(vcpu, it.sptep, sp);
        }
 
-       for (;
-            shadow_walk_okay(&it) && it.level > hlevel;
-            shadow_walk_next(&it)) {
-               gfn_t direct_gfn;
+       base_gfn = gw->gfn;
+
+       trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
 
+       for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
                clear_sp_write_flooding_count(it.sptep);
+               base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+               if (it.level == hlevel)
+                       break;
+
                validate_direct_spte(vcpu, it.sptep, direct_access);
 
                drop_large_spte(vcpu, it.sptep);
 
-               if (is_shadow_present_pte(*it.sptep))
-                       continue;
-
-               direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
-
-               sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
-                                     true, direct_access);
-               link_shadow_page(vcpu, it.sptep, sp);
+               if (!is_shadow_present_pte(*it.sptep)) {
+                       sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
+                                             it.level - 1, true, direct_access);
+                       link_shadow_page(vcpu, it.sptep, sp);
+               }
        }
 
-       clear_sp_write_flooding_count(it.sptep);
        ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
-                          it.level, gw->gfn, pfn, prefault, map_writable);
+                          it.level, base_gfn, pfn, prefault, map_writable);
        FNAME(pte_prefetch)(vcpu, gw, it.sptep);
-
+       ++vcpu->stat.pf_fixed;
        return ret;
 
 out_gpte_changed:
-       kvm_release_pfn_clean(pfn);
        return RET_PF_RETRY;
 }
 
@@ -839,6 +840,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
                        walker.pte_access &= ~ACC_EXEC_MASK;
        }
 
+       r = RET_PF_RETRY;
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
@@ -847,19 +849,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
        if (make_mmu_pages_available(vcpu) < 0)
                goto out_unlock;
        if (!force_pt_level)
-               transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
+               transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
        r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
                         level, pfn, map_writable, prefault);
-       ++vcpu->stat.pf_fixed;
        kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
-       spin_unlock(&vcpu->kvm->mmu_lock);
-
-       return r;
 
 out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
-       return RET_PF_RETRY;
+       return r;
 }
 
 static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
index 132d149494d6ee2a83666178dd9c36688087be84..6f7e0ed4d848ecb5768816449e4cccd172cef0d3 100644 (file)
@@ -19,6 +19,9 @@
 #include "lapic.h"
 #include "pmu.h"
 
+/* This keeps the total size of the filter under 4k. */
+#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 63
+
 /* NOTE:
  * - Each perf counter is defined as "struct kvm_pmc";
  * - There are two types of perf counters: general purpose (gp) and fixed.
@@ -141,6 +144,10 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 {
        unsigned config, type = PERF_TYPE_RAW;
        u8 event_select, unit_mask;
+       struct kvm *kvm = pmc->vcpu->kvm;
+       struct kvm_pmu_event_filter *filter;
+       int i;
+       bool allow_event = true;
 
        if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
                printk_once("kvm pmu: pin control bit is ignored\n");
@@ -152,6 +159,22 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
        if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
                return;
 
+       filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
+       if (filter) {
+               for (i = 0; i < filter->nevents; i++)
+                       if (filter->events[i] ==
+                           (eventsel & AMD64_RAW_EVENT_MASK_NB))
+                               break;
+               if (filter->action == KVM_PMU_EVENT_ALLOW &&
+                   i == filter->nevents)
+                       allow_event = false;
+               if (filter->action == KVM_PMU_EVENT_DENY &&
+                   i < filter->nevents)
+                       allow_event = false;
+       }
+       if (!allow_event)
+               return;
+
        event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
        unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
 
@@ -348,3 +371,43 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
 {
        kvm_pmu_reset(vcpu);
 }
+
+int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
+{
+       struct kvm_pmu_event_filter tmp, *filter;
+       size_t size;
+       int r;
+
+       if (copy_from_user(&tmp, argp, sizeof(tmp)))
+               return -EFAULT;
+
+       if (tmp.action != KVM_PMU_EVENT_ALLOW &&
+           tmp.action != KVM_PMU_EVENT_DENY)
+               return -EINVAL;
+
+       if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
+               return -E2BIG;
+
+       size = struct_size(filter, events, tmp.nevents);
+       filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
+       if (!filter)
+               return -ENOMEM;
+
+       r = -EFAULT;
+       if (copy_from_user(filter, argp, size))
+               goto cleanup;
+
+       /* Ensure nevents can't be changed between the user copies. */
+       *filter = tmp;
+
+       mutex_lock(&kvm->lock);
+       rcu_swap_protected(kvm->arch.pmu_event_filter, filter,
+                          mutex_is_locked(&kvm->lock));
+       mutex_unlock(&kvm->lock);
+
+       synchronize_srcu_expedited(&kvm->srcu);
+       r = 0;
+cleanup:
+       kfree(filter);
+       return r;
+}
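
A hedged userspace sketch of how the new vm ioctl handled above might be driven. It assumes the uapi struct added elsewhere in this series exposes at least .action, .nevents and a trailing .events[] array (the fields read by the code above), and that KVM_PMU_EVENT_ALLOW / KVM_PMU_EVENT_DENY and the KVM_SET_PMU_EVENT_FILTER ioctl number come from <linux/kvm.h>; the raw event values a caller passes are whatever matches the AMD64_RAW_EVENT_MASK_NB comparison in reprogram_gp_counter():

/*
 * Hedged sketch: install an allow-list so that only the listed raw events
 * can be programmed by the guest; with KVM_PMU_EVENT_DENY the same list
 * would instead be blocked, matching the filter loop above.
 */
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int set_pmu_allow_list(int vm_fd, const __u64 *events, __u32 nevents)
{
        struct kvm_pmu_event_filter *f;
        size_t sz = sizeof(*f) + nevents * sizeof(f->events[0]);
        int ret;

        f = calloc(1, sz);
        if (!f)
                return -1;

        f->action = KVM_PMU_EVENT_ALLOW;   /* anything not listed is filtered out */
        f->nevents = nevents;
        for (__u32 i = 0; i < nevents; i++)
                f->events[i] = events[i];

        ret = ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, f);
        free(f);
        return ret;
}
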
index 22dff661145a1bcbf5996b7cb0c1fba660a5859e..58265f761c3bc95466a5ac1d7558c903281394ef 100644 (file)
@@ -118,6 +118,7 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
 void kvm_pmu_reset(struct kvm_vcpu *vcpu);
 void kvm_pmu_init(struct kvm_vcpu *vcpu);
 void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
 
 bool is_vmware_backdoor_pmc(u32 pmc_idx);
 
index 48c865a4e5dd16c445411c808930c28f2bfec97e..583b9fa656f3f8594165cc44e96c8b2741079cb1 100644 (file)
@@ -364,6 +364,10 @@ static int avic;
 module_param(avic, int, S_IRUGO);
 #endif
 
+/* enable/disable Next RIP Save */
+static int nrips = true;
+module_param(nrips, int, 0444);
+
 /* enable/disable Virtual VMLOAD VMSAVE */
 static int vls = true;
 module_param(vls, int, 0444);
@@ -770,7 +774,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (svm->vmcb->control.next_rip != 0) {
+       if (nrips && svm->vmcb->control.next_rip != 0) {
                WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
                svm->next_rip = svm->vmcb->control.next_rip;
        }
@@ -807,7 +811,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
 
        kvm_deliver_exception_payload(&svm->vcpu);
 
-       if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
+       if (nr == BP_VECTOR && !nrips) {
                unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 
                /*
@@ -1364,6 +1368,11 @@ static __init int svm_hardware_setup(void)
        } else
                kvm_disable_tdp();
 
+       if (nrips) {
+               if (!boot_cpu_has(X86_FEATURE_NRIPS))
+                       nrips = false;
+       }
+
        if (avic) {
                if (!npt_enabled ||
                    !boot_cpu_has(X86_FEATURE_AVIC) ||
@@ -3290,7 +3299,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
                                       vmcb->control.exit_int_info_err,
                                       KVM_ISA_SVM);
 
-       rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map);
+       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
        if (rc) {
                if (rc == -EINVAL)
                        kvm_inject_gp(&svm->vcpu, 0);
@@ -3580,7 +3589,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
        vmcb_gpa = svm->vmcb->save.rax;
 
-       rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map);
+       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
        if (rc) {
                if (rc == -EINVAL)
                        kvm_inject_gp(&svm->vcpu, 0);
@@ -3935,7 +3944,7 @@ static int rdpmc_interception(struct vcpu_svm *svm)
 {
        int err;
 
-       if (!static_cpu_has(X86_FEATURE_NRIPS))
+       if (!nrips)
                return emulate_on_interception(svm);
 
        err = kvm_rdpmc(&svm->vcpu);
@@ -5160,10 +5169,13 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
        kvm_lapic_set_irr(vec, vcpu->arch.apic);
        smp_mb__after_atomic();
 
-       if (avic_vcpu_is_running(vcpu))
-               wrmsrl(SVM_AVIC_DOORBELL,
-                      kvm_cpu_get_apicid(vcpu->cpu));
-       else
+       if (avic_vcpu_is_running(vcpu)) {
+               int cpuid = vcpu->cpu;
+
+               if (cpuid != get_cpu())
+                       wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
+               put_cpu();
+       } else
                kvm_vcpu_wake_up(vcpu);
 }
 
@@ -5640,6 +5652,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
        clgi();
        kvm_load_guest_xcr0(vcpu);
 
+       if (lapic_in_kernel(vcpu) &&
+               vcpu->arch.apic->lapic_timer.timer_advance_ns)
+               kvm_wait_lapic_expire(vcpu);
+
        /*
         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
         * it's non-zero. Since vmentry is serialising on affected CPUs, there
@@ -5861,9 +5877,9 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[2] = 0xd9;
 }
 
-static void svm_check_processor_compat(void *rtn)
+static int __init svm_check_processor_compat(void)
 {
-       *(int *)rtn = 0;
+       return 0;
 }
 
 static bool svm_cpu_has_accelerated_tpr(void)
@@ -5875,6 +5891,7 @@ static bool svm_has_emulated_msr(int index)
 {
        switch (index) {
        case MSR_IA32_MCG_EXT_CTL:
+       case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                return false;
        default:
                break;
@@ -6162,15 +6179,9 @@ out:
        return ret;
 }
 
-static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
-       local_irq_enable();
-       /*
-        * We must have an instruction with interrupts enabled, so
-        * the timer interrupt isn't delayed by the interrupt shadow.
-        */
-       asm("nop");
-       local_irq_disable();
+
 }
 
 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
@@ -7256,7 +7267,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .set_tdp_cr3 = set_tdp_cr3,
 
        .check_intercept = svm_check_intercept,
-       .handle_external_intr = svm_handle_external_intr,
+       .handle_exit_irqoff = svm_handle_exit_irqoff,
 
        .request_immediate_exit = __kvm_request_immediate_exit,
 
index 4d47a2631d1fb46d9f913b59743cb5417d7401c6..b5c831e79094d40467cf065a7056a3666e278237 100644 (file)
@@ -1365,7 +1365,7 @@ TRACE_EVENT(kvm_hv_timer_state,
                        __entry->vcpu_id = vcpu_id;
                        __entry->hv_timer_in_use = hv_timer_in_use;
                        ),
-               TP_printk("vcpu_id %x hv_timer %x\n",
+               TP_printk("vcpu_id %x hv_timer %x",
                        __entry->vcpu_id,
                        __entry->hv_timer_in_use)
 );
index 5466c6d85cf3ef07388e47012b88fb478ac0d5c2..72359709cdc1741beff8656b20f0432895388717 100644 (file)
@@ -3,6 +3,7 @@
 #include <linux/errno.h>
 #include <linux/smp.h>
 
+#include "../hyperv.h"
 #include "evmcs.h"
 #include "vmcs.h"
 #include "vmx.h"
@@ -313,6 +314,23 @@ void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
 }
 #endif
 
+bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
+{
+       struct hv_vp_assist_page assist_page;
+
+       *evmcs_gpa = -1ull;
+
+       if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
+               return false;
+
+       if (unlikely(!assist_page.enlighten_vmentry))
+               return false;
+
+       *evmcs_gpa = assist_page.current_nested_vmcs;
+
+       return true;
+}
+
 uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
index e0fcef85b3329478e0ded0188b5fff8b93ca8388..39a24eec88847df83a655b240d90fddde023990b 100644 (file)
@@ -195,6 +195,7 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
 static inline void evmcs_touch_msr_bitmap(void) {}
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
+bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
 uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
 int nested_enable_evmcs(struct kvm_vcpu *vcpu,
                        uint16_t *vmcs_version);
index 5f9c1a200201c90c595c240a55389e8fa35e87bb..6e88f459b323960f990467b59de9f4541feddbfe 100644 (file)
@@ -41,15 +41,19 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
 
-static u16 shadow_read_only_fields[] = {
-#define SHADOW_FIELD_RO(x) x,
+struct shadow_vmcs_field {
+       u16     encoding;
+       u16     offset;
+};
+static struct shadow_vmcs_field shadow_read_only_fields[] = {
+#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
 #include "vmcs_shadow_fields.h"
 };
 static int max_shadow_read_only_fields =
        ARRAY_SIZE(shadow_read_only_fields);
 
-static u16 shadow_read_write_fields[] = {
-#define SHADOW_FIELD_RW(x) x,
+static struct shadow_vmcs_field shadow_read_write_fields[] = {
+#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
 #include "vmcs_shadow_fields.h"
 };
 static int max_shadow_read_write_fields =
@@ -63,34 +67,40 @@ static void init_vmcs_shadow_fields(void)
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
 
        for (i = j = 0; i < max_shadow_read_only_fields; i++) {
-               u16 field = shadow_read_only_fields[i];
+               struct shadow_vmcs_field entry = shadow_read_only_fields[i];
+               u16 field = entry.encoding;
 
                if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
                    (i + 1 == max_shadow_read_only_fields ||
-                    shadow_read_only_fields[i + 1] != field + 1))
+                    shadow_read_only_fields[i + 1].encoding != field + 1))
                        pr_err("Missing field from shadow_read_only_field %x\n",
                               field + 1);
 
                clear_bit(field, vmx_vmread_bitmap);
-#ifdef CONFIG_X86_64
                if (field & 1)
+#ifdef CONFIG_X86_64
                        continue;
+#else
+                       entry.offset += sizeof(u32);
 #endif
-               if (j < i)
-                       shadow_read_only_fields[j] = field;
-               j++;
+               shadow_read_only_fields[j++] = entry;
        }
        max_shadow_read_only_fields = j;
 
        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
-               u16 field = shadow_read_write_fields[i];
+               struct shadow_vmcs_field entry = shadow_read_write_fields[i];
+               u16 field = entry.encoding;
 
                if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
                    (i + 1 == max_shadow_read_write_fields ||
-                    shadow_read_write_fields[i + 1] != field + 1))
+                    shadow_read_write_fields[i + 1].encoding != field + 1))
                        pr_err("Missing field from shadow_read_write_field %x\n",
                               field + 1);
 
+               WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
+                         field <= GUEST_TR_AR_BYTES,
+                         "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
+
                /*
                 * PML and the preemption timer can be emulated, but the
                 * processor cannot vmwrite to fields that don't exist
@@ -115,13 +125,13 @@ static void init_vmcs_shadow_fields(void)
 
                clear_bit(field, vmx_vmwrite_bitmap);
                clear_bit(field, vmx_vmread_bitmap);
-#ifdef CONFIG_X86_64
                if (field & 1)
+#ifdef CONFIG_X86_64
                        continue;
+#else
+                       entry.offset += sizeof(u32);
 #endif
-               if (j < i)
-                       shadow_read_write_fields[j] = field;
-               j++;
+               shadow_read_write_fields[j++] = entry;
        }
        max_shadow_read_write_fields = j;
 }
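
With each shadow field now carrying its byte offset into struct vmcs12 (recorded via offsetof() in the SHADOW_FIELD_RO/RW table above), the copy helpers reworked further down can dereference vmcs12 directly instead of dispatching on the field encoding. A hedged sketch of what such an offset-based read accessor could look like, assuming the vmcs_field_width()/VMCS_FIELD_WIDTH_* helpers from vmcs.h; the authoritative helpers live in vmcs12.h, which is not part of this excerpt, and may differ in detail:

/*
 * Illustrative only, not the in-tree helper: the field width is recovered
 * from the VMCS encoding, the location from the offset recorded in
 * struct shadow_vmcs_field.
 */
static inline u64 sketch_vmcs12_read(struct vmcs12 *vmcs12,
                                     unsigned long encoding, u16 offset)
{
        char *p = (char *)vmcs12 + offset;

        switch (vmcs_field_width(encoding)) {
        case VMCS_FIELD_WIDTH_U16:
                return *(u16 *)p;
        case VMCS_FIELD_WIDTH_U32:
                return *(u32 *)p;
        case VMCS_FIELD_WIDTH_U64:
                return *(u64 *)p;
        default:
                return *(unsigned long *)p; /* natural-width field */
        }
}
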
@@ -182,7 +192,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 
 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 {
-       vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+       secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 }
 
@@ -238,22 +248,41 @@ static void free_nested(struct kvm_vcpu *vcpu)
        free_loaded_vmcs(&vmx->nested.vmcs02);
 }
 
+static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
+                                    struct loaded_vmcs *prev)
+{
+       struct vmcs_host_state *dest, *src;
+
+       if (unlikely(!vmx->guest_state_loaded))
+               return;
+
+       src = &prev->host_state;
+       dest = &vmx->loaded_vmcs->host_state;
+
+       vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+       dest->ldt_sel = src->ldt_sel;
+#ifdef CONFIG_X86_64
+       dest->ds_sel = src->ds_sel;
+       dest->es_sel = src->es_sel;
+#endif
+}
+
 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct loaded_vmcs *prev;
        int cpu;
 
        if (vmx->loaded_vmcs == vmcs)
                return;
 
        cpu = get_cpu();
-       vmx_vcpu_put(vcpu);
+       prev = vmx->loaded_vmcs;
        vmx->loaded_vmcs = vmcs;
-       vmx_vcpu_load(vcpu, cpu);
+       vmx_vcpu_load_vmcs(vcpu, cpu);
+       vmx_sync_vmcs_host_state(vmx, prev);
        put_cpu();
 
-       vm_entry_controls_reset_shadow(vmx);
-       vm_exit_controls_reset_shadow(vmx);
        vmx_segment_cache_clear(vmx);
 }
 
@@ -930,8 +959,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
                 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
                 * must not be dereferenced.
                 */
-               if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
-                   !nested_ept) {
+               if (is_pae_paging(vcpu) && !nested_ept) {
                        if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
                                *entry_failure_code = ENTRY_FAIL_PDPTE;
                                return -EINVAL;
@@ -1105,14 +1133,6 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
        vmx->nested.msrs.misc_low = data;
        vmx->nested.msrs.misc_high = data >> 32;
 
-       /*
-        * If L1 has read-only VM-exit information fields, use the
-        * less permissive vmx_vmwrite_bitmap to specify write
-        * permissions for the shadow VMCS.
-        */
-       if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
-               vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
-
        return 0;
 }
 
@@ -1214,6 +1234,11 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
        case MSR_IA32_VMX_VMCS_ENUM:
                vmx->nested.msrs.vmcs_enum = data;
                return 0;
+       case MSR_IA32_VMX_VMFUNC:
+               if (data & ~vmx->nested.msrs.vmfunc_controls)
+                       return -EINVAL;
+               vmx->nested.msrs.vmfunc_controls = data;
+               return 0;
        default:
                /*
                 * The rest of the VMX capability MSRs do not support restore.
@@ -1301,41 +1326,29 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
 }
 
 /*
- * Copy the writable VMCS shadow fields back to the VMCS12, in case
- * they have been modified by the L1 guest. Note that the "read-only"
- * VM-exit information fields are actually writable if the vCPU is
- * configured to support "VMWRITE to any supported field in the VMCS."
+ * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
+ * been modified by the L1 guest.  Note, "writable" in this context means
+ * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
+ * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
+ * VM-exit information fields (which are actually writable if the vCPU is
+ * configured to support "VMWRITE to any supported field in the VMCS").
  */
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 {
-       const u16 *fields[] = {
-               shadow_read_write_fields,
-               shadow_read_only_fields
-       };
-       const int max_fields[] = {
-               max_shadow_read_write_fields,
-               max_shadow_read_only_fields
-       };
-       int i, q;
-       unsigned long field;
-       u64 field_value;
        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+       struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
+       struct shadow_vmcs_field field;
+       unsigned long val;
+       int i;
 
        preempt_disable();
 
        vmcs_load(shadow_vmcs);
 
-       for (q = 0; q < ARRAY_SIZE(fields); q++) {
-               for (i = 0; i < max_fields[q]; i++) {
-                       field = fields[q][i];
-                       field_value = __vmcs_readl(field);
-                       vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
-               }
-               /*
-                * Skip the VM-exit information fields if they are read-only.
-                */
-               if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
-                       break;
+       for (i = 0; i < max_shadow_read_write_fields; i++) {
+               field = shadow_read_write_fields[i];
+               val = __vmcs_readl(field.encoding);
+               vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
        }
 
        vmcs_clear(shadow_vmcs);
@@ -1346,7 +1359,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 {
-       const u16 *fields[] = {
+       const struct shadow_vmcs_field *fields[] = {
                shadow_read_write_fields,
                shadow_read_only_fields
        };
@@ -1354,18 +1367,20 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
                max_shadow_read_write_fields,
                max_shadow_read_only_fields
        };
-       int i, q;
-       unsigned long field;
-       u64 field_value = 0;
        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+       struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
+       struct shadow_vmcs_field field;
+       unsigned long val;
+       int i, q;
 
        vmcs_load(shadow_vmcs);
 
        for (q = 0; q < ARRAY_SIZE(fields); q++) {
                for (i = 0; i < max_fields[q]; i++) {
                        field = fields[q][i];
-                       vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
-                       __vmcs_writel(field, field_value);
+                       val = vmcs12_read_any(vmcs12, field.encoding,
+                                             field.offset);
+                       __vmcs_writel(field.encoding, val);
                }
        }
 
@@ -1623,7 +1638,7 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
         * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
         * evmcs->host_idtr_base = vmcs12->host_idtr_base;
         * evmcs->host_rsp = vmcs12->host_rsp;
-        * sync_vmcs12() doesn't read these:
+        * sync_vmcs02_to_vmcs12() doesn't read these:
         * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
         * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
         * evmcs->msr_bitmap = vmcs12->msr_bitmap;
@@ -1768,26 +1783,22 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
                                                 bool from_launch)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct hv_vp_assist_page assist_page;
+       bool evmcs_gpa_changed = false;
+       u64 evmcs_gpa;
 
        if (likely(!vmx->nested.enlightened_vmcs_enabled))
                return 1;
 
-       if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
-               return 1;
-
-       if (unlikely(!assist_page.enlighten_vmentry))
+       if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
                return 1;
 
-       if (unlikely(assist_page.current_nested_vmcs !=
-                    vmx->nested.hv_evmcs_vmptr)) {
-
+       if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
                if (!vmx->nested.hv_evmcs)
                        vmx->nested.current_vmptr = -1ull;
 
                nested_release_evmcs(vcpu);
 
-               if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs),
+               if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
                                 &vmx->nested.hv_evmcs_map))
                        return 0;
 
@@ -1822,15 +1833,9 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
                }
 
                vmx->nested.dirty_vmcs12 = true;
-               /*
-                * As we keep L2 state for one guest only 'hv_clean_fields' mask
-                * can't be used when we switch between them. Reset it here for
-                * simplicity.
-                */
-               vmx->nested.hv_evmcs->hv_clean_fields &=
-                       ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
-               vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
+               vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
 
+               evmcs_gpa_changed = true;
                /*
                 * Unlike normal vmcs12, enlightened vmcs12 is not fully
                 * reloaded from guest's memory (read only fields, fields not
@@ -1844,10 +1849,19 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
                }
 
        }
+
+       /*
+        * Clean fields data can't be used on VMLAUNCH and when we switch
+        * between different L2 guests as KVM keeps a single VMCS12 per L1.
+        */
+       if (from_launch || evmcs_gpa_changed)
+               vmx->nested.hv_evmcs->hv_clean_fields &=
+                       ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
        return 1;
 }
 
-void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu)
+void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -1868,7 +1882,7 @@ void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu)
                copy_vmcs12_to_shadow(vmx);
        }
 
-       vmx->nested.need_vmcs12_sync = false;
+       vmx->nested.need_vmcs12_to_shadow_sync = false;
 }
 
 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@ -1948,8 +1962,20 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
        if (cpu_has_vmx_msr_bitmap())
                vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
 
-       if (enable_pml)
+       /*
+        * The PML address never changes, so it is constant in vmcs02.
+        * Conceptually we want to copy the PML index from vmcs01 here,
+        * and then back to vmcs01 on nested vmexit.  But since we flush
+        * the log and reset GUEST_PML_INDEX on each vmexit, the PML
+        * index is also effectively constant in vmcs02.
+        */
+       if (enable_pml) {
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+               vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+       }
+
+       if (cpu_has_vmx_encls_vmexit())
+               vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
 
        /*
         * Set the MSR load/store lists to match L0's settings.  Only the
@@ -1963,7 +1989,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
        vmx_set_constant_host_state(vmx);
 }
 
-static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
+static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
                                      struct vmcs12 *vmcs12)
 {
        prepare_vmcs02_constant_state(vmx);
@@ -1984,17 +2010,14 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
        u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
 
        if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
-               prepare_vmcs02_early_full(vmx, vmcs12);
+               prepare_vmcs02_early_rare(vmx, vmcs12);
 
        /*
         * PIN CONTROLS
         */
-       exec_control = vmcs12->pin_based_vm_exec_control;
-
-       /* Preemption timer setting is computed directly in vmx_vcpu_run.  */
-       exec_control |= vmcs_config.pin_based_exec_ctrl;
-       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-       vmx->loaded_vmcs->hv_timer_armed = false;
+       exec_control = vmx_pin_based_exec_ctrl(vmx);
+       exec_control |= (vmcs12->pin_based_vm_exec_control &
+                        ~PIN_BASED_VMX_PREEMPTION_TIMER);
 
        /* Posted interrupts setting is only taken from vmcs12.  */
        if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -2003,7 +2026,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
        } else {
                exec_control &= ~PIN_BASED_POSTED_INTR;
        }
-       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
+       pin_controls_set(vmx, exec_control);
 
        /*
         * EXEC CONTROLS
@@ -2014,28 +2037,31 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
        exec_control &= ~CPU_BASED_TPR_SHADOW;
        exec_control |= vmcs12->cpu_based_vm_exec_control;
 
-       /*
-        * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
-        * nested_get_vmcs12_pages can't fix it up, the illegal value
-        * will result in a VM entry failure.
-        */
-       if (exec_control & CPU_BASED_TPR_SHADOW) {
-               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+       if (exec_control & CPU_BASED_TPR_SHADOW)
                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
-       } else {
 #ifdef CONFIG_X86_64
+       else
                exec_control |= CPU_BASED_CR8_LOAD_EXITING |
                                CPU_BASED_CR8_STORE_EXITING;
 #endif
-       }
 
        /*
         * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
         * for I/O port accesses.
         */
-       exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
        exec_control |= CPU_BASED_UNCOND_IO_EXITING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+       exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+
+       /*
+        * This bit will be computed in nested_get_vmcs12_pages, because
+        * we do not have access to L1's MSR bitmap yet.  For now, keep
+        * the same bit as before, hoping to avoid multiple VMWRITEs that
+        * only set/clear this bit.
+        */
+       exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+       exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
+
+       exec_controls_set(vmx, exec_control);
 
        /*
         * SECONDARY EXEC CONTROLS
@@ -2061,22 +2087,19 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                /* VMCS shadowing for L2 is emulated for now */
                exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
 
-               if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
-                       vmcs_write16(GUEST_INTR_STATUS,
-                               vmcs12->guest_intr_status);
-
                /*
-                * Write an illegal value to APIC_ACCESS_ADDR. Later,
-                * nested_get_vmcs12_pages will either fix it up or
-                * remove the VM execution control.
+                * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
+                * will not have to rewrite the controls just for this bit.
                 */
-               if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
-                       vmcs_write64(APIC_ACCESS_ADDR, -1ull);
+               if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
+                   (vmcs12->guest_cr4 & X86_CR4_UMIP))
+                       exec_control |= SECONDARY_EXEC_DESC;
 
-               if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
-                       vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+               if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+                       vmcs_write16(GUEST_INTR_STATUS,
+                               vmcs12->guest_intr_status);
 
-               vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+               secondary_exec_controls_set(vmx, exec_control);
        }
 
        /*
@@ -2095,7 +2118,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                if (guest_efer != host_efer)
                        exec_control |= VM_ENTRY_LOAD_IA32_EFER;
        }
-       vm_entry_controls_init(vmx, exec_control);
+       vm_entry_controls_set(vmx, exec_control);
 
        /*
         * EXIT CONTROLS
@@ -2107,17 +2130,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
        exec_control = vmx_vmexit_ctrl();
        if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
                exec_control |= VM_EXIT_LOAD_IA32_EFER;
-       vm_exit_controls_init(vmx, exec_control);
-
-       /*
-        * Conceptually we want to copy the PML address and index from
-        * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
-        * since we always flush the log on each vmexit and never change
-        * the PML address (once set), this happens to be equivalent to
-        * simply resetting the index in vmcs02.
-        */
-       if (enable_pml)
-               vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+       vm_exit_controls_set(vmx, exec_control);
 
        /*
         * Interrupt/Exception Fields
@@ -2138,7 +2151,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
        }
 }
 
-static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 {
        struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
 
@@ -2162,6 +2175,8 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
                vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
                vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+               vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+               vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
                vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
                vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
                vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
@@ -2198,6 +2213,10 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                        vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
                        vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
                }
+
+               if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
+                   (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+                       vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
        }
 
        if (nested_cpu_has_xsaves(vmcs12))
@@ -2233,14 +2252,6 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 
        set_cr4_guest_host_mask(vmx);
-
-       if (kvm_mpx_supported()) {
-               if (vmx->nested.nested_run_pending &&
-                       (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
-                       vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
-               else
-                       vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
-       }
 }
 
 /*
@@ -2259,20 +2270,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+       bool load_guest_pdptrs_vmcs12 = false;
 
-       if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
-               prepare_vmcs02_full(vmx, vmcs12);
+       if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
+               prepare_vmcs02_rare(vmx, vmcs12);
                vmx->nested.dirty_vmcs12 = false;
-       }
 
-       /*
-        * First, the fields that are shadowed.  This must be kept in sync
-        * with vmcs_shadow_fields.h.
-        */
-       if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
-                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
-               vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
-               vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+               load_guest_pdptrs_vmcs12 = !hv_evmcs ||
+                       !(hv_evmcs->hv_clean_fields &
+                         HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
        }
 
        if (vmx->nested.nested_run_pending &&
@@ -2283,6 +2289,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
                vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
        }
+       if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+               vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
        vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 
        /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
@@ -2372,6 +2381,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                                entry_failure_code))
                return -EINVAL;
 
+       /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
+       if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
+           is_pae_paging(vcpu)) {
+               vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+               vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+               vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+               vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+       }
+
        if (!enable_ept)
                vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
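Editor's note: the is_pae_paging() predicate used for the late PDPTR load above is introduced by this series, but its definition is not part of this diff. Judging from the vmx.c hunks further down, which replace the open-coded is_paging/is_pae/!is_long_mode test with it, it presumably lives in arch/x86/kvm/x86.h along these lines (hedged sketch only, built on the existing predicates):

static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
{
        return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
}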
 
@@ -2609,6 +2627,30 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
            !kvm_pat_valid(vmcs12->host_ia32_pat))
                return -EINVAL;
 
+       ia32e = (vmcs12->vm_exit_controls &
+                VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+
+       if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+           vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+           vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+           vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+           vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+           vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+           vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+           vmcs12->host_cs_selector == 0 ||
+           vmcs12->host_tr_selector == 0 ||
+           (vmcs12->host_ss_selector == 0 && !ia32e))
+               return -EINVAL;
+
+#ifdef CONFIG_X86_64
+       if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) ||
+           is_noncanonical_address(vmcs12->host_gs_base, vcpu) ||
+           is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) ||
+           is_noncanonical_address(vmcs12->host_idtr_base, vcpu) ||
+           is_noncanonical_address(vmcs12->host_tr_base, vcpu))
+               return -EINVAL;
+#endif
+
        /*
         * If the load IA32_EFER VM-exit control is 1, bits reserved in the
         * IA32_EFER MSR must be 0 in the field for that register. In addition,
@@ -2616,8 +2658,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
         * the host address-space size VM-exit control.
         */
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
-               ia32e = (vmcs12->vm_exit_controls &
-                        VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
                if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
                    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
                    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
@@ -2781,7 +2821,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
                [launched]"i"(offsetof(struct loaded_vmcs, launched)),
                [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
                [wordsize]"i"(sizeof(ulong))
-             : "cc", "memory"
+             : "memory"
        );
 
        if (vmx->msr_autoload.host.nr)
@@ -2851,18 +2891,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
                        hpa = page_to_phys(vmx->nested.apic_access_page);
                        vmcs_write64(APIC_ACCESS_ADDR, hpa);
                } else {
-                       vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
-                                       SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+                       secondary_exec_controls_clearbit(vmx,
+                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
                }
        }
 
        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
                map = &vmx->nested.virtual_apic_map;
 
-               /*
-                * If translation failed, VM entry will fail because
-                * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
-                */
                if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
                } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
@@ -2876,11 +2912,13 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
                         * _not_ what the processor does but it's basically the
                         * only possibility we have.
                         */
-                       vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
-                                       CPU_BASED_TPR_SHADOW);
+                       exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
                } else {
-                       printk("bad virtual-APIC page address\n");
-                       dump_vmcs();
+                       /*
+                        * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
+                        * force VM-Entry to fail.
+                        */
+                       vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
                }
        }
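Editor's note: writing -1ull above leans on the architectural validity checks for the virtual-APIC address, which must be 4 KiB aligned and must not set bits beyond the physical-address width; an all-ones value violates both, so the CPU fails the VM-Entry on its own. A small model of that rule (the helper name is invented for illustration):

static inline bool virtual_apic_addr_valid(u64 pa, int maxphyaddr)
{
        return !(pa & 0xfffull) &&      /* 4 KiB aligned */
               !(pa >> maxphyaddr);     /* within physical-address width */
}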
 
@@ -2896,11 +2934,9 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
                }
        }
        if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
-               vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
-                             CPU_BASED_USE_MSR_BITMAPS);
+               exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
        else
-               vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
-                               CPU_BASED_USE_MSR_BITMAPS);
+               exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
 }
 
 /*
@@ -2953,7 +2989,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
        u32 exit_reason = EXIT_REASON_INVALID_STATE;
        u32 exit_qual;
 
-       evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+       evaluate_pending_interrupts = exec_controls_get(vmx) &
                (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
        if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
                evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
@@ -2964,6 +3000,25 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
                !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
                vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
+       /*
+        * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
+        * nested early checks are disabled.  In the event of a "late" VM-Fail,
+        * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
+        * software model to the pre-VMEntry host state.  When EPT is disabled,
+        * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
+        * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
+        * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
+        * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
+        * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
+        * guaranteed to be overwritten with a shadow CR3 prior to re-entering
+        * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
+        * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
+        * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
+        * path would need to manually save/restore vmcs01.GUEST_CR3.
+        */
+       if (!enable_ept && !nested_early_check)
+               vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
+
        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
 
        prepare_vmcs02_early(vmx, vmcs12);
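Editor's note: the comment block above condenses to a single predicate; as a hypothetical helper (names mirror the module parameters used in the hunk):

/* Only when GUEST_CR3 holds a shadow CR3 (EPT off) and a hardware-detected
 * late VM-Fail is possible (early checks off) does the unwind path consume
 * vmcs01.GUEST_CR3, so only then must it be pre-loaded with L1's CR3. */
static inline bool must_stuff_vmcs01_guest_cr3(bool ept_on, bool early_checks)
{
        return !ept_on && !early_checks;
}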
@@ -3059,7 +3114,7 @@ vmentry_fail_vmexit:
        vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
        vmcs12->exit_qualification = exit_qual;
        if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
-               vmx->nested.need_vmcs12_sync = true;
+               vmx->nested.need_vmcs12_to_shadow_sync = true;
        return 1;
 }
 
@@ -3077,7 +3132,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
+       if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
                return 1;
 
        if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
@@ -3393,20 +3448,57 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
        return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
 }
 
-/*
- * Update the guest state fields of vmcs12 to reflect changes that
- * occurred while L2 was running. (The "IA-32e mode guest" bit of the
- * VM-entry controls is also updated, since this is really a guest
- * state bit.)
- */
-static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
-{
-       vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
-       vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+static bool is_vmcs12_ext_field(unsigned long field)
+{
+       switch (field) {
+       case GUEST_ES_SELECTOR:
+       case GUEST_CS_SELECTOR:
+       case GUEST_SS_SELECTOR:
+       case GUEST_DS_SELECTOR:
+       case GUEST_FS_SELECTOR:
+       case GUEST_GS_SELECTOR:
+       case GUEST_LDTR_SELECTOR:
+       case GUEST_TR_SELECTOR:
+       case GUEST_ES_LIMIT:
+       case GUEST_CS_LIMIT:
+       case GUEST_SS_LIMIT:
+       case GUEST_DS_LIMIT:
+       case GUEST_FS_LIMIT:
+       case GUEST_GS_LIMIT:
+       case GUEST_LDTR_LIMIT:
+       case GUEST_TR_LIMIT:
+       case GUEST_GDTR_LIMIT:
+       case GUEST_IDTR_LIMIT:
+       case GUEST_ES_AR_BYTES:
+       case GUEST_DS_AR_BYTES:
+       case GUEST_FS_AR_BYTES:
+       case GUEST_GS_AR_BYTES:
+       case GUEST_LDTR_AR_BYTES:
+       case GUEST_TR_AR_BYTES:
+       case GUEST_ES_BASE:
+       case GUEST_CS_BASE:
+       case GUEST_SS_BASE:
+       case GUEST_DS_BASE:
+       case GUEST_FS_BASE:
+       case GUEST_GS_BASE:
+       case GUEST_LDTR_BASE:
+       case GUEST_TR_BASE:
+       case GUEST_GDTR_BASE:
+       case GUEST_IDTR_BASE:
+       case GUEST_PENDING_DBG_EXCEPTIONS:
+       case GUEST_BNDCFGS:
+               return true;
+       default:
+               break;
+       }
 
-       vmcs12->guest_rsp = kvm_rsp_read(vcpu);
-       vmcs12->guest_rip = kvm_rip_read(vcpu);
-       vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+       return false;
+}
+
+static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+                                      struct vmcs12 *vmcs12)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
        vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
@@ -3427,8 +3519,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
        vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
        vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
-       vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
-       vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
        vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
        vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
        vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
@@ -3444,11 +3534,69 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
        vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
        vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+       vmcs12->guest_pending_dbg_exceptions =
+               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+       if (kvm_mpx_supported())
+               vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+
+       vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
+}
+
+static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+                                      struct vmcs12 *vmcs12)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int cpu;
+
+       if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
+               return;
+
+       WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
+
+       cpu = get_cpu();
+       vmx->loaded_vmcs = &vmx->nested.vmcs02;
+       vmx_vcpu_load(&vmx->vcpu, cpu);
+
+       sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+       vmx->loaded_vmcs = &vmx->vmcs01;
+       vmx_vcpu_load(&vmx->vcpu, cpu);
+       put_cpu();
+}
+
+/*
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
+ */
+static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (vmx->nested.hv_evmcs)
+               sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+       vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
+
+       vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+       vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+       vmcs12->guest_rsp = kvm_rsp_read(vcpu);
+       vmcs12->guest_rip = kvm_rip_read(vcpu);
+       vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+       vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+       vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+
+       vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+       vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+       vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
 
        vmcs12->guest_interruptibility_info =
                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-       vmcs12->guest_pending_dbg_exceptions =
-               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
        if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
                vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
        else
@@ -3469,10 +3617,12 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         */
        if (enable_ept) {
                vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
-               vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
-               vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
-               vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
-               vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+               if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
+                       vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+                       vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+                       vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+                       vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+               }
        }
 
        vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
@@ -3484,22 +3634,11 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
                (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
 
-       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
+       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
                kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
-               vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-       }
 
-       /* TODO: These cannot have changed unless we have MSR bitmaps and
-        * the relevant bit asks not to trap the change */
-       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
-               vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
                vmcs12->guest_ia32_efer = vcpu->arch.efer;
-       vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
-       vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
-       vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-       if (kvm_mpx_supported())
-               vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 }
 
 /*
@@ -3517,11 +3656,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                           u32 exit_reason, u32 exit_intr_info,
                           unsigned long exit_qualification)
 {
-       /* update guest state fields: */
-       sync_vmcs12(vcpu, vmcs12);
-
        /* update exit information fields: */
-
        vmcs12->vm_exit_reason = exit_reason;
        vmcs12->exit_qualification = exit_qualification;
        vmcs12->vm_exit_intr_info = exit_intr_info;
@@ -3775,18 +3910,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
        vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
 
        nested_ept_uninit_mmu_context(vcpu);
-
-       /*
-        * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3
-        * points to shadow pages!  Fortunately we only get here after a WARN_ON
-        * if EPT is disabled, so a VMabort is perfectly fine.
-        */
-       if (enable_ept) {
-               vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-               __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-       } else {
-               nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED);
-       }
+       vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 
        /*
         * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -3794,7 +3919,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
         * VMFail, like everything else we just need to ensure our
         * software model is up-to-date.
         */
-       ept_save_pdptrs(vcpu);
+       if (enable_ept)
+               ept_save_pdptrs(vcpu);
 
        kvm_mmu_reset_context(vcpu);
 
@@ -3882,14 +4008,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
 
        if (likely(!vmx->fail)) {
-               if (exit_reason == -1)
-                       sync_vmcs12(vcpu, vmcs12);
-               else
+               sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+
+               if (exit_reason != -1)
                        prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
                                       exit_qualification);
 
                /*
-                * Must happen outside of sync_vmcs12() as it will
+                * Must happen outside of sync_vmcs02_to_vmcs12() as it will
                 * also be used to capture vmcs12 cache as part of
                 * capturing nVMX state for snapshot (migration).
                 *
@@ -3945,7 +4071,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
        if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
-               vmx->nested.need_vmcs12_sync = true;
+               vmx->nested.need_vmcs12_to_shadow_sync = true;
 
        /* in case we halted in L2 */
        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4008,7 +4134,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
  * #UD or #GP.
  */
 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
-                       u32 vmx_instruction_info, bool wr, gva_t *ret)
+                       u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
 {
        gva_t off;
        bool exn;
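Editor's note: the new len parameter feeds the segment-limit check updated in the next hunk, which now tests the last byte actually accessed instead of always assuming an 8-byte operand. A standalone sketch with a worked example:

#include <stdbool.h>
#include <stdint.h>

/* Reject only if a byte past the limit would really be touched. */
static bool exceeds_segment_limit(uint64_t off, int len, uint32_t limit)
{
        return off + len - 1 > limit;
}

/* Example: a 4-byte access at off == limit - 3 now passes
 * (off + 4 - 1 == limit), whereas the old "off + sizeof(u64) > limit" test
 * rejected it even though no byte beyond the limit is read or written. */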
@@ -4115,7 +4241,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
                 */
                if (!(s.base == 0 && s.limit == 0xffffffff &&
                     ((s.type & 8) || !(s.type & 4))))
-                       exn = exn || (off + sizeof(u64) > s.limit);
+                       exn = exn || ((u64)off + len - 1 > s.limit);
        }
        if (exn) {
                kvm_queue_exception_e(vcpu,
@@ -4134,7 +4260,8 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
        struct x86_exception e;
 
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-                       vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
+                               vmcs_read32(VMX_INSTRUCTION_INFO), false,
+                               sizeof(*vmpointer), &gva))
                return 1;
 
        if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
@@ -4300,11 +4427,13 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
        if (vmx->nested.current_vmptr == -1ull)
                return;
 
+       copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+
        if (enable_shadow_vmcs) {
                /* copy to memory all shadowed fields in case
                   they were modified */
                copy_shadow_to_vmcs12(vmx);
-               vmx->nested.need_vmcs12_sync = false;
+               vmx->nested.need_vmcs12_to_shadow_sync = false;
                vmx_disable_shadow_vmcs(vmx);
        }
        vmx->nested.posted_intr_nv = -1;
@@ -4334,6 +4463,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 zero = 0;
        gpa_t vmptr;
+       u64 evmcs_gpa;
 
        if (!nested_vmx_check_permission(vcpu))
                return 1;
@@ -4349,10 +4479,18 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
                return nested_vmx_failValid(vcpu,
                        VMXERR_VMCLEAR_VMXON_POINTER);
 
-       if (vmx->nested.hv_evmcs_map.hva) {
-               if (vmptr == vmx->nested.hv_evmcs_vmptr)
-                       nested_release_evmcs(vcpu);
-       } else {
+       /*
+        * When Enlightened VMEntry is enabled on the calling CPU we treat
+        * memory area pointed to by vmptr as Enlightened VMCS (as there's no good
+        * way to distinguish it from VMCS12) and we must not corrupt it by
+        * writing to the non-existent 'launch_state' field. The area doesn't
+        * have to be the currently active EVMCS on the calling CPU and there's
+        * nothing KVM has to do to transition it from 'active' to 'non-active'
+        * state. It is possible that the area will stay mapped as
+        * vmx->nested.hv_evmcs but this shouldn't be a problem.
+        */
+       if (likely(!vmx->nested.enlightened_vmcs_enabled ||
+                  !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
                if (vmptr == vmx->nested.current_vmptr)
                        nested_release_vmcs12(vcpu);
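Editor's note: distilling the comment above into a predicate (the helper is hypothetical, shown only to make the decision explicit): the operand may be treated as an ordinary vmcs12, and have its launch_state cleared, only when an enlightened VMCS cannot be in use on this vCPU, because an eVMCS has no launch_state field to write.

static inline bool vmclear_may_write_launch_state(bool evmcs_enabled,
                                                  bool evmcs_entry_active)
{
        return !evmcs_enabled || !evmcs_entry_active;
}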
 
@@ -4386,8 +4524,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
        u64 field_value;
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+       int len;
        gva_t gva = 0;
        struct vmcs12 *vmcs12;
+       short offset;
 
        if (!nested_vmx_check_permission(vcpu))
                return 1;
@@ -4409,11 +4549,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 
        /* Decode instruction info and find the field to read */
        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
-       /* Read the field, zero-extended to a u64 field_value */
-       if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
+
+       offset = vmcs_field_to_offset(field);
+       if (offset < 0)
                return nested_vmx_failValid(vcpu,
                        VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 
+       if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+               copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+       /* Read the field, zero-extended to a u64 field_value */
+       field_value = vmcs12_read_any(vmcs12, field, offset);
+
        /*
         * Now copy part of this value to register or memory, as requested.
         * Note that the number of bits actually copied is 32 or 64 depending
@@ -4423,21 +4570,45 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
                        field_value);
        } else {
+               len = is_64_bit_mode(vcpu) ? 8 : 4;
                if (get_vmx_mem_address(vcpu, exit_qualification,
-                               vmx_instruction_info, true, &gva))
+                               vmx_instruction_info, true, len, &gva))
                        return 1;
                /* _system ok, nested_vmx_check_permission has verified cpl=0 */
-               kvm_write_guest_virt_system(vcpu, gva, &field_value,
-                                           (is_long_mode(vcpu) ? 8 : 4), NULL);
+               kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL);
        }
 
        return nested_vmx_succeed(vcpu);
 }
 
+static bool is_shadow_field_rw(unsigned long field)
+{
+       switch (field) {
+#define SHADOW_FIELD_RW(x, y) case x:
+#include "vmcs_shadow_fields.h"
+               return true;
+       default:
+               break;
+       }
+       return false;
+}
+
+static bool is_shadow_field_ro(unsigned long field)
+{
+       switch (field) {
+#define SHADOW_FIELD_RO(x, y) case x:
+#include "vmcs_shadow_fields.h"
+               return true;
+       default:
+               break;
+       }
+       return false;
+}
 
 static int handle_vmwrite(struct kvm_vcpu *vcpu)
 {
        unsigned long field;
+       int len;
        gva_t gva;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4452,6 +4623,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
        u64 field_value = 0;
        struct x86_exception e;
        struct vmcs12 *vmcs12;
+       short offset;
 
        if (!nested_vmx_check_permission(vcpu))
                return 1;
@@ -4463,11 +4635,11 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                field_value = kvm_register_readl(vcpu,
                        (((vmx_instruction_info) >> 3) & 0xf));
        else {
+               len = is_64_bit_mode(vcpu) ? 8 : 4;
                if (get_vmx_mem_address(vcpu, exit_qualification,
-                               vmx_instruction_info, false, &gva))
+                               vmx_instruction_info, false, len, &gva))
                        return 1;
-               if (kvm_read_guest_virt(vcpu, gva, &field_value,
-                                       (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
+               if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
                        kvm_inject_page_fault(vcpu, &e);
                        return 1;
                }
@@ -4484,9 +4656,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                return nested_vmx_failValid(vcpu,
                        VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
 
-       if (!is_guest_mode(vcpu))
+       if (!is_guest_mode(vcpu)) {
                vmcs12 = get_vmcs12(vcpu);
-       else {
+
+               /*
+                * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
+                * vmcs12, else we may crush a field or consume a stale value.
+                */
+               if (!is_shadow_field_rw(field))
+                       copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+       } else {
                /*
                 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
                 * to shadowed-field sets the ALU flags for VMfailInvalid.
@@ -4496,28 +4675,46 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                vmcs12 = get_shadow_vmcs12(vcpu);
        }
 
-       if (vmcs12_write_any(vmcs12, field, field_value) < 0)
+       offset = vmcs_field_to_offset(field);
+       if (offset < 0)
                return nested_vmx_failValid(vcpu,
                        VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 
        /*
-        * Do not track vmcs12 dirty-state if in guest-mode
-        * as we actually dirty shadow vmcs12 instead of vmcs12.
+        * Some Intel CPUs intentionally drop the reserved bits of the AR byte
+        * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
+        * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
+        * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
+        * from L1 will return a different value than VMREAD from L2 (L1 sees
+        * the stripped down value, L2 sees the full value as stored by KVM).
         */
-       if (!is_guest_mode(vcpu)) {
-               switch (field) {
-#define SHADOW_FIELD_RW(x) case x:
-#include "vmcs_shadow_fields.h"
-                       /*
-                        * The fields that can be updated by L1 without a vmexit are
-                        * always updated in the vmcs02, the others go down the slow
-                        * path of prepare_vmcs02.
-                        */
-                       break;
-               default:
-                       vmx->nested.dirty_vmcs12 = true;
-                       break;
+       if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
+               field_value &= 0x1f0ff;
+
+       vmcs12_write_any(vmcs12, field, offset, field_value);
+
+       /*
+        * Do not track vmcs12 dirty-state if in guest-mode as we actually
+        * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
+        * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
+        * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
+        */
+       if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
+               /*
+                * L1 can read these fields without exiting, ensure the
+                * shadow VMCS is up-to-date.
+                */
+               if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
+                       preempt_disable();
+                       vmcs_load(vmx->vmcs01.shadow_vmcs);
+
+                       __vmcs_writel(field, field_value);
+
+                       vmcs_clear(vmx->vmcs01.shadow_vmcs);
+                       vmcs_load(vmx->loaded_vmcs->vmcs);
+                       preempt_enable();
                }
+               vmx->nested.dirty_vmcs12 = true;
        }
 
        return nested_vmx_succeed(vcpu);
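Editor's note: the 0x1f0ff mask above keeps exactly the architecturally defined access-rights bits of a segment register and drops the reserved ones (bits 11:8 and 31:17). A self-contained sketch spelling that out (names and constants here are illustrative, not KVM's):

#include <stdint.h>

/* Bits 3:0 type, 4 S, 6:5 DPL, 7 P, 12 AVL, 13 L, 14 D/B, 15 G,
 * 16 unusable; everything else is reserved and stripped. */
static uint32_t strip_reserved_ar_bits(uint32_t ar)
{
        const uint32_t defined = 0x000ffu       /* type, S, DPL, P */
                               | 0x1f000u;      /* AVL, L, D/B, G, unusable */

        return ar & defined;                    /* == ar & 0x1f0ff */
}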
@@ -4527,11 +4724,10 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
 {
        vmx->nested.current_vmptr = vmptr;
        if (enable_shadow_vmcs) {
-               vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
-                             SECONDARY_EXEC_SHADOW_VMCS);
+               secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
                vmcs_write64(VMCS_LINK_POINTER,
                             __pa(vmx->vmcs01.shadow_vmcs));
-               vmx->nested.need_vmcs12_sync = true;
+               vmx->nested.need_vmcs12_to_shadow_sync = true;
        }
        vmx->nested.dirty_vmcs12 = true;
 }
@@ -4615,7 +4811,8 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
                return 1;
 
-       if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
+       if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
+                               true, sizeof(gpa_t), &gva))
                return 1;
        /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
        if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
@@ -4661,7 +4858,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
         * operand is read even if it isn't needed (e.g., for type==global)
         */
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-                       vmx_instruction_info, false, &gva))
+                       vmx_instruction_info, false, sizeof(operand), &gva))
                return 1;
        if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                kvm_inject_page_fault(vcpu, &e);
@@ -4670,13 +4867,11 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 
        switch (type) {
        case VMX_EPT_EXTENT_GLOBAL:
+       case VMX_EPT_EXTENT_CONTEXT:
        /*
-        * TODO: track mappings and invalidate
-        * single context requests appropriately
+        * TODO: Sync the necessary shadow EPT roots here, rather than
+        * at the next emulated VM-entry.
         */
-       case VMX_EPT_EXTENT_CONTEXT:
-               kvm_mmu_sync_roots(vcpu);
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
                break;
        default:
                BUG_ON(1);
@@ -4723,7 +4918,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
         * operand is read even if it isn't needed (e.g., for type==global)
         */
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-                       vmx_instruction_info, false, &gva))
+                       vmx_instruction_info, false, sizeof(operand), &gva))
                return 1;
        if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                kvm_inject_page_fault(vcpu, &e);
@@ -5284,12 +5479,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
         * When running L2, the authoritative vmcs12 state is in the
         * vmcs02. When running L1, the authoritative vmcs12 state is
         * in the shadow or enlightened vmcs linked to vmcs01, unless
-        * need_vmcs12_sync is set, in which case, the authoritative
+        * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
         * vmcs12 state is in the vmcs12 already.
         */
        if (is_guest_mode(vcpu)) {
-               sync_vmcs12(vcpu, vmcs12);
-       } else if (!vmx->nested.need_vmcs12_sync) {
+               sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+               sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+       } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
                if (vmx->nested.hv_evmcs)
                        copy_enlightened_to_vmcs12(vmx);
                else if (enable_shadow_vmcs)
@@ -5411,7 +5607,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
                 * Sync eVMCS upon entry as we may not have
                 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
                 */
-               vmx->nested.need_vmcs12_sync = true;
+               vmx->nested.need_vmcs12_to_shadow_sync = true;
        } else {
                return -EINVAL;
        }
@@ -5479,14 +5675,8 @@ error_guest_mode:
 void nested_vmx_vcpu_setup(void)
 {
        if (enable_shadow_vmcs) {
-               /*
-                * At vCPU creation, "VMWRITE to any supported field
-                * in the VMCS" is supported, so use the more
-                * permissive vmx_vmread_bitmap to specify both read
-                * and write permissions for the shadow VMCS.
-                */
                vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
-               vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
+               vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
        }
 }
 
@@ -5616,10 +5806,15 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
        msrs->secondary_ctls_low = 0;
        msrs->secondary_ctls_high &=
                SECONDARY_EXEC_DESC |
+               SECONDARY_EXEC_RDTSCP |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+               SECONDARY_EXEC_WBINVD_EXITING |
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-               SECONDARY_EXEC_WBINVD_EXITING;
+               SECONDARY_EXEC_RDRAND_EXITING |
+               SECONDARY_EXEC_ENABLE_INVPCID |
+               SECONDARY_EXEC_RDSEED_EXITING |
+               SECONDARY_EXEC_XSAVES;
 
        /*
         * We can emulate "VMCS shadowing," even if the hardware
@@ -5739,14 +5934,6 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 {
        int i;
 
-       /*
-        * Without EPT it is not possible to restore L1's CR3 and PDPTR on
-        * VMfail, because they are not available in vmcs01.  Just always
-        * use hardware checks.
-        */
-       if (!enable_ept)
-               nested_early_check = 1;
-
        if (!cpu_has_vmx_shadow_vmcs())
                enable_shadow_vmcs = 0;
        if (enable_shadow_vmcs) {
index e847ff1019a29628d9e2e3fb71e3344e1f51d3b2..187d39bf0bf10ca178cc3f1a7b45e18b7000ccbe 100644 (file)
@@ -17,11 +17,11 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry);
 bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                       u32 exit_intr_info, unsigned long exit_qualification);
-void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu);
+void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
-                       u32 vmx_instruction_info, bool wr, gva_t *ret);
+                       u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
 
 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 {
index b8e50f76fefcb818c1e82f83435447bbcf1e3228..2200fb698dd0fed2ed62576ca08e2232158ae274 100644 (file)
@@ -146,7 +146,6 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value)
 
        __vmcs_writel(field, value);
 #ifndef CONFIG_X86_64
-       asm volatile ("");
        __vmcs_writel(field+1, value >> 32);
 #endif
 }
index cb6079f8a227f4f63aa38e2da493e49a6e44764b..481ad879197b867f2e46927a8c5c5cbf501918d4 100644 (file)
@@ -42,6 +42,14 @@ struct vmcs_host_state {
 #endif
 };
 
+struct vmcs_controls_shadow {
+       u32 vm_entry;
+       u32 vm_exit;
+       u32 pin;
+       u32 exec;
+       u32 secondary_exec;
+};
+
 /*
  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
@@ -53,7 +61,7 @@ struct loaded_vmcs {
        int cpu;
        bool launched;
        bool nmi_known_unmasked;
-       bool hv_timer_armed;
+       bool hv_timer_soft_disabled;
        /* Support for vnmi-less CPUs */
        int soft_vnmi_blocked;
        ktime_t entry_time;
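Editor's note: the accessors used throughout the nested.c hunks above (exec_controls_setbit(), vm_exit_controls_set(), secondary_exec_controls_clearbit(), ...) are built on this vmcs_controls_shadow cache together with the controls_shadow member added just below. The generating macro lives in vmx.h and is not part of this excerpt, so treat the following as a hedged reconstruction of its shape rather than the exact code:

#define BUILD_CONTROLS_SHADOW(lname, uname)                                 \
static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val)     \
{                                                                           \
        if (vmx->loaded_vmcs->controls_shadow.lname != val) {               \
                vmcs_write32(uname, val);                                   \
                vmx->loaded_vmcs->controls_shadow.lname = val;              \
        }                                                                   \
}                                                                           \
static inline u32 lname##_controls_get(struct vcpu_vmx *vmx)                \
{                                                                           \
        return vmx->loaded_vmcs->controls_shadow.lname;                     \
}                                                                           \
static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val)   \
{                                                                           \
        lname##_controls_set(vmx, lname##_controls_get(vmx) | val);         \
}                                                                           \
static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \
{                                                                           \
        lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val);        \
}

/* e.g. BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) would yield
 * the exec_controls_*() helpers seen in nested.c. */

The point of the cache: setbit/clearbit become a read-modify-write of a plain u32, reads never need a VMREAD, and a VMWRITE is issued only when the value actually changes.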
@@ -61,6 +69,7 @@ struct loaded_vmcs {
        unsigned long *msr_bitmap;
        struct list_head loaded_vmcss_on_cpu_link;
        struct vmcs_host_state host_state;
+       struct vmcs_controls_shadow controls_shadow;
 };
 
 static inline bool is_exception_n(u32 intr_info, u8 vector)
@@ -115,6 +124,12 @@ static inline bool is_nmi(u32 intr_info)
                == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
 }
 
+static inline bool is_external_intr(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
+               == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR);
+}
+
 enum vmcs_field_width {
        VMCS_FIELD_WIDTH_U16 = 0,
        VMCS_FIELD_WIDTH_U64 = 1,
index 337718fc8a36f9359d52c1633bebc49eab56874d..d0c6df373f6765360318027a6fbd781c519bcad5 100644 (file)
@@ -395,69 +395,48 @@ static inline short vmcs_field_to_offset(unsigned long field)
 
 #undef ROL16
 
-/*
- * Read a vmcs12 field. Since these can have varying lengths and we return
- * one type, we chose the biggest type (u64) and zero-extend the return value
- * to that size. Note that the caller, handle_vmread, might need to use only
- * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
- * 64-bit fields are to be returned).
- */
-static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
-                                 unsigned long field, u64 *ret)
+static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field,
+                                 u16 offset)
 {
-       short offset = vmcs_field_to_offset(field);
-       char *p;
-
-       if (offset < 0)
-               return offset;
-
-       p = (char *)vmcs12 + offset;
+       char *p = (char *)vmcs12 + offset;
 
        switch (vmcs_field_width(field)) {
        case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
-               *ret = *((natural_width *)p);
-               return 0;
+               return *((natural_width *)p);
        case VMCS_FIELD_WIDTH_U16:
-               *ret = *((u16 *)p);
-               return 0;
+               return *((u16 *)p);
        case VMCS_FIELD_WIDTH_U32:
-               *ret = *((u32 *)p);
-               return 0;
+               return *((u32 *)p);
        case VMCS_FIELD_WIDTH_U64:
-               *ret = *((u64 *)p);
-               return 0;
+               return *((u64 *)p);
        default:
-               WARN_ON(1);
-               return -ENOENT;
+               WARN_ON_ONCE(1);
+               return -1;
        }
 }
 
-static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
-                                  unsigned long field, u64 field_value){
-       short offset = vmcs_field_to_offset(field);
+static inline void vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field,
+                                   u16 offset, u64 field_value)
+{
        char *p = (char *)vmcs12 + offset;
 
-       if (offset < 0)
-               return offset;
-
        switch (vmcs_field_width(field)) {
        case VMCS_FIELD_WIDTH_U16:
                *(u16 *)p = field_value;
-               return 0;
+               break;
        case VMCS_FIELD_WIDTH_U32:
                *(u32 *)p = field_value;
-               return 0;
+               break;
        case VMCS_FIELD_WIDTH_U64:
                *(u64 *)p = field_value;
-               return 0;
+               break;
        case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
                *(natural_width *)p = field_value;
-               return 0;
+               break;
        default:
-               WARN_ON(1);
-               return -ENOENT;
+               WARN_ON_ONCE(1);
+               break;
        }
-
 }
 
 #endif /* __KVM_X86_VMX_VMCS12_H */
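Editor's note: with the offset passed in, validation moves to the callers (see handle_vmread()/handle_vmwrite() above) and the accessors become infallible. A minimal caller sketch under that assumption (the wrapper name is made up):

static int vmcs12_read_checked(struct vmcs12 *vmcs12, unsigned long field,
                               u64 *value)
{
        short offset = vmcs_field_to_offset(field);

        if (offset < 0)
                return -ENOENT;         /* unsupported VMCS component */

        *value = vmcs12_read_any(vmcs12, field, offset);
        return 0;
}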
index 132432f375c2c826f259c4938eae093f9d7774ba..eb1ecd16fd220a06c26dc7df1f69b805753aa923 100644 (file)
@@ -1,8 +1,12 @@
+#if !defined(SHADOW_FIELD_RO) && !defined(SHADOW_FIELD_RW)
+BUILD_BUG_ON(1)
+#endif
+
 #ifndef SHADOW_FIELD_RO
-#define SHADOW_FIELD_RO(x)
+#define SHADOW_FIELD_RO(x, y)
 #endif
 #ifndef SHADOW_FIELD_RW
-#define SHADOW_FIELD_RW(x)
+#define SHADOW_FIELD_RW(x, y)
 #endif
 
 /*
  */
 
 /* 16-bits */
-SHADOW_FIELD_RW(GUEST_INTR_STATUS)
-SHADOW_FIELD_RW(GUEST_PML_INDEX)
-SHADOW_FIELD_RW(HOST_FS_SELECTOR)
-SHADOW_FIELD_RW(HOST_GS_SELECTOR)
+SHADOW_FIELD_RW(GUEST_INTR_STATUS, guest_intr_status)
+SHADOW_FIELD_RW(GUEST_PML_INDEX, guest_pml_index)
+SHADOW_FIELD_RW(HOST_FS_SELECTOR, host_fs_selector)
+SHADOW_FIELD_RW(HOST_GS_SELECTOR, host_gs_selector)
 
 /* 32-bits */
-SHADOW_FIELD_RO(VM_EXIT_REASON)
-SHADOW_FIELD_RO(VM_EXIT_INTR_INFO)
-SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN)
-SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD)
-SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE)
-SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE)
-SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL)
-SHADOW_FIELD_RW(EXCEPTION_BITMAP)
-SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE)
-SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD)
-SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN)
-SHADOW_FIELD_RW(TPR_THRESHOLD)
-SHADOW_FIELD_RW(GUEST_CS_AR_BYTES)
-SHADOW_FIELD_RW(GUEST_SS_AR_BYTES)
-SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO)
-SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE)
+SHADOW_FIELD_RO(VM_EXIT_REASON, vm_exit_reason)
+SHADOW_FIELD_RO(VM_EXIT_INTR_INFO, vm_exit_intr_info)
+SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len)
+SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field)
+SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code)
+SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code)
+SHADOW_FIELD_RO(GUEST_CS_AR_BYTES, guest_cs_ar_bytes)
+SHADOW_FIELD_RO(GUEST_SS_AR_BYTES, guest_ss_ar_bytes)
+SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control)
+SHADOW_FIELD_RW(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control)
+SHADOW_FIELD_RW(EXCEPTION_BITMAP, exception_bitmap)
+SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code)
+SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field)
+SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len)
+SHADOW_FIELD_RW(TPR_THRESHOLD, tpr_threshold)
+SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info)
+SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value)
 
 /* Natural width */
-SHADOW_FIELD_RO(EXIT_QUALIFICATION)
-SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS)
-SHADOW_FIELD_RW(GUEST_RIP)
-SHADOW_FIELD_RW(GUEST_RSP)
-SHADOW_FIELD_RW(GUEST_CR0)
-SHADOW_FIELD_RW(GUEST_CR3)
-SHADOW_FIELD_RW(GUEST_CR4)
-SHADOW_FIELD_RW(GUEST_RFLAGS)
-SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK)
-SHADOW_FIELD_RW(CR0_READ_SHADOW)
-SHADOW_FIELD_RW(CR4_READ_SHADOW)
-SHADOW_FIELD_RW(HOST_FS_BASE)
-SHADOW_FIELD_RW(HOST_GS_BASE)
+SHADOW_FIELD_RO(EXIT_QUALIFICATION, exit_qualification)
+SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS, guest_linear_address)
+SHADOW_FIELD_RW(GUEST_RIP, guest_rip)
+SHADOW_FIELD_RW(GUEST_RSP, guest_rsp)
+SHADOW_FIELD_RW(GUEST_CR0, guest_cr0)
+SHADOW_FIELD_RW(GUEST_CR3, guest_cr3)
+SHADOW_FIELD_RW(GUEST_CR4, guest_cr4)
+SHADOW_FIELD_RW(GUEST_RFLAGS, guest_rflags)
+SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK, cr0_guest_host_mask)
+SHADOW_FIELD_RW(CR0_READ_SHADOW, cr0_read_shadow)
+SHADOW_FIELD_RW(CR4_READ_SHADOW, cr4_read_shadow)
+SHADOW_FIELD_RW(HOST_FS_BASE, host_fs_base)
+SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base)
 
 /* 64-bit */
-SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS)
-SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH)
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address)
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address)
 
 #undef SHADOW_FIELD_RO
 #undef SHADOW_FIELD_RW
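Editor's note: the new second macro argument names the corresponding struct vmcs12 member, so consumers can generate per-field code instead of only switch cases like the is_shadow_field_rw()/_ro() helpers added in nested.c above. A hypothetical consumer, shown purely to illustrate the two-argument X-macro pattern (read_shadow_field() is a stand-in, not a real KVM API):

static void copy_rw_shadow_fields_to_vmcs12(struct vmcs12 *vmcs12)
{
#define SHADOW_FIELD_RW(encoding, name) \
        vmcs12->name = read_shadow_field(encoding);
#include "vmcs_shadow_fields.h"
}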
index d98eac371c0aea071d90bd9cce713b73b13f3127..69536553446dbcb54be8e69176fc95c73c20a575 100644 (file)
@@ -389,6 +389,7 @@ static const struct kvm_vmx_segment_field {
 };
 
 u64 host_efer;
+static unsigned long host_idt_base;
 
 /*
  * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
@@ -1035,6 +1036,33 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
        wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
 }
 
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+                       unsigned long fs_base, unsigned long gs_base)
+{
+       if (unlikely(fs_sel != host->fs_sel)) {
+               if (!(fs_sel & 7))
+                       vmcs_write16(HOST_FS_SELECTOR, fs_sel);
+               else
+                       vmcs_write16(HOST_FS_SELECTOR, 0);
+               host->fs_sel = fs_sel;
+       }
+       if (unlikely(gs_sel != host->gs_sel)) {
+               if (!(gs_sel & 7))
+                       vmcs_write16(HOST_GS_SELECTOR, gs_sel);
+               else
+                       vmcs_write16(HOST_GS_SELECTOR, 0);
+               host->gs_sel = gs_sel;
+       }
+       if (unlikely(fs_base != host->fs_base)) {
+               vmcs_writel(HOST_FS_BASE, fs_base);
+               host->fs_base = fs_base;
+       }
+       if (unlikely(gs_base != host->gs_base)) {
+               vmcs_writel(HOST_GS_BASE, gs_base);
+               host->gs_base = gs_base;
+       }
+}
+
 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1053,20 +1081,18 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
         * when guest state is loaded. This happens when guest transitions
         * to/from long-mode by setting MSR_EFER.LMA.
         */
-       if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
-               vmx->guest_msrs_dirty = false;
+       if (!vmx->guest_msrs_ready) {
+               vmx->guest_msrs_ready = true;
                for (i = 0; i < vmx->save_nmsrs; ++i)
                        kvm_set_shared_msr(vmx->guest_msrs[i].index,
                                           vmx->guest_msrs[i].data,
                                           vmx->guest_msrs[i].mask);
 
        }
-
-       if (vmx->loaded_cpu_state)
+       if (vmx->guest_state_loaded)
                return;
 
-       vmx->loaded_cpu_state = vmx->loaded_vmcs;
-       host_state = &vmx->loaded_cpu_state->host_state;
+       host_state = &vmx->loaded_vmcs->host_state;
 
        /*
         * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
@@ -1100,42 +1126,20 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
        gs_base = segment_base(gs_sel);
 #endif
 
-       if (unlikely(fs_sel != host_state->fs_sel)) {
-               if (!(fs_sel & 7))
-                       vmcs_write16(HOST_FS_SELECTOR, fs_sel);
-               else
-                       vmcs_write16(HOST_FS_SELECTOR, 0);
-               host_state->fs_sel = fs_sel;
-       }
-       if (unlikely(gs_sel != host_state->gs_sel)) {
-               if (!(gs_sel & 7))
-                       vmcs_write16(HOST_GS_SELECTOR, gs_sel);
-               else
-                       vmcs_write16(HOST_GS_SELECTOR, 0);
-               host_state->gs_sel = gs_sel;
-       }
-       if (unlikely(fs_base != host_state->fs_base)) {
-               vmcs_writel(HOST_FS_BASE, fs_base);
-               host_state->fs_base = fs_base;
-       }
-       if (unlikely(gs_base != host_state->gs_base)) {
-               vmcs_writel(HOST_GS_BASE, gs_base);
-               host_state->gs_base = gs_base;
-       }
+       vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
+       vmx->guest_state_loaded = true;
 }
 
 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 {
        struct vmcs_host_state *host_state;
 
-       if (!vmx->loaded_cpu_state)
+       if (!vmx->guest_state_loaded)
                return;
 
-       WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
-       host_state = &vmx->loaded_cpu_state->host_state;
+       host_state = &vmx->loaded_vmcs->host_state;
 
        ++vmx->vcpu.stat.host_state_reload;
-       vmx->loaded_cpu_state = NULL;
 
 #ifdef CONFIG_X86_64
        rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
@@ -1161,13 +1165,15 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
        load_fixmap_gdt(raw_smp_processor_id());
+       vmx->guest_state_loaded = false;
+       vmx->guest_msrs_ready = false;
 }
 
 #ifdef CONFIG_X86_64
 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
 {
        preempt_disable();
-       if (vmx->loaded_cpu_state)
+       if (vmx->guest_state_loaded)
                rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
        preempt_enable();
        return vmx->msr_guest_kernel_gs_base;
@@ -1176,7 +1182,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
 {
        preempt_disable();
-       if (vmx->loaded_cpu_state)
+       if (vmx->guest_state_loaded)
                wrmsrl(MSR_KERNEL_GS_BASE, data);
        preempt_enable();
        vmx->msr_guest_kernel_gs_base = data;
@@ -1225,11 +1231,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
                pi_set_on(pi_desc);
 }
 
-/*
- * Switches to specified vcpu, until a matching vcpu_put(), but assumes
- * vcpu mutex is already taken.
- */
-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
@@ -1290,8 +1292,20 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        if (kvm_has_tsc_control &&
            vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
                decache_tsc_multiplier(vmx);
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ */
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       vmx_vcpu_load_vmcs(vcpu, cpu);
 
        vmx_vcpu_pi_load(vcpu, cpu);
+
        vmx->host_pkru = read_pkru();
        vmx->host_debugctlmsr = get_debugctlmsr();
 }
@@ -1310,7 +1324,7 @@ static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
                pi_set_sn(pi_desc);
 }
 
-void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
        vmx_vcpu_pi_put(vcpu);
 
@@ -1579,7 +1593,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                move_msr_up(vmx, index, save_nmsrs++);
 
        vmx->save_nmsrs = save_nmsrs;
-       vmx->guest_msrs_dirty = true;
+       vmx->guest_msrs_ready = false;
 
        if (cpu_has_vmx_msr_bitmap())
                vmx_update_msr_bitmap(&vmx->vcpu);
@@ -1692,9 +1706,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_SYSENTER_ESP:
                msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
-       case MSR_IA32_POWER_CTL:
-               msr_info->data = vmx->msr_ia32_power_ctl;
-               break;
        case MSR_IA32_BNDCFGS:
                if (!kvm_mpx_supported() ||
                    (!msr_info->host_initiated &&
@@ -1718,7 +1729,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
                                       &msr_info->data);
        case MSR_IA32_XSS:
-               if (!vmx_xsaves_supported())
+               if (!vmx_xsaves_supported() ||
+                   (!msr_info->host_initiated &&
+                    !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+                      guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
                        return 1;
                msr_info->data = vcpu->arch.ia32_xss;
                break;
@@ -1817,17 +1831,28 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
 #endif
        case MSR_IA32_SYSENTER_CS:
+               if (is_guest_mode(vcpu))
+                       get_vmcs12(vcpu)->guest_sysenter_cs = data;
                vmcs_write32(GUEST_SYSENTER_CS, data);
                break;
        case MSR_IA32_SYSENTER_EIP:
+               if (is_guest_mode(vcpu))
+                       get_vmcs12(vcpu)->guest_sysenter_eip = data;
                vmcs_writel(GUEST_SYSENTER_EIP, data);
                break;
        case MSR_IA32_SYSENTER_ESP:
+               if (is_guest_mode(vcpu))
+                       get_vmcs12(vcpu)->guest_sysenter_esp = data;
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
-       case MSR_IA32_POWER_CTL:
-               vmx->msr_ia32_power_ctl = data;
+       case MSR_IA32_DEBUGCTLMSR:
+               if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+                                               VM_EXIT_SAVE_DEBUG_CONTROLS)
+                       get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+
+               ret = kvm_set_msr_common(vcpu, msr_info);
                break;
+
        case MSR_IA32_BNDCFGS:
                if (!kvm_mpx_supported() ||
                    (!msr_info->host_initiated &&
@@ -1896,9 +1921,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                                              MSR_TYPE_W);
                break;
        case MSR_IA32_CR_PAT:
+               if (!kvm_pat_valid(data))
+                       return 1;
+
+               if (is_guest_mode(vcpu) &&
+                   get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
+                       get_vmcs12(vcpu)->guest_ia32_pat = data;
+
                if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
-                       if (!kvm_pat_valid(data))
-                               return 1;
                        vmcs_write64(GUEST_IA32_PAT, data);
                        vcpu->arch.pat = data;
                        break;
@@ -1932,7 +1962,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                return vmx_set_vmx_msr(vcpu, msr_index, data);
        case MSR_IA32_XSS:
-               if (!vmx_xsaves_supported())
+               if (!vmx_xsaves_supported() ||
+                   (!msr_info->host_initiated &&
+                    !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+                      guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
                        return 1;
                /*
                 * The only supported bit as of Skylake is bit 8, but
@@ -2435,6 +2468,7 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
                return -ENOMEM;
 
        loaded_vmcs->shadow_vmcs = NULL;
+       loaded_vmcs->hv_timer_soft_disabled = false;
        loaded_vmcs_init(loaded_vmcs);
 
        if (cpu_has_vmx_msr_bitmap()) {
@@ -2455,6 +2489,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
        }
 
        memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
+       memset(&loaded_vmcs->controls_shadow, 0,
+               sizeof(struct vmcs_controls_shadow));
 
        return 0;
 
@@ -2737,7 +2773,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
                      (unsigned long *)&vcpu->arch.regs_dirty))
                return;
 
-       if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+       if (is_pae_paging(vcpu)) {
                vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
                vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
                vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
@@ -2749,7 +2785,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
 
-       if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+       if (is_pae_paging(vcpu)) {
                mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
                mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
                mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
@@ -2766,22 +2802,20 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
                                        unsigned long cr0,
                                        struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
        if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
                vmx_decache_cr3(vcpu);
        if (!(cr0 & X86_CR0_PG)) {
                /* From paging/starting to nonpaging */
-               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
-                            vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
-                            (CPU_BASED_CR3_LOAD_EXITING |
-                             CPU_BASED_CR3_STORE_EXITING));
+               exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
+                                         CPU_BASED_CR3_STORE_EXITING);
                vcpu->arch.cr0 = cr0;
                vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
        } else if (!is_paging(vcpu)) {
                /* From nonpaging to paging */
-               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
-                            vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
-                            ~(CPU_BASED_CR3_LOAD_EXITING |
-                              CPU_BASED_CR3_STORE_EXITING));
+               exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
+                                           CPU_BASED_CR3_STORE_EXITING);
                vcpu->arch.cr0 = cr0;
                vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
        }
@@ -2881,6 +2915,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        /*
         * Pass through host's Machine Check Enable value to hw_cr4, which
         * is in force while we are in guest mode.  Do not let guests control
@@ -2891,20 +2926,19 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
        if (enable_unrestricted_guest)
                hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
-       else if (to_vmx(vcpu)->rmode.vm86_active)
+       else if (vmx->rmode.vm86_active)
                hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
        else
                hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
 
        if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
                if (cr4 & X86_CR4_UMIP) {
-                       vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
-                               SECONDARY_EXEC_DESC);
+                       secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
                        hw_cr4 &= ~X86_CR4_UMIP;
                } else if (!is_guest_mode(vcpu) ||
-                       !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
-                       vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
-                                       SECONDARY_EXEC_DESC);
+                       !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
+                       secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
+               }
        }
 
        if (cr4 & X86_CR4_VMXE) {
@@ -2919,7 +2953,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                        return 1;
        }
 
-       if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
+       if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
                return 1;
 
        vcpu->arch.cr4 = cr4;
@@ -3537,7 +3571,7 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
        u8 mode = 0;
 
        if (cpu_has_secondary_exec_ctrls() &&
-           (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+           (secondary_exec_controls_get(to_vmx(vcpu)) &
             SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
                mode |= MSR_BITMAP_MODE_X2APIC;
                if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
@@ -3731,7 +3765,6 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 {
        u32 low32, high32;
        unsigned long tmpl;
-       struct desc_ptr dt;
        unsigned long cr0, cr3, cr4;
 
        cr0 = read_cr0();
@@ -3767,9 +3800,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
-       store_idt(&dt);
-       vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
-       vmx->host_idt_base = dt.address;
+       vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */
 
        vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
 
@@ -3798,7 +3829,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
        vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 }
 
-static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 {
        u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
 
@@ -3808,8 +3839,9 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
        if (!enable_vnmi)
                pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
 
-       /* Enable the preemption timer dynamically */
-       pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       if (!enable_preemption_timer)
+               pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
        return pin_based_exec_ctrl;
 }
 
@@ -3817,14 +3849,14 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+       pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
        if (cpu_has_secondary_exec_ctrls()) {
                if (kvm_vcpu_apicv_active(vcpu))
-                       vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+                       secondary_exec_controls_setbit(vmx,
                                      SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
                else
-                       vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+                       secondary_exec_controls_clearbit(vmx,
                                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
        }
@@ -4015,15 +4047,14 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
        /* Control */
-       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+       pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
        vmx->hv_deadline_tsc = -1;
 
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
+       exec_controls_set(vmx, vmx_exec_control(vmx));
 
        if (cpu_has_secondary_exec_ctrls()) {
                vmx_compute_secondary_exec_control(vmx);
-               vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-                            vmx->secondary_exec_control);
+               secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
        }
 
        if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
@@ -4081,10 +4112,10 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
                ++vmx->nmsrs;
        }
 
-       vm_exit_controls_init(vmx, vmx_vmexit_ctrl());
+       vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
 
        /* 22.2.1, 20.8.1 */
-       vm_entry_controls_init(vmx, vmx_vmentry_ctrl());
+       vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
 
        vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
        vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
@@ -4208,8 +4239,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
-       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
-                     CPU_BASED_VIRTUAL_INTR_PENDING);
+       exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
 }
 
 static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -4220,8 +4250,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
                return;
        }
 
-       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
-                     CPU_BASED_VIRTUAL_NMI_PENDING);
+       exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
 }
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4442,11 +4471,11 @@ static void kvm_machine_check(void)
 
 static int handle_machine_check(struct kvm_vcpu *vcpu)
 {
-       /* already handled by vcpu_run */
+       /* handled by vmx_vcpu_run() */
        return 1;
 }
 
-static int handle_exception(struct kvm_vcpu *vcpu)
+static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_run *kvm_run = vcpu->run;
@@ -4458,11 +4487,8 @@ static int handle_exception(struct kvm_vcpu *vcpu)
        vect_info = vmx->idt_vectoring_info;
        intr_info = vmx->exit_intr_info;
 
-       if (is_machine_check(intr_info))
-               return handle_machine_check(vcpu);
-
-       if (is_nmi(intr_info))
-               return 1;  /* already handled by vmx_vcpu_run() */
+       if (is_machine_check(intr_info) || is_nmi(intr_info))
+               return 1; /* handled by handle_exception_nmi_irqoff() */
 
        if (is_invalid_opcode(intr_info))
                return handle_ud(vcpu);
@@ -4518,7 +4544,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
                dr6 = vmcs_readl(EXIT_QUALIFICATION);
                if (!(vcpu->guest_debug &
                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
-                       vcpu->arch.dr6 &= ~15;
+                       vcpu->arch.dr6 &= ~DR_TRAP_BITS;
                        vcpu->arch.dr6 |= dr6 | DR6_RTM;
                        if (is_icebp(intr_info))
                                skip_emulated_instruction(vcpu);
@@ -4763,7 +4789,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                        vcpu->run->exit_reason = KVM_EXIT_DEBUG;
                        return 0;
                } else {
-                       vcpu->arch.dr6 &= ~15;
+                       vcpu->arch.dr6 &= ~DR_TRAP_BITS;
                        vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
                        kvm_queue_exception(vcpu, DB_VECTOR);
                        return 1;
@@ -4771,8 +4797,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
        }
 
        if (vcpu->guest_debug == 0) {
-               vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
-                               CPU_BASED_MOV_DR_EXITING);
+               exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
 
                /*
                 * No more DR vmexits; force a reload of the debug registers
@@ -4816,7 +4841,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
 
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
+       exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
 }
 
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -4876,8 +4901,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
 
 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 {
-       vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
-                       CPU_BASED_VIRTUAL_INTR_PENDING);
+       exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
 
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 
@@ -5131,8 +5155,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 static int handle_nmi_window(struct kvm_vcpu *vcpu)
 {
        WARN_ON_ONCE(!enable_vnmi);
-       vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
-                       CPU_BASED_VIRTUAL_NMI_PENDING);
+       exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
        ++vcpu->stat.nmi_window_exits;
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 
@@ -5144,7 +5167,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        enum emulation_result err = EMULATE_DONE;
        int ret = 1;
-       u32 cpu_exec_ctrl;
        bool intr_window_requested;
        unsigned count = 130;
 
@@ -5155,8 +5177,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
         */
        WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
 
-       cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
+       intr_window_requested = exec_controls_get(vmx) &
+                               CPU_BASED_VIRTUAL_INTR_PENDING;
 
        while (vmx->emulation_required && count-- != 0) {
                if (intr_window_requested && vmx_interrupt_allowed(vcpu))
@@ -5342,7 +5364,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
         * is read even if it isn't needed (e.g., for type==all)
         */
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-                               vmx_instruction_info, false, &gva))
+                               vmx_instruction_info, false,
+                               sizeof(operand), &gva))
                return 1;
 
        if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
@@ -5437,8 +5460,12 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
 
 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 {
-       if (!to_vmx(vcpu)->req_immediate_exit)
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!vmx->req_immediate_exit &&
+           !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
                kvm_lapic_expired_hv_timer(vcpu);
+
        return 1;
 }
 
@@ -5469,7 +5496,7 @@ static int handle_encls(struct kvm_vcpu *vcpu)
  * to be done to userspace and return 0.
  */
 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
-       [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
+       [EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
        [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
        [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
        [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
@@ -5952,6 +5979,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 
 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 sec_exec_control;
 
        if (!lapic_in_kernel(vcpu))
@@ -5963,11 +5991,11 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 
        /* Postpone execution until vmcs01 is the current VMCS. */
        if (is_guest_mode(vcpu)) {
-               to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
+               vmx->nested.change_vmcs01_virtual_apic_mode = true;
                return;
        }
 
-       sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+       sec_exec_control = secondary_exec_controls_get(vmx);
        sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 
@@ -5989,7 +6017,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
                                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
                break;
        }
-       vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+       secondary_exec_controls_set(vmx, sec_exec_control);
 
        vmx_update_msr_bitmap(vcpu);
 }
@@ -6107,76 +6135,81 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
        memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
 }
 
-static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
+static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 {
-       u32 exit_intr_info = 0;
-       u16 basic_exit_reason = (u16)vmx->exit_reason;
-
-       if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
-             || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
-               return;
-
-       if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
-               exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-       vmx->exit_intr_info = exit_intr_info;
+       vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
        /* if exit due to PF check for async PF */
-       if (is_page_fault(exit_intr_info))
+       if (is_page_fault(vmx->exit_intr_info))
                vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
 
        /* Handle machine checks before interrupts are enabled */
-       if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
-           is_machine_check(exit_intr_info))
+       if (is_machine_check(vmx->exit_intr_info))
                kvm_machine_check();
 
        /* We need to handle NMIs before interrupts are enabled */
-       if (is_nmi(exit_intr_info)) {
+       if (is_nmi(vmx->exit_intr_info)) {
                kvm_before_interrupt(&vmx->vcpu);
                asm("int $2");
                kvm_after_interrupt(&vmx->vcpu);
        }
 }
 
-static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 {
-       u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-       if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
-                       == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
-               unsigned int vector;
-               unsigned long entry;
-               gate_desc *desc;
-               struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned int vector;
+       unsigned long entry;
 #ifdef CONFIG_X86_64
-               unsigned long tmp;
+       unsigned long tmp;
 #endif
+       gate_desc *desc;
+       u32 intr_info;
+
+       intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+       if (WARN_ONCE(!is_external_intr(intr_info),
+           "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+               return;
 
-               vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
-               desc = (gate_desc *)vmx->host_idt_base + vector;
-               entry = gate_offset(desc);
-               asm volatile(
+       vector = intr_info & INTR_INFO_VECTOR_MASK;
+       desc = (gate_desc *)host_idt_base + vector;
+       entry = gate_offset(desc);
+
+       kvm_before_interrupt(vcpu);
+
+       asm volatile(
 #ifdef CONFIG_X86_64
-                       "mov %%" _ASM_SP ", %[sp]\n\t"
-                       "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
-                       "push $%c[ss]\n\t"
-                       "push %[sp]\n\t"
+               "mov %%" _ASM_SP ", %[sp]\n\t"
+               "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
+               "push $%c[ss]\n\t"
+               "push %[sp]\n\t"
 #endif
-                       "pushf\n\t"
-                       __ASM_SIZE(push) " $%c[cs]\n\t"
-                       CALL_NOSPEC
-                       :
+               "pushf\n\t"
+               __ASM_SIZE(push) " $%c[cs]\n\t"
+               CALL_NOSPEC
+               :
 #ifdef CONFIG_X86_64
-                       [sp]"=&r"(tmp),
+               [sp]"=&r"(tmp),
 #endif
-                       ASM_CALL_CONSTRAINT
-                       :
-                       THUNK_TARGET(entry),
-                       [ss]"i"(__KERNEL_DS),
-                       [cs]"i"(__KERNEL_CS)
-                       );
-       }
+               ASM_CALL_CONSTRAINT
+               :
+               THUNK_TARGET(entry),
+               [ss]"i"(__KERNEL_DS),
+               [cs]"i"(__KERNEL_CS)
+       );
+
+       kvm_after_interrupt(vcpu);
+}
+STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
+
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+               handle_external_interrupt_irqoff(vcpu);
+       else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
+               handle_exception_nmi_irqoff(vmx);
 }
-STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
 
 static bool vmx_has_emulated_msr(int index)
 {
@@ -6187,6 +6220,8 @@ static bool vmx_has_emulated_msr(int index)
                 * real mode.
                 */
                return enable_unrestricted_guest || emulate_invalid_guest_state;
+       case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+               return nested;
        case MSR_AMD64_VIRT_SPEC_CTRL:
                /* This is AMD only.  */
                return false;
@@ -6332,15 +6367,6 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                                        msrs[i].host, false);
 }
 
-static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
-{
-       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
-       if (!vmx->loaded_vmcs->hv_timer_armed)
-               vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
-                             PIN_BASED_VMX_PREEMPTION_TIMER);
-       vmx->loaded_vmcs->hv_timer_armed = true;
-}
-
 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6348,11 +6374,9 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
        u32 delta_tsc;
 
        if (vmx->req_immediate_exit) {
-               vmx_arm_hv_timer(vmx, 0);
-               return;
-       }
-
-       if (vmx->hv_deadline_tsc != -1) {
+               vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
+               vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+       } else if (vmx->hv_deadline_tsc != -1) {
                tscl = rdtsc();
                if (vmx->hv_deadline_tsc > tscl)
                        /* set_hv_timer ensures the delta fits in 32-bits */
@@ -6361,14 +6385,12 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
                else
                        delta_tsc = 0;
 
-               vmx_arm_hv_timer(vmx, delta_tsc);
-               return;
+               vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+               vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+       } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
+               vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
+               vmx->loaded_vmcs->hv_timer_soft_disabled = true;
        }
-
-       if (vmx->loaded_vmcs->hv_timer_armed)
-               vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
-                               PIN_BASED_VMX_PREEMPTION_TIMER);
-       vmx->loaded_vmcs->hv_timer_armed = false;
 }
 
 void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
@@ -6401,8 +6423,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
                vmcs_write32(PLE_WINDOW, vmx->ple_window);
        }
 
-       if (vmx->nested.need_vmcs12_sync)
-               nested_sync_from_vmcs12(vcpu);
+       if (vmx->nested.need_vmcs12_to_shadow_sync)
+               nested_sync_vmcs12_to_shadow(vcpu);
 
        if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -6440,7 +6462,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        atomic_switch_perf_msrs(vmx);
 
-       vmx_update_hv_timer(vcpu);
+       if (enable_preemption_timer)
+               vmx_update_hv_timer(vcpu);
+
+       if (lapic_in_kernel(vcpu) &&
+               vcpu->arch.apic->lapic_timer.timer_advance_ns)
+               kvm_wait_lapic_expire(vcpu);
 
        /*
         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -6533,13 +6560,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
        vmx->idt_vectoring_info = 0;
 
        vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
+       if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+               kvm_machine_check();
+
        if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
                return;
 
        vmx->loaded_vmcs->launched = 1;
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
-       vmx_complete_atomic_exit(vmx);
        vmx_recover_nmi_blocking(vmx);
        vmx_complete_interrupts(vmx);
 }
@@ -6630,6 +6659,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+       if (kvm_cstate_in_guest(kvm)) {
+               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+       }
        vmx->msr_bitmap_mode = 0;
 
        vmx->loaded_vmcs = &vmx->vmcs01;
@@ -6726,22 +6761,22 @@ static int vmx_vm_init(struct kvm *kvm)
        return 0;
 }
 
-static void __init vmx_check_processor_compat(void *rtn)
+static int __init vmx_check_processor_compat(void)
 {
        struct vmcs_config vmcs_conf;
        struct vmx_capability vmx_cap;
 
-       *(int *)rtn = 0;
        if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
-               *(int *)rtn = -EIO;
+               return -EIO;
        if (nested)
                nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
                                           enable_apicv);
        if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
                printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
                                smp_processor_id());
-               *(int *)rtn = -EIO;
+               return -EIO;
        }
+       return 0;
 }
 
 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
@@ -6795,7 +6830,7 @@ static int vmx_get_lpage_level(void)
                return PT_PDPE_LEVEL;
 }
 
-static void vmcs_set_secondary_exec_control(u32 new_ctl)
+static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
 {
        /*
         * These bits in the secondary execution controls field
@@ -6809,10 +6844,10 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl)
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                SECONDARY_EXEC_DESC;
 
-       u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+       u32 new_ctl = vmx->secondary_exec_control;
+       u32 cur_ctl = secondary_exec_controls_get(vmx);
 
-       vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-                    (new_ctl & ~mask) | (cur_ctl & mask));
+       secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
 }
 
 /*
@@ -6950,7 +6985,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 
        if (cpu_has_secondary_exec_ctrls()) {
                vmx_compute_secondary_exec_control(vmx);
-               vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
+               vmcs_set_secondary_exec_control(vmx);
        }
 
        if (nested_vmx_allowed(vcpu))
@@ -7424,10 +7459,14 @@ static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
 static __init int hardware_setup(void)
 {
        unsigned long host_bndcfgs;
+       struct desc_ptr dt;
        int r, i;
 
        rdmsrl_safe(MSR_EFER, &host_efer);
 
+       store_idt(&dt);
+       host_idt_base = dt.address;
+
        for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
                kvm_define_shared_msr(i, vmx_msr_index[i]);
 
@@ -7531,17 +7570,33 @@ static __init int hardware_setup(void)
        }
 
        if (!cpu_has_vmx_preemption_timer())
-               kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+               enable_preemption_timer = false;
 
-       if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+       if (enable_preemption_timer) {
+               u64 use_timer_freq = 5000ULL * 1000 * 1000;
                u64 vmx_msr;
 
                rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
                cpu_preemption_timer_multi =
                        vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
-       } else {
+
+               if (tsc_khz)
+                       use_timer_freq = (u64)tsc_khz * 1000;
+               use_timer_freq >>= cpu_preemption_timer_multi;
+
+               /*
+                * KVM "disables" the preemption timer by setting it to its max
+                * value.  Don't use the timer if it might cause spurious exits
+                * at a rate faster than 0.1 Hz (of uninterrupted guest time).
+                */
+               if (use_timer_freq > 0xffffffffu / 10)
+                       enable_preemption_timer = false;
+       }
+
+       if (!enable_preemption_timer) {
                kvm_x86_ops->set_hv_timer = NULL;
                kvm_x86_ops->cancel_hv_timer = NULL;
+               kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
        }
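
For context, a back-of-the-envelope check of the 0.1 Hz cutoff above; the host numbers (a 2.4 GHz TSC and a VMX_MISC preemption-timer rate of TSC >> 5) are illustrative assumptions, not values taken from this series:

static bool example_keep_preemption_timer(void)
{
	u64 tsc_khz_assumed = 2400000;				/* 2.4 GHz TSC */
	u64 use_timer_freq = (tsc_khz_assumed * 1000) >> 5;	/* 75 MHz */

	/*
	 * A timer "disabled" by programming 0xffffffff still expires every
	 * 0xffffffff / 75e6 ~= 57 s, i.e. ~0.017 Hz of spurious exits,
	 * comfortably under the cutoff frequency of 0xffffffff / 10
	 * (~429 MHz), so the preemption timer stays enabled on such a host.
	 */
	return use_timer_freq <= 0xffffffffu / 10;		/* true here */
}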
 
        kvm_set_posted_intr_wakeup_handler(wakeup_handler);
@@ -7683,7 +7738,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .set_tdp_cr3 = vmx_set_cr3,
 
        .check_intercept = vmx_check_intercept,
-       .handle_external_intr = vmx_handle_external_intr,
+       .handle_exit_irqoff = vmx_handle_exit_irqoff,
        .mpx_supported = vmx_mpx_supported,
        .xsaves_supported = vmx_xsaves_supported,
        .umip_emulated = vmx_umip_emulated,
index 61128b48c503f4ce67093d7d672e4fda141d5dc3..82d0bc3a4d52246f731be6c0e23163ebd05f9f34 100644 (file)
@@ -109,13 +109,20 @@ struct nested_vmx {
         * to guest memory during VM exit.
         */
        struct vmcs12 *cached_shadow_vmcs12;
+
        /*
         * Indicates if the shadow vmcs or enlightened vmcs must be updated
         * with the data held by struct vmcs12.
         */
-       bool need_vmcs12_sync;
+       bool need_vmcs12_to_shadow_sync;
        bool dirty_vmcs12;
 
+       /*
+        * Indicates lazily loaded guest state has not yet been decached from
+        * vmcs02.
+        */
+       bool need_sync_vmcs02_to_vmcs12_rare;
+
        /*
         * vmcs02 has been initialized, i.e. state that is constant for
         * vmcs02 has been written to the backing VMCS.  Initialization
@@ -180,14 +187,24 @@ struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        u8                    fail;
        u8                    msr_bitmap_mode;
+
+       /*
+        * If true, host state has been stored in vmx->loaded_vmcs for
+        * the CPU registers that only need to be switched when transitioning
+        * to/from the kernel, and the registers have been loaded with guest
+        * values.  If false, host state is loaded in the CPU registers
+        * and vmx->loaded_vmcs->host_state is invalid.
+        */
+       bool                  guest_state_loaded;
+
        u32                   exit_intr_info;
        u32                   idt_vectoring_info;
        ulong                 rflags;
+
        struct shared_msr_entry *guest_msrs;
        int                   nmsrs;
        int                   save_nmsrs;
-       bool                  guest_msrs_dirty;
-       unsigned long         host_idt_base;
+       bool                  guest_msrs_ready;
 #ifdef CONFIG_X86_64
        u64                   msr_host_kernel_gs_base;
        u64                   msr_guest_kernel_gs_base;
@@ -195,21 +212,15 @@ struct vcpu_vmx {
 
        u64                   spec_ctrl;
 
-       u32 vm_entry_controls_shadow;
-       u32 vm_exit_controls_shadow;
        u32 secondary_exec_control;
 
        /*
         * loaded_vmcs points to the VMCS currently used in this vcpu. For a
         * non-nested (L1) guest, it always points to vmcs01. For a nested
-        * guest (L2), it points to a different VMCS.  loaded_cpu_state points
-        * to the VMCS whose state is loaded into the CPU registers that only
-        * need to be switched when transitioning to/from the kernel; a NULL
-        * value indicates that host state is loaded.
+        * guest (L2), it points to a different VMCS.
         */
        struct loaded_vmcs    vmcs01;
        struct loaded_vmcs   *loaded_vmcs;
-       struct loaded_vmcs   *loaded_cpu_state;
 
        struct msr_autoload {
                struct vmx_msrs guest;
@@ -260,8 +271,6 @@ struct vcpu_vmx {
 
        unsigned long host_debugctlmsr;
 
-       u64 msr_ia32_power_ctl;
-
        /*
         * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
         * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
@@ -292,12 +301,14 @@ struct kvm_vmx {
 };
 
 bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu);
 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void vmx_vcpu_put(struct kvm_vcpu *vcpu);
 int allocate_vpid(void);
 void free_vpid(int vpid);
 void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+                       unsigned long fs_base, unsigned long gs_base);
 int vmx_get_cpl(struct kvm_vcpu *vcpu);
 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -376,69 +387,31 @@ static inline u8 vmx_get_rvi(void)
        return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
 }
 
-static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
-{
-       vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
-}
-
-static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
-{
-       vmcs_write32(VM_ENTRY_CONTROLS, val);
-       vmx->vm_entry_controls_shadow = val;
-}
-
-static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
-{
-       if (vmx->vm_entry_controls_shadow != val)
-               vm_entry_controls_init(vmx, val);
-}
-
-static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
-{
-       return vmx->vm_entry_controls_shadow;
-}
-
-static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
-{
-       vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
-}
-
-static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
-{
-       vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
-}
-
-static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
-{
-       vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
-}
-
-static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
-{
-       vmcs_write32(VM_EXIT_CONTROLS, val);
-       vmx->vm_exit_controls_shadow = val;
-}
-
-static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
-{
-       if (vmx->vm_exit_controls_shadow != val)
-               vm_exit_controls_init(vmx, val);
-}
-
-static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
-{
-       return vmx->vm_exit_controls_shadow;
-}
-
-static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
-{
-       vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
-}
-
-static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
-{
-       vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
+#define BUILD_CONTROLS_SHADOW(lname, uname)                                \
+static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val)     \
+{                                                                          \
+       if (vmx->loaded_vmcs->controls_shadow.lname != val) {               \
+               vmcs_write32(uname, val);                                   \
+               vmx->loaded_vmcs->controls_shadow.lname = val;              \
+       }                                                                   \
+}                                                                          \
+static inline u32 lname##_controls_get(struct vcpu_vmx *vmx)               \
+{                                                                          \
+       return vmx->loaded_vmcs->controls_shadow.lname;                     \
+}                                                                          \
+static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val)   \
+{                                                                          \
+       lname##_controls_set(vmx, lname##_controls_get(vmx) | val);         \
+}                                                                          \
+static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \
+{                                                                          \
+       lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val);        \
 }
+BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS)
+BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS)
+BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL)
+BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
+BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
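
To make the new macro concrete, this is roughly what BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) expands to (hand-expanded sketch, not text from the patch):

static inline void pin_controls_set(struct vcpu_vmx *vmx, u32 val)
{
	/* Only touch the VMCS when the cached value actually changes. */
	if (vmx->loaded_vmcs->controls_shadow.pin != val) {
		vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, val);
		vmx->loaded_vmcs->controls_shadow.pin = val;
	}
}

static inline u32 pin_controls_get(struct vcpu_vmx *vmx)
{
	return vmx->loaded_vmcs->controls_shadow.pin;
}

static inline void pin_controls_setbit(struct vcpu_vmx *vmx, u32 val)
{
	pin_controls_set(vmx, pin_controls_get(vmx) | val);
}

Because the cache now lives in the loaded_vmcs (zeroed in alloc_loaded_vmcs() above) rather than in per-vCPU shadow fields, callers such as pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)) skip redundant VMREADs/VMWRITEs, and the shadow stays coherent across vmcs01/vmcs02 switches.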
 
 static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
 {
@@ -468,6 +441,7 @@ static inline u32 vmx_vmexit_ctrl(void)
 }
 
 u32 vmx_exec_control(struct vcpu_vmx *vmx);
+u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx);
 
 static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
 {
index 9857992d4e5885565c696f3ca5c4006ccd541a49..2c323118f0b38596bbc751497c14788de0955030 100644 (file)
@@ -716,7 +716,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
        gfn_t gfn;
        int r;
 
-       if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
+       if (!is_pae_paging(vcpu))
                return false;
 
        if (!test_bit(VCPU_EXREG_PDPTR,
@@ -959,8 +959,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (is_long_mode(vcpu) &&
            (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
                return 1;
-       else if (is_pae(vcpu) && is_paging(vcpu) &&
-                  !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+       else if (is_pae_paging(vcpu) &&
+                !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
                return 1;
 
        kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
@@ -1173,7 +1173,28 @@ static u32 emulated_msrs[] = {
        MSR_AMD64_VIRT_SPEC_CTRL,
        MSR_IA32_POWER_CTL,
 
+       /*
+        * The following list leaves out MSRs whose values are determined
+        * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
+        * We always support the "true" VMX control MSRs, even if the host
+        * processor does not, so I am putting these registers here rather
+        * than in msrs_to_save.
+        */
+       MSR_IA32_VMX_BASIC,
+       MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+       MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+       MSR_IA32_VMX_TRUE_EXIT_CTLS,
+       MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+       MSR_IA32_VMX_MISC,
+       MSR_IA32_VMX_CR0_FIXED0,
+       MSR_IA32_VMX_CR4_FIXED0,
+       MSR_IA32_VMX_VMCS_ENUM,
+       MSR_IA32_VMX_PROCBASED_CTLS2,
+       MSR_IA32_VMX_EPT_VPID_CAP,
+       MSR_IA32_VMX_VMFUNC,
+
        MSR_K7_HWCR,
+       MSR_KVM_POLL_CONTROL,
 };
 
 static unsigned num_emulated_msrs;
@@ -1209,11 +1230,12 @@ static u32 msr_based_features[] = {
 
 static unsigned int num_msr_based_features;
 
-u64 kvm_get_arch_capabilities(void)
+static u64 kvm_get_arch_capabilities(void)
 {
-       u64 data;
+       u64 data = 0;
 
-       rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
 
        /*
         * If we're doing cache flushes (either "always" or "cond")
@@ -1229,7 +1251,6 @@ u64 kvm_get_arch_capabilities(void)
 
        return data;
 }
-EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
 
 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
 {
@@ -2544,13 +2565,24 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                }
                break;
        case MSR_IA32_MISC_ENABLE:
-               vcpu->arch.ia32_misc_enable_msr = data;
+               if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
+                   ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+                       if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
+                               return 1;
+                       vcpu->arch.ia32_misc_enable_msr = data;
+                       kvm_update_cpuid(vcpu);
+               } else {
+                       vcpu->arch.ia32_misc_enable_msr = data;
+               }
                break;
        case MSR_IA32_SMBASE:
                if (!msr_info->host_initiated)
                        return 1;
                vcpu->arch.smbase = data;
                break;
+       case MSR_IA32_POWER_CTL:
+               vcpu->arch.msr_ia32_power_ctl = data;
+               break;
        case MSR_IA32_TSC:
                kvm_write_tsc(vcpu, msr_info);
                break;
@@ -2625,6 +2657,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                break;
 
+       case MSR_KVM_POLL_CONTROL:
+               /* only enable bit supported */
+               if (data & (-1ULL << 1))
+                       return 1;
+
+               vcpu->arch.msr_kvm_poll_control = data;
+               break;
+
        case MSR_IA32_MCG_CTL:
        case MSR_IA32_MCG_STATUS:
        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
@@ -2802,6 +2842,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                msr_info->data = vcpu->arch.arch_capabilities;
                break;
+       case MSR_IA32_POWER_CTL:
+               msr_info->data = vcpu->arch.msr_ia32_power_ctl;
+               break;
        case MSR_IA32_TSC:
                msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
                break;
@@ -2874,6 +2917,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_KVM_PV_EOI_EN:
                msr_info->data = vcpu->arch.pv_eoi.msr_val;
                break;
+       case MSR_KVM_POLL_CONTROL:
+               msr_info->data = vcpu->arch.msr_kvm_poll_control;
+               break;
        case MSR_IA32_P5_MC_ADDR:
        case MSR_IA32_P5_MC_TYPE:
        case MSR_IA32_MCG_CAP:
@@ -3083,6 +3129,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SET_BOOT_CPU_ID:
        case KVM_CAP_SPLIT_IRQCHIP:
        case KVM_CAP_IMMEDIATE_EXIT:
+       case KVM_CAP_PMU_EVENT_FILTER:
        case KVM_CAP_GET_MSR_FEATURES:
        case KVM_CAP_MSR_PLATFORM_INFO:
        case KVM_CAP_EXCEPTION_PAYLOAD:
@@ -3095,7 +3142,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = KVM_CLOCK_TSC_STABLE;
                break;
        case KVM_CAP_X86_DISABLE_EXITS:
-               r |=  KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
+               r |=  KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
+                     KVM_X86_DISABLE_EXITS_CSTATE;
                if(kvm_can_mwait_in_guest())
                        r |= KVM_X86_DISABLE_EXITS_MWAIT;
                break;
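
A minimal userspace sketch of turning the new bit on; vm_fd and the helper name are placeholders for illustration, not code from this series:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Assumes vm_fd is an open KVM VM fd and KVM_CHECK_EXTENSION has already
 * reported KVM_X86_DISABLE_EXITS_CSTATE in the KVM_CAP_X86_DISABLE_EXITS
 * mask returned above. */
static int disable_cstate_exits(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_DISABLE_EXITS,
		.args[0] = KVM_X86_DISABLE_EXITS_CSTATE,
	};

	/* With cstate_in_guest set, vmx_create_vcpu() passes through reads
	 * of MSR_CORE_C1_RES and the C3/C6/C7 residency MSRs, so the guest
	 * can sample them without a VM exit. */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}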
@@ -4612,6 +4660,8 @@ split_irqchip_unlock:
                        kvm->arch.hlt_in_guest = true;
                if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
                        kvm->arch.pause_in_guest = true;
+               if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
+                       kvm->arch.cstate_in_guest = true;
                r = 0;
                break;
        case KVM_CAP_MSR_PLATFORM_INFO:
@@ -4926,6 +4976,9 @@ set_identity_unlock:
                r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
                break;
        }
+       case KVM_SET_PMU_EVENT_FILTER:
+               r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
+               break;
        default:
                r = -ENOTTY;
        }
@@ -6378,7 +6431,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
                                           vcpu->arch.db);
 
                if (dr6 != 0) {
-                       vcpu->arch.dr6 &= ~15;
+                       vcpu->arch.dr6 &= ~DR_TRAP_BITS;
                        vcpu->arch.dr6 |= dr6 | DR6_RTM;
                        kvm_queue_exception(vcpu, DB_VECTOR);
                        *r = EMULATE_DONE;
@@ -6705,7 +6758,7 @@ static void kvm_hyperv_tsc_notifier(void)
        struct kvm_vcpu *vcpu;
        int cpu;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                kvm_make_mclock_inprogress_request(kvm);
 
@@ -6731,7 +6784,7 @@ static void kvm_hyperv_tsc_notifier(void)
 
                spin_unlock(&ka->pvclock_gtod_sync_lock);
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 }
 #endif
 
@@ -6782,17 +6835,17 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
 
        smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_for_each_vcpu(i, vcpu, kvm) {
                        if (vcpu->cpu != cpu)
                                continue;
                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-                       if (vcpu->cpu != smp_processor_id())
+                       if (vcpu->cpu != raw_smp_processor_id())
                                send_ipi = 1;
                }
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        if (freq->old < freq->new && send_ipi) {
                /*
@@ -6907,35 +6960,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
        .handle_intel_pt_intr   = kvm_handle_intel_pt_intr,
 };
 
-static void kvm_set_mmio_spte_mask(void)
-{
-       u64 mask;
-       int maxphyaddr = boot_cpu_data.x86_phys_bits;
-
-       /*
-        * Set the reserved bits and the present bit of an paging-structure
-        * entry to generate page fault with PFER.RSV = 1.
-        */
-
-       /*
-        * Mask the uppermost physical address bit, which would be reserved as
-        * long as the supported physical address width is less than 52.
-        */
-       mask = 1ull << 51;
-
-       /* Set the present bit. */
-       mask |= 1ull;
-
-       /*
-        * If reserved bit is not supported, clear the present bit to disable
-        * mmio page fault.
-        */
-       if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52)
-               mask &= ~1ull;
-
-       kvm_mmu_set_mmio_spte_mask(mask, mask);
-}
-
 #ifdef CONFIG_X86_64
 static void pvclock_gtod_update_fn(struct work_struct *work)
 {
@@ -6944,12 +6968,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
        struct kvm_vcpu *vcpu;
        int i;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                kvm_for_each_vcpu(i, vcpu, kvm)
                        kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
        atomic_set(&kvm_guest_has_master_clock, 0);
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -7032,8 +7056,6 @@ int kvm_arch_init(void *opaque)
        if (r)
                goto out_free_percpu;
 
-       kvm_set_mmio_spte_mask();
-
        kvm_x86_ops = ops;
 
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
@@ -7172,6 +7194,23 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
        kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
 }
 
+static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+{
+       struct kvm_vcpu *target = NULL;
+       struct kvm_apic_map *map;
+
+       rcu_read_lock();
+       map = rcu_dereference(kvm->arch.apic_map);
+
+       if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
+               target = map->phys_map[dest_id]->vcpu;
+
+       rcu_read_unlock();
+
+       if (target)
+               kvm_vcpu_yield_to(target);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
        unsigned long nr, a0, a1, a2, a3, ret;
@@ -7218,6 +7257,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        case KVM_HC_SEND_IPI:
                ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
                break;
+       case KVM_HC_SCHED_YIELD:
+               kvm_sched_yield(vcpu->kvm, a0);
+               ret = 0;
+               break;
        default:
                ret = -KVM_ENOSYS;
                break;
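
For reference, a guest reaches kvm_sched_yield() above through the usual KVM hypercall ABI (nr in RAX, first argument in RBX, VMCALL on Intel / VMMCALL on AMD). The helper below is an illustrative sketch mirroring kvm_hypercall1(), not code from this series:

/* Yield this vCPU's timeslice to the vCPU identified by dest_apic_id,
 * e.g. the holder of a contended spinlock. */
static inline long kvm_hypercall_sched_yield(unsigned long dest_apic_id)
{
	long ret;

	asm volatile("vmcall"
		     : "=a" (ret)
		     : "a" (KVM_HC_SCHED_YIELD), "b" (dest_apic_id)
		     : "memory");
	return ret;
}

On the host side, kvm_sched_yield() resolves the APIC ID through the apic_map and calls kvm_vcpu_yield_to() on the target, so an overcommitted, spinning vCPU can donate its timeslice to the lock holder.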
@@ -7950,9 +7993,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        }
 
        trace_kvm_entry(vcpu->vcpu_id);
-       if (lapic_in_kernel(vcpu) &&
-           vcpu->arch.apic->lapic_timer.timer_advance_ns)
-               wait_lapic_expire(vcpu);
        guest_enter_irqoff();
 
        fpregs_assert_state_consistent();
@@ -8001,13 +8041,29 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
 
-       kvm_before_interrupt(vcpu);
-       kvm_x86_ops->handle_external_intr(vcpu);
-       kvm_after_interrupt(vcpu);
+       kvm_x86_ops->handle_exit_irqoff(vcpu);
 
+       /*
+        * Consume any pending interrupts, including the possible source of
+        * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
+        * An instruction is required after local_irq_enable() to fully unblock
+        * interrupts on processors that implement an interrupt shadow, the
+        * stat.exits increment will do nicely.
+        */
+       kvm_before_interrupt(vcpu);
+       local_irq_enable();
        ++vcpu->stat.exits;
+       local_irq_disable();
+       kvm_after_interrupt(vcpu);
 
        guest_exit_irqoff();
+       if (lapic_in_kernel(vcpu)) {
+               s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
+               if (delta != S64_MIN) {
+                       trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
+                       vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
+               }
+       }
 
        local_irq_enable();
        preempt_enable();
@@ -8593,7 +8649,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
                kvm_update_cpuid(vcpu);
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
-       if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
+       if (is_pae_paging(vcpu)) {
                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
                mmu_reset_needed = 1;
        }
@@ -8874,6 +8930,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        msr.host_initiated = true;
        kvm_write_tsc(vcpu, &msr);
        vcpu_put(vcpu);
+
+       /* poll control enabled by default */
+       vcpu->arch.msr_kvm_poll_control = 1;
+
        mutex_unlock(&vcpu->mutex);
 
        if (!kvmclock_periodic_sync)
@@ -9106,9 +9166,9 @@ void kvm_arch_hardware_unsetup(void)
        kvm_x86_ops->hardware_unsetup();
 }
 
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
 {
-       kvm_x86_ops->check_processor_compatibility(rtn);
+       return kvm_x86_ops->check_processor_compatibility();
 }
 
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
@@ -9380,6 +9440,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvm_ioapic_destroy(kvm);
        kvm_free_vcpus(kvm);
        kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
        kvm_mmu_uninit_vm(kvm);
        kvm_page_track_cleanup(kvm);
        kvm_hv_destroy_vm(kvm);
@@ -9788,6 +9849,36 @@ static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
                                      sizeof(u32));
 }
 
+static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
+{
+       if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+               return false;
+
+       if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+           (vcpu->arch.apf.send_user_only &&
+            kvm_x86_ops->get_cpl(vcpu) == 0))
+               return false;
+
+       return true;
+}
+
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(!lapic_in_kernel(vcpu) ||
+                    kvm_event_needs_reinjection(vcpu) ||
+                    vcpu->arch.exception.pending))
+               return false;
+
+       if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
+               return false;
+
+       /*
+        * If interrupts are off we cannot even use an artificial
+        * halt state.
+        */
+       return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
                                     struct kvm_async_pf *work)
 {
@@ -9796,11 +9887,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
        trace_kvm_async_pf_not_present(work->arch.token, work->gva);
        kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
 
-       if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
-           (vcpu->arch.apf.send_user_only &&
-            kvm_x86_ops->get_cpl(vcpu) == 0))
-               kvm_make_request(KVM_REQ_APF_HALT, vcpu);
-       else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+       if (kvm_can_deliver_async_pf(vcpu) &&
+           !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
                fault.vector = PF_VECTOR;
                fault.error_code_valid = true;
                fault.error_code = 0;
@@ -9808,6 +9896,16 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
                fault.address = work->arch.token;
                fault.async_page_fault = true;
                kvm_inject_page_fault(vcpu, &fault);
+       } else {
+               /*
+                * It is not possible to deliver a paravirtualized asynchronous
+                * page fault, but putting the guest in an artificial halt state
+                * can still be beneficial: if an interrupt arrives, we can
+                * deliver it promptly and perhaps the guest will schedule
+                * another process.  When the instruction that triggered the page
+                * fault is retried, the page will hopefully be ready in the host.
+                */
+               kvm_make_request(KVM_REQ_APF_HALT, vcpu);
        }
 }
 
@@ -9948,6 +10046,13 @@ bool kvm_vector_hashing_enabled(void)
 }
 EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
 
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
+{
+       return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
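The new kvm_arch_no_poll() hook above reads bit 0 of the guest's poll-control MSR to decide whether host-side halt polling is allowed. A minimal guest-side sketch (not part of this diff) of opting out of host polling; MSR_KVM_POLL_CONTROL and KVM_FEATURE_POLL_CONTROL are assumed from the PV definitions added elsewhere in this series:

/*
 * Sketch only: a guest kernel clearing bit 0 of the poll-control MSR so
 * the host stops polling on HLT on its behalf.  The MSR index and the
 * CPUID feature bit are assumptions taken from the rest of this series.
 */
#include <asm/kvm_para.h>
#include <asm/msr.h>

static void kvm_guest_disable_host_haltpoll(void)
{
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
		wrmsrl(MSR_KVM_POLL_CONTROL, 0);
}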
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a470ff0868c58e1d6f8ceb88dac3a6e5bd0f0a1b..e08a12892e8ba8275addbc9f9d8c98f4fa8051df 100644 (file)
@@ -139,6 +139,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
        return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
 }
 
+static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
+{
+       return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
+}
+
 static inline u32 bit(int bitno)
 {
        return 1 << (bitno & 31);
@@ -333,6 +338,11 @@ static inline bool kvm_pause_in_guest(struct kvm *kvm)
        return kvm->arch.pause_in_guest;
 }
 
+static inline bool kvm_cstate_in_guest(struct kvm *kvm)
+{
+       return kvm->arch.cstate_in_guest;
+}
+
 DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
 
 static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d1ad38a3f048b15f3501f24bb838f75081efcdd4..c5da875f19e372b9a577fc7d6741f1dbd00a5a3c 100644 (file)
@@ -159,7 +159,7 @@ static inline bool is_error_page(struct page *page)
 
 extern struct kmem_cache *kvm_vcpu_cache;
 
-extern spinlock_t kvm_lock;
+extern struct mutex kvm_lock;
 extern struct list_head vm_list;
 
 struct kvm_io_range {
@@ -867,7 +867,7 @@ int kvm_arch_hardware_enable(void);
 void kvm_arch_hardware_disable(void);
 int kvm_arch_hardware_setup(void);
 void kvm_arch_hardware_unsetup(void);
-void kvm_arch_check_processor_compat(void *rtn);
+int kvm_arch_check_processor_compat(void);
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
@@ -990,6 +990,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
 
 /*
  * search_memslots() and __gfn_to_memslot() are here because they are
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2fe12b40d5035a7b459720476e7ab7002b04c9d4..a7c19540ce21e6410ed3055cb5a27419a8f5d9db 100644 (file)
@@ -696,9 +696,11 @@ struct kvm_ioeventfd {
 #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
 #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
 #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
+#define KVM_X86_DISABLE_EXITS_CSTATE         (1 << 3)
 #define KVM_X86_DISABLE_VALID_EXITS          (KVM_X86_DISABLE_EXITS_MWAIT | \
                                               KVM_X86_DISABLE_EXITS_HLT | \
-                                              KVM_X86_DISABLE_EXITS_PAUSE)
+                                              KVM_X86_DISABLE_EXITS_PAUSE | \
+                                              KVM_X86_DISABLE_EXITS_CSTATE)
 
 /* for KVM_ENABLE_CAP */
 struct kvm_enable_cap {
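The new KVM_X86_DISABLE_EXITS_CSTATE bit is turned on like the existing exit-disable bits, through KVM_ENABLE_CAP(KVM_CAP_X86_DISABLE_EXITS) on the VM fd. A minimal userspace sketch, assuming an already-created VM file descriptor vm_fd:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: request the new c-state exit-disable bit for one VM. */
static int disable_cstate_exits(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_DISABLE_EXITS,
		.args[0] = KVM_X86_DISABLE_EXITS_CSTATE,
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}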
@@ -993,6 +995,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_SVE 170
 #define KVM_CAP_ARM_PTRAUTH_ADDRESS 171
 #define KVM_CAP_ARM_PTRAUTH_GENERIC 172
+#define KVM_CAP_PMU_EVENT_FILTER 173
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1327,6 +1330,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_PPC_GET_RMMU_INFO    _IOW(KVMIO,  0xb0, struct kvm_ppc_rmmu_info)
 /* Available with KVM_CAP_PPC_GET_CPU_CHAR */
 #define KVM_PPC_GET_CPU_CHAR     _IOR(KVMIO,  0xb1, struct kvm_ppc_cpu_char)
+/* Available with KVM_CAP_PMU_EVENT_FILTER */
+#define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE        _IOWR(KVMIO,  0xe0, struct kvm_create_device)
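The new vm ioctl is gated on KVM_CAP_PMU_EVENT_FILTER. A rough userspace sketch of probing for the capability and installing a filter; the struct kvm_pmu_event_filter layout comes from elsewhere in this series, so it is only passed through as a pointer here:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: kvm_fd is the /dev/kvm system fd, vm_fd the VM fd, and
 * "filter" a filter built according to the layout defined by this
 * series (assumed, not shown here). */
static int set_pmu_event_filter(int kvm_fd, int vm_fd,
				struct kvm_pmu_event_filter *filter)
{
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PMU_EVENT_FILTER) <= 0)
		return -1;	/* not supported by this kernel */

	return ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, filter);
}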
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 6c0ce49931e500da0759804d9c54e1ed4687fc5e..8b86609849b9fa2571f840cf904265f0cd95ca11 100644 (file)
@@ -28,6 +28,7 @@
 #define KVM_HC_MIPS_CONSOLE_OUTPUT     8
 #define KVM_HC_CLOCK_PAIRING           9
 #define KVM_HC_SEND_IPI                10
+#define KVM_HC_SCHED_YIELD             11
 
 /*
  * hypercalls use architecture specific
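A guest-side sketch of how the new hypercall number could be used for a directed yield; kvm_hypercall1() is the existing x86 guest helper, and treating the argument as the APIC ID of the preempted target vCPU is an assumption based on the PV sched-yield work this number belongs to:

#include <asm/kvm_para.h>

/* Sketch only: yield this vCPU's slice to the vCPU that owns
 * dest_apicid (argument convention assumed, see lead-in). */
static void kvm_pv_yield_to(unsigned long dest_apicid)
{
	kvm_hypercall1(KVM_HC_SCHED_YIELD, dest_apicid);
}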
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 2fe12b40d5035a7b459720476e7ab7002b04c9d4..c2152f3dd02d41a559f8d62221ea34313ca93277 100644 (file)
@@ -696,9 +696,11 @@ struct kvm_ioeventfd {
 #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
 #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
 #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
+#define KVM_X86_DISABLE_EXITS_CSTATE         (1 << 3)
 #define KVM_X86_DISABLE_VALID_EXITS          (KVM_X86_DISABLE_EXITS_MWAIT | \
                                               KVM_X86_DISABLE_EXITS_HLT | \
-                                              KVM_X86_DISABLE_EXITS_PAUSE)
+                                              KVM_X86_DISABLE_EXITS_PAUSE | \
+                                              KVM_X86_DISABLE_EXITS_CSTATE)
 
 /* for KVM_ENABLE_CAP */
 struct kvm_enable_cap {
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index fc27f890155baf9308dd38e641ea35605870719d..ceb52b9526375de63cadef26256a110d5e5637b8 100644 (file)
@@ -121,7 +121,6 @@ static void *vcpu_worker(void *data)
        uint64_t *guest_array;
        uint64_t pages_count = 0;
        struct kvm_run *run;
-       struct ucall uc;
 
        run = vcpu_state(vm, VCPU_ID);
 
@@ -132,7 +131,7 @@ static void *vcpu_worker(void *data)
                /* Let the guest dirty the random pages */
                ret = _vcpu_run(vm, VCPU_ID);
                TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
-               if (get_ucall(vm, VCPU_ID, &uc) == UCALL_SYNC) {
+               if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) {
                        pages_count += TEST_PAGES_PER_LOOP;
                        generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
                } else {
diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h
index 9ef2ab1a0c0874276893537334c00cb0f7a513ea..b7fa0c8551db4b2d84dad3fca42572d299e5cdd6 100644 (file)
@@ -52,4 +52,8 @@ static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint
        vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, &reg);
 }
 
+void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init);
+void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid,
+                             struct kvm_vcpu_init *init, void *guest_code);
+
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 7318fb054ae9673924ce20295f01999c9407ecec..00235f5932f04b5b9a11d396168a4cf8418539e5 100644 (file)
@@ -86,8 +86,7 @@ int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
                void *arg);
 void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
-void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot,
-                int gdt_memslot);
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
                          uint32_t data_memslot, uint32_t pgd_memslot);
 void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index 19e667911496cd42bb8a513fd122706e0f6c3d17..af2023d818a5a9dcaee4b08367d2b0557d440048 100644 (file)
@@ -235,28 +235,21 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
        return vm;
 }
 
-void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init)
 {
-       size_t stack_size = vm->page_size == 4096 ?
-                                       DEFAULT_STACK_PGS * vm->page_size :
-                                       vm->page_size;
-       uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size,
-                                       DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0);
+       struct kvm_vcpu_init default_init = { .target = -1, };
+       uint64_t sctlr_el1, tcr_el1;
 
-       vm_vcpu_add(vm, vcpuid, 0, 0);
+       if (!init)
+               init = &default_init;
 
-       set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size);
-       set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code);
-}
-
-void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot)
-{
-       struct kvm_vcpu_init init;
-       uint64_t sctlr_el1, tcr_el1;
+       if (init->target == -1) {
+               struct kvm_vcpu_init preferred;
+               vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &preferred);
+               init->target = preferred.target;
+       }
 
-       memset(&init, 0, sizeof(init));
-       init.target = KVM_ARM_TARGET_GENERIC_V8;
-       vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_INIT, &init);
+       vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_INIT, init);
 
        /*
         * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15
@@ -316,3 +309,24 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
        fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n",
                indent, "", pstate, pc);
 }
+
+void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid,
+                             struct kvm_vcpu_init *init, void *guest_code)
+{
+       size_t stack_size = vm->page_size == 4096 ?
+                                       DEFAULT_STACK_PGS * vm->page_size :
+                                       vm->page_size;
+       uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size,
+                                       DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0);
+
+       vm_vcpu_add(vm, vcpuid);
+       aarch64_vcpu_setup(vm, vcpuid, init);
+
+       set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size);
+       set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code);
+}
+
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+{
+       aarch64_vcpu_add_default(vm, vcpuid, NULL, guest_code);
+}
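With the setup split out, a test can now pass its own kvm_vcpu_init. An illustrative sketch of a caller requesting an extra vCPU feature; the specific PSCI 0.2 bit is chosen only as an example:

#include "kvm_util.h"
#include "processor.h"

/* Sketch only: leaving target as -1 makes aarch64_vcpu_setup() query
 * KVM_ARM_PREFERRED_TARGET; the feature bit is purely illustrative. */
static void add_vcpu_with_psci(struct kvm_vm *vm, void *guest_code)
{
	struct kvm_vcpu_init init = {
		.target = -1,
		.features[0] = 1 << KVM_ARM_VCPU_PSCI_0_2,
	};

	aarch64_vcpu_add_default(vm, 0, &init, guest_code);
}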
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index ee864fa07d8ec4057e721e409c48f15abac90aef..221e3fa4668024843e183549d1acecabfd2076d9 100644 (file)
@@ -763,11 +763,10 @@ static int vcpu_mmap_sz(void)
  *
  * Return: None
  *
- * Creates and adds to the VM specified by vm and virtual CPU with
- * the ID given by vcpuid.
+ * Adds a virtual CPU to the VM specified by vm with the ID given by vcpuid.
+ * No additional VCPU setup is done.
  */
-void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot,
-                int gdt_memslot)
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid)
 {
        struct vcpu *vcpu;
 
@@ -801,8 +800,6 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot,
                vm->vcpu_head->prev = vcpu;
        vcpu->next = vm->vcpu_head;
        vm->vcpu_head = vcpu;
-
-       vcpu_setup(vm, vcpuid, pgd_memslot, gdt_memslot);
 }
 
 /*
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
index 265b7822f591613f4c9ffe218ee0b288b33dbb33..f36262e0f655335677c445f37486c69f3b8c19b8 100644 (file)
@@ -64,8 +64,6 @@ struct kvm_vm {
 };
 
 struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid);
-void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot,
-               int gdt_memslot);
 void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
 void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent);
 void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent);
diff --git a/tools/testing/selftests/kvm/lib/ucall.c b/tools/testing/selftests/kvm/lib/ucall.c
index b701a01cfcb62c619860783fb387c177d352bcbd..dd9a66700f96e5335ffd78128409ac9e49350498 100644 (file)
@@ -125,16 +125,16 @@ void ucall(uint64_t cmd, int nargs, ...)
 uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
 {
        struct kvm_run *run = vcpu_state(vm, vcpu_id);
-
-       memset(uc, 0, sizeof(*uc));
+       struct ucall ucall = {};
+       bool got_ucall = false;
 
 #ifdef __x86_64__
        if (ucall_type == UCALL_PIO && run->exit_reason == KVM_EXIT_IO &&
            run->io.port == UCALL_PIO_PORT) {
                struct kvm_regs regs;
                vcpu_regs_get(vm, vcpu_id, &regs);
-               memcpy(uc, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi), sizeof(*uc));
-               return uc->cmd;
+               memcpy(&ucall, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi), sizeof(ucall));
+               got_ucall = true;
        }
 #endif
        if (ucall_type == UCALL_MMIO && run->exit_reason == KVM_EXIT_MMIO &&
@@ -143,8 +143,15 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
                TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8,
                            "Unexpected ucall exit mmio address access");
                memcpy(&gva, run->mmio.data, sizeof(gva));
-               memcpy(uc, addr_gva2hva(vm, gva), sizeof(*uc));
+               memcpy(&ucall, addr_gva2hva(vm, gva), sizeof(ucall));
+               got_ucall = true;
+       }
+
+       if (got_ucall) {
+               vcpu_run_complete_io(vm, vcpu_id);
+               if (uc)
+                       memcpy(uc, &ucall, sizeof(ucall));
        }
 
-       return uc->cmd;
+       return ucall.cmd;
 }
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index d2ad85fb01ac07d9d3a13c5cbdaa6909e6f7ec32..b430f962e32367270ab11ff23fa11dd178b8bee5 100644 (file)
@@ -609,7 +609,7 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp,
        kvm_seg_fill_gdt_64bit(vm, segp);
 }
 
-void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot)
+static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot)
 {
        struct kvm_sregs sregs;
 
@@ -655,7 +655,8 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
                                     DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0);
 
        /* Create VCPU */
-       vm_vcpu_add(vm, vcpuid, 0, 0);
+       vm_vcpu_add(vm, vcpuid);
+       vcpu_setup(vm, vcpuid, 0, 0);
 
        /* Setup guest general purpose registers */
        vcpu_regs_get(vm, vcpuid, &regs);
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
index b38260e297751e404847222d1018c5e2f073e034..dbf82658f2ef2ffc43204501adad0d3750d14201 100644 (file)
@@ -144,7 +144,7 @@ int main(int argc, char *argv[])
 
                /* Restore state in a new VM.  */
                kvm_vm_restart(vm, O_RDWR);
-               vm_vcpu_add(vm, VCPU_ID, 0, 0);
+               vm_vcpu_add(vm, VCPU_ID);
                vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
                vcpu_load_state(vm, VCPU_ID, state);
                run = vcpu_state(vm, VCPU_ID);
diff --git a/tools/testing/selftests/kvm/x86_64/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/x86_64/kvm_create_max_vcpus.c
index 6a3eec8da351cc9dfed7c4d8fbdb6ad620fbc16f..429226bc6a928392f043f5aeda3a3ed8c11830aa 100644 (file)
@@ -33,7 +33,7 @@ void test_vcpu_creation(int first_vcpu_id, int num_vcpus)
                int vcpu_id = first_vcpu_id + i;
 
                /* This asserts that the vCPU was created. */
-               vm_vcpu_add(vm, vcpu_id, 0, 0);
+               vm_vcpu_add(vm, vcpu_id);
        }
 
        kvm_vm_free(vm);
diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c
index 4daf520bada1d774568e32908c6b7d1f2a0c2311..8c063646f2a00dd293f8fb944615ca870fad7fcf 100644 (file)
@@ -144,7 +144,7 @@ int main(int argc, char *argv[])
                state = vcpu_save_state(vm, VCPU_ID);
                kvm_vm_release(vm);
                kvm_vm_restart(vm, O_RDWR);
-               vm_vcpu_add(vm, VCPU_ID, 0, 0);
+               vm_vcpu_add(vm, VCPU_ID);
                vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
                vcpu_load_state(vm, VCPU_ID, state);
                run = vcpu_state(vm, VCPU_ID);
diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c
index 1a23617f34d9557328e4572ad3a960570a6f2461..3ab5ec3da9f42eaadf6efd12b4bb63f4b69e2487 100644 (file)
@@ -176,7 +176,7 @@ int main(int argc, char *argv[])
 
                /* Restore state in a new VM.  */
                kvm_vm_restart(vm, O_RDWR);
-               vm_vcpu_add(vm, VCPU_ID, 0, 0);
+               vm_vcpu_add(vm, VCPU_ID);
                vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
                vcpu_load_state(vm, VCPU_ID, state);
                run = vcpu_state(vm, VCPU_ID);
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index f149c79fd6ef63643c103a00ee2a5ad7a088773a..f645c0fbf7ecef90aa7007709b90acf05756e558 100644 (file)
@@ -93,9 +93,9 @@ int kvm_arch_hardware_setup(void)
        return 0;
 }
 
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
 {
-       *(int *)rtn = 0;
+       return 0;
 }
 
 
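Since kvm_arch_check_processor_compat() now returns an int, an architecture with a real compatibility requirement can report it directly instead of writing through a pointer. A purely illustrative sketch; cpu_has_required_virt_ext() is a made-up stand-in for an arch-specific feature test:

#include <linux/errno.h>
#include <linux/types.h>

static bool cpu_has_required_virt_ext(void);	/* hypothetical helper */

int kvm_arch_check_processor_compat(void)
{
	/* Reject CPUs that lack the (invented) required extension. */
	return cpu_has_required_virt_ext() ? 0 : -EIO;
}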
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 2e6fc7c66a11311b02c8f7d034b636195e4acce3..58e4f88b2b9fb4ecd0765f6e653f8dc14b5e0423 100644 (file)
@@ -184,9 +184,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
 
        nr_rt_entries += 1;
 
-       new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)),
-                     GFP_KERNEL_ACCOUNT);
-
+       new = kzalloc(struct_size(new, map, nr_rt_entries), GFP_KERNEL_ACCOUNT);
        if (!new)
                return -ENOMEM;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2f2d24a4dd5c2e2ef18d9bbc5f2468a627af6ced..b4ab59dd6846003cd5bfbfa38ca708880af6cdf1 100644 (file)
@@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  *     kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */
 
-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_MUTEX(kvm_lock);
 static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 LIST_HEAD(vm_list);
 
@@ -680,9 +680,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
        if (r)
                goto out_err;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        preempt_notifier_inc();
 
@@ -728,9 +728,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
        kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
        kvm_destroy_vm_debugfs(kvm);
        kvm_arch_sync_events(kvm);
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_del(&kvm->vm_list);
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++) {
                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -1790,7 +1790,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
        if (!map->hva)
                return;
 
-       if (map->page)
+       if (map->page != KVM_UNMAPPED_PAGE)
                kunmap(map->page);
 #ifdef CONFIG_HAS_IOMEM
        else
@@ -4031,13 +4031,13 @@ static int vm_stat_get(void *_offset, u64 *val)
        u64 tmp_val;
 
        *val = 0;
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
                *val += tmp_val;
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
        return 0;
 }
 
@@ -4050,12 +4050,12 @@ static int vm_stat_clear(void *_offset, u64 val)
        if (val)
                return -EINVAL;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vm_stat_clear_per_vm((void *)&stat_tmp, 0);
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        return 0;
 }
@@ -4070,13 +4070,13 @@ static int vcpu_stat_get(void *_offset, u64 *val)
        u64 tmp_val;
 
        *val = 0;
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
                *val += tmp_val;
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
        return 0;
 }
 
@@ -4089,12 +4089,12 @@ static int vcpu_stat_clear(void *_offset, u64 val)
        if (val)
                return -EINVAL;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        return 0;
 }
@@ -4115,7 +4115,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
        if (!kvm_dev.this_device || !kvm)
                return;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        if (type == KVM_EVENT_CREATE_VM) {
                kvm_createvm_count++;
                kvm_active_vms++;
@@ -4124,7 +4124,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
        }
        created = kvm_createvm_count;
        active = kvm_active_vms;
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
        if (!env)
@@ -4221,6 +4221,11 @@ static void kvm_sched_out(struct preempt_notifier *pn,
        kvm_arch_vcpu_put(vcpu);
 }
 
+static void check_processor_compat(void *rtn)
+{
+       *(int *)rtn = kvm_arch_check_processor_compat();
+}
+
 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
                  struct module *module)
 {
@@ -4252,9 +4257,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
                goto out_free_0a;
 
        for_each_online_cpu(cpu) {
-               smp_call_function_single(cpu,
-                               kvm_arch_check_processor_compat,
-                               &r, 1);
+               smp_call_function_single(cpu, check_processor_compat, &r, 1);
                if (r < 0)
                        goto out_free_1;
        }