Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
arch/x86/kvm/vmx/vmx.c
index be7c19374fdd94d7347ea4c83588e9454e164530..d7f8331d6f7e72c0b90d3b6596b1374bd69b374c 100644 (file)
@@ -105,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO);
 
 module_param(enable_apicv, bool, S_IRUGO);
 
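+/*
+ * Controls use of IPI virtualization (IPIv).  It is ignored unless the CPU
+ * supports IPIv and APICv is enabled (see hardware_setup()).
+ */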
+bool __read_mostly enable_ipiv = true;
+module_param(enable_ipiv, bool, 0444);
+
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -116,6 +119,9 @@ module_param(nested, bool, S_IRUGO);
 bool __read_mostly enable_pml = 1;
 module_param_named(pml, enable_pml, bool, S_IRUGO);
 
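+/*
+ * If set, inconsistencies in the VMCS capability MSRs, e.g. EPT/VPID caps
+ * reported without the corresponding execution control or unpaired
+ * VM-Entry/VM-Exit controls, cause setup_vmcs_config() to fail with -EIO
+ * instead of silently dropping the affected feature.
+ */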
+static bool __read_mostly error_on_inconsistent_vmcs_config = true;
+module_param(error_on_inconsistent_vmcs_config, bool, 0444);
+
 static bool __read_mostly dump_invalid_vmcs = 0;
 module_param(dump_invalid_vmcs, bool, 0644);
 
@@ -443,18 +449,20 @@ asmlinkage void vmread_error(unsigned long field, bool fault)
 
 noinline void vmwrite_error(unsigned long field, unsigned long value)
 {
-       vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
+       vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n",
                        field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 }
 
 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
 {
-       vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
+       vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n",
+                       vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
 }
 
 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
 {
-       vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
+       vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n",
+                       vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
 }
 
 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
@@ -1787,7 +1795,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
            nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
                return vmcs12->tsc_multiplier;
 
-       return kvm_default_tsc_scaling_ratio;
+       return kvm_caps.default_tsc_scaling_ratio;
 }
 
 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -2111,6 +2119,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
                    (data & MSR_IA32_BNDCFGS_RSVD))
                        return 1;
+
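+               /*
+                * If L2 is active and L1 can load BNDCFGS on VM-Entry or
+                * clear it on VM-Exit, mirror the new value into vmcs12's
+                * guest_bndcfgs as well.
+                */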
+               if (is_guest_mode(vcpu) &&
+                   ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
+                    (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
+                       get_vmcs12(vcpu)->guest_bndcfgs = data;
+
                vmcs_write64(GUEST_BNDCFGS, data);
                break;
        case MSR_IA32_UMWAIT_CONTROL:
@@ -2312,7 +2326,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        if ((data & PMU_CAP_LBR_FMT) !=
                            (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
                                return 1;
-                       if (!intel_pmu_lbr_is_compatible(vcpu))
+                       if (!cpuid_model_is_consistent(vcpu))
+                               return 1;
+               }
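+               /*
+                * Advertising PEBS to the guest requires the PEBS format to
+                * match the host's, and the guest CPUID to provide DS,
+                * DTES64 and a consistent vendor/family/model.
+                */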
+               if (data & PERF_CAP_PEBS_FORMAT) {
+                       if ((data & PERF_CAP_PEBS_MASK) !=
+                           (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK))
+                               return 1;
+                       if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
+                               return 1;
+                       if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
+                               return 1;
+                       if (!cpuid_model_is_consistent(vcpu))
                                return 1;
                }
                ret = kvm_set_msr_common(vcpu, msr_info);
@@ -2489,6 +2514,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
        return 0;
 }
 
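+/*
+ * 64-bit control MSRs, e.g. the tertiary exec controls, report only the
+ * allowed-1 settings (there are no required-1 bits), so simply mask the
+ * desired optional controls against what the CPU allows.
+ */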
+static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+{
+       u64 allowed;
+
+       rdmsrl(msr, allowed);
+
+       return ctl_opt & allowed;
+}
+
 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                                    struct vmx_capability *vmx_cap)
 {
@@ -2497,8 +2531,26 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
        u32 _pin_based_exec_control = 0;
        u32 _cpu_based_exec_control = 0;
        u32 _cpu_based_2nd_exec_control = 0;
+       u64 _cpu_based_3rd_exec_control = 0;
        u32 _vmexit_control = 0;
        u32 _vmentry_control = 0;
+       int i;
+
+       /*
+        * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
+        * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
+        * intercepts writes to PAT and EFER, i.e. never enables those controls.
+        */
+       struct {
+               u32 entry_control;
+               u32 exit_control;
+       } const vmcs_entry_exit_pairs[] = {
+               { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,  VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
+               { VM_ENTRY_LOAD_IA32_PAT,               VM_EXIT_LOAD_IA32_PAT },
+               { VM_ENTRY_LOAD_IA32_EFER,              VM_EXIT_LOAD_IA32_EFER },
+               { VM_ENTRY_LOAD_BNDCFGS,                VM_EXIT_CLEAR_BNDCFGS },
+               { VM_ENTRY_LOAD_IA32_RTIT_CTL,          VM_EXIT_CLEAR_IA32_RTIT_CTL },
+       };
 
        memset(vmcs_conf, 0, sizeof(*vmcs_conf));
        min = CPU_BASED_HLT_EXITING |
@@ -2518,7 +2570,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 
        opt = CPU_BASED_TPR_SHADOW |
              CPU_BASED_USE_MSR_BITMAPS |
-             CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+             CPU_BASED_ACTIVATE_SECONDARY_CONTROLS |
+             CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
                                &_cpu_based_exec_control) < 0)
                return -EIO;
@@ -2551,7 +2604,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                        SECONDARY_EXEC_PT_USE_GPA |
                        SECONDARY_EXEC_PT_CONCEAL_VMX |
                        SECONDARY_EXEC_ENABLE_VMFUNC |
-                       SECONDARY_EXEC_BUS_LOCK_DETECTION;
+                       SECONDARY_EXEC_BUS_LOCK_DETECTION |
+                       SECONDARY_EXEC_NOTIFY_VM_EXITING;
                if (cpu_has_sgx())
                        opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
                if (adjust_vmx_controls(min2, opt2,
@@ -2581,15 +2635,30 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                                             CPU_BASED_CR3_STORE_EXITING |
                                             CPU_BASED_INVLPG_EXITING);
        } else if (vmx_cap->ept) {
-               vmx_cap->ept = 0;
                pr_warn_once("EPT CAP should not exist if not support "
                                "1-setting enable EPT VM-execution control\n");
+
+               if (error_on_inconsistent_vmcs_config)
+                       return -EIO;
+
+               vmx_cap->ept = 0;
        }
        if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
-               vmx_cap->vpid) {
-               vmx_cap->vpid = 0;
+           vmx_cap->vpid) {
                pr_warn_once("VPID CAP should not exist if not support "
                                "1-setting enable VPID VM-execution control\n");
+
+               if (error_on_inconsistent_vmcs_config)
+                       return -EIO;
+
+               vmx_cap->vpid = 0;
+       }
+
+       if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) {
+               u64 opt3 = TERTIARY_EXEC_IPI_VIRT;
+
+               _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3,
+                                             MSR_IA32_VMX_PROCBASED_CTLS3);
        }
 
        min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
@@ -2630,6 +2699,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                                &_vmentry_control) < 0)
                return -EIO;
 
+       for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
+               u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
+               u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
+
+               if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
+                       continue;
+
+               pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
+                            _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
+
+               if (error_on_inconsistent_vmcs_config)
+                       return -EIO;
+
+               _vmentry_control &= ~n_ctrl;
+               _vmexit_control &= ~x_ctrl;
+       }
+
        /*
         * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
         * can't be used due to an errata where VM Exit may incorrectly clear
@@ -2678,6 +2764,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
        vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+       vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
        vmcs_conf->vmexit_ctrl         = _vmexit_control;
        vmcs_conf->vmentry_ctrl        = _vmentry_control;
 
@@ -3230,8 +3317,8 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        /*
         * We operate under the default treatment of SMM, so VMX cannot be
-        * enabled under SMM.  Note, whether or not VMXE is allowed at all is
-        * handled by kvm_is_valid_cr4().
+        * enabled under SMM.  Note, whether or not VMXE is allowed at all,
+        * i.e. is a reserved bit, is handled by common x86 code.
         */
        if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
                return false;
@@ -3702,7 +3789,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
        }
 
        /* Set up identity-mapping pagetable for EPT in real mode */
-       for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
+       for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
                tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
                        _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
                if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
@@ -3932,6 +4019,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
                vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
                vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
                vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+               if (enable_ipiv)
+                       vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
        }
 }
 
@@ -3977,20 +4066,26 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
        u32 i;
 
        /*
-        * Set intercept permissions for all potentially passed through MSRs
-        * again. They will automatically get filtered through the MSR filter,
-        * so we are back in sync after this.
+        * Redo intercept permissions for MSRs that KVM is passing through to
+        * the guest.  Disabling interception will check the new MSR filter and
+        * ensure that KVM enables interception if userspace wants to filter
+        * the MSR.  MSRs that KVM is already intercepting don't need to be
+        * refreshed since KVM is going to intercept them regardless of what
+        * userspace wants.
         */
        for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
                u32 msr = vmx_possible_passthrough_msrs[i];
-               bool read = test_bit(i, vmx->shadow_msr_intercept.read);
-               bool write = test_bit(i, vmx->shadow_msr_intercept.write);
 
-               vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
-               vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
+               if (!test_bit(i, vmx->shadow_msr_intercept.read))
+                       vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
+
+               if (!test_bit(i, vmx->shadow_msr_intercept.write))
+                       vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
        }
 
-       pt_update_intercept_for_msr(vcpu);
+       /* PT MSRs can be passed through iff PT is exposed to the guest. */
+       if (vmx_pt_mode_is_host_guest())
+               pt_update_intercept_for_msr(vcpu);
 }
 
 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4085,7 +4180,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
        if (!r)
                return 0;
 
-       if (!vcpu->arch.apicv_active)
+       /* Note, this is called iff the local APIC is in-kernel. */
+       if (!vcpu->arch.apic->apicv_active)
                return -1;
 
        if (pi_test_and_set_pir(vector, &vmx->pi_desc))
@@ -4259,15 +4355,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
        }
 
        pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
-       if (cpu_has_secondary_exec_ctrls()) {
-               if (kvm_vcpu_apicv_active(vcpu))
-                       secondary_exec_controls_setbit(vmx,
-                                     SECONDARY_EXEC_APIC_REGISTER_VIRT |
-                                     SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
-               else
-                       secondary_exec_controls_clearbit(vmx,
-                                       SECONDARY_EXEC_APIC_REGISTER_VIRT |
-                                       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
+       if (kvm_vcpu_apicv_active(vcpu)) {
+               secondary_exec_controls_setbit(vmx,
+                                              SECONDARY_EXEC_APIC_REGISTER_VIRT |
+                                              SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+               if (enable_ipiv)
+                       tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
+       } else {
+               secondary_exec_controls_clearbit(vmx,
+                                                SECONDARY_EXEC_APIC_REGISTER_VIRT |
+                                                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+               if (enable_ipiv)
+                       tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
        }
 
        vmx_update_msr_bitmap_x2apic(vcpu);
@@ -4299,6 +4399,20 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
        return exec_control;
 }
 
+static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
+{
+       u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
+
+       /*
+        * IPI virtualization relies on APICv. Disable IPI virtualization if
+        * APICv is inhibited.
+        */
+       if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
+               exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
+
+       return exec_control;
+}
+
 /*
  * Adjust a single secondary execution control bit to intercept/allow an
  * instruction in the guest.  This is usually done based on whether or not a
@@ -4441,13 +4555,48 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
        if (!vcpu->kvm->arch.bus_lock_detection_enabled)
                exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
 
+       if (!kvm_notify_vmexit_enabled(vcpu->kvm))
+               exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
+
        return exec_control;
 }
 
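+/*
+ * The PID-pointer table used by IPI virtualization is indexed by vCPU ID, so
+ * size it for the VM's maximum vCPU ID rather than the current vCPU count.
+ */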
+static inline int vmx_get_pid_table_order(struct kvm *kvm)
+{
+       return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
+}
+
+static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
+{
+       struct page *pages;
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+       if (!irqchip_in_kernel(kvm) || !enable_ipiv)
+               return 0;
+
+       if (kvm_vmx->pid_table)
+               return 0;
+
+       pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
+       if (!pages)
+               return -ENOMEM;
+
+       kvm_vmx->pid_table = (void *)page_address(pages);
+       return 0;
+}
+
+static int vmx_vcpu_precreate(struct kvm *kvm)
+{
+       return vmx_alloc_ipiv_pid_table(kvm);
+}
+
 #define VMX_XSS_EXIT_BITMAP 0
 
 static void init_vmcs(struct vcpu_vmx *vmx)
 {
+       struct kvm *kvm = vmx->vcpu.kvm;
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
        if (nested)
                nested_vmx_set_vmcs_shadowing_bitmap();
 
@@ -4464,6 +4613,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
        if (cpu_has_secondary_exec_ctrls())
                secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
 
+       if (cpu_has_tertiary_exec_ctrls())
+               tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
+
        if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
                vmcs_write64(EOI_EXIT_BITMAP1, 0);
@@ -4476,12 +4628,20 @@ static void init_vmcs(struct vcpu_vmx *vmx)
                vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
        }
 
-       if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
+       if (vmx_can_use_ipiv(&vmx->vcpu)) {
+               vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
+               vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
+       }
+
+       if (!kvm_pause_in_guest(kvm)) {
                vmcs_write32(PLE_GAP, ple_gap);
                vmx->ple_window = ple_window;
                vmx->ple_window_dirty = true;
        }
 
+       if (kvm_notify_vmexit_enabled(kvm))
+               vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
@@ -4652,13 +4812,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
        exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
 }
 
-static void vmx_inject_irq(struct kvm_vcpu *vcpu)
+static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        uint32_t intr;
        int irq = vcpu->arch.interrupt.nr;
 
-       trace_kvm_inj_virq(irq);
+       trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
 
        ++vcpu->stat.irq_injections;
        if (vmx->rmode.vm86_active) {
@@ -5770,6 +5930,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_notify(struct kvm_vcpu *vcpu)
+{
+       unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+       bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
+
+       ++vcpu->stat.notify_window_exits;
+
+       /*
+        * If the Notify VM exit happened while executing IRET from NMI, the
+        * "blocked by NMI" bit has to be set before the next VM entry.
+        */
+       if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
+               vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+                             GUEST_INTR_STATE_NMI);
+
+       if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
+           context_invalid) {
+               vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
+               vcpu->run->notify.flags = context_invalid ?
+                                         KVM_NOTIFY_CONTEXT_INVALID : 0;
+               return 0;
+       }
+
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -5827,6 +6013,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
        [EXIT_REASON_ENCLS]                   = handle_encls,
        [EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
+       [EXIT_REASON_NOTIFY]                  = handle_notify,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -5924,6 +6111,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vmentry_ctl, vmexit_ctl;
        u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
+       u64 tertiary_exec_control;
        unsigned long cr4;
        int efer_slot;
 
@@ -5937,9 +6125,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
        cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
        cr4 = vmcs_readl(GUEST_CR4);
-       secondary_exec_control = 0;
+
        if (cpu_has_secondary_exec_ctrls())
                secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+       else
+               secondary_exec_control = 0;
+
+       if (cpu_has_tertiary_exec_ctrls())
+               tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
+       else
+               tertiary_exec_control = 0;
 
        pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
               vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
@@ -6039,9 +6234,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
                vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
 
        pr_err("*** Control State ***\n");
-       pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
-              pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
-       pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+       pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
+              cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
+       pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
+              pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
        pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
               vmcs_read32(EXCEPTION_BITMAP),
               vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
@@ -6191,7 +6387,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
             exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
             exit_reason.basic != EXIT_REASON_PML_FULL &&
             exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
-            exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
+            exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
+            exit_reason.basic != EXIT_REASON_NOTIFY)) {
                int ndata = 3;
 
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -6453,7 +6650,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
        put_page(page);
 }
 
-static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+static void vmx_hwapic_isr_update(int max_isr)
 {
        u16 status;
        u8 old;
@@ -6783,9 +6980,14 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 {
        int i, nr_msrs;
        struct perf_guest_switch_msr *msrs;
+       struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
+
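+       /*
+        * Recompute which enabled guest PEBS counters are mapped to a
+        * different host counter before asking perf for the MSR switch list.
+        */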
+       pmu->host_cross_mapped_mask = 0;
+       if (pmu->pebs_enable & pmu->global_ctrl)
+               intel_pmu_cross_mapped_check(pmu);
 
        /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
-       msrs = perf_guest_get_msrs(&nr_msrs);
+       msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
        if (!msrs)
                return;
 
@@ -7166,6 +7368,10 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
                        goto free_vmcs;
        }
 
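+       /*
+        * Publish this vCPU's posted-interrupt descriptor in the per-VM
+        * PID-pointer table so IPI virtualization can target the vCPU.
+        */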
+       if (vmx_can_use_ipiv(vcpu))
+               WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
+                          __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
+
        return 0;
 
 free_vmcs:
@@ -7234,7 +7440,7 @@ static int __init vmx_check_processor_compat(void)
        return 0;
 }
 
-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
        u8 cache;
 
@@ -7310,7 +7516,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
                vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);     \
 } while (0)
 
-       entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+       entry = kvm_find_cpuid_entry(vcpu, 0x1);
        cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
        cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
        cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
@@ -7326,7 +7532,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
        cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
        cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));
 
-       entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+       entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
        cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
        cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
        cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
@@ -7337,23 +7543,6 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
 #undef cr4_fixed1_update
 }
 
-static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       if (kvm_mpx_supported()) {
-               bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
-
-               if (mpx_enabled) {
-                       vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
-                       vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
-               } else {
-                       vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
-                       vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
-               }
-       }
-}
-
 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7361,7 +7550,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
        int i;
 
        for (i = 0; i < PT_CPUID_LEAVES; i++) {
-               best = kvm_find_cpuid_entry(vcpu, 0x14, i);
+               best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
                if (!best)
                        return;
                vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
@@ -7445,10 +7634,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                        ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
                          FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
 
-       if (nested_vmx_allowed(vcpu)) {
+       if (nested_vmx_allowed(vcpu))
                nested_vmx_cr_fixed1_bits_update(vcpu);
-               nested_vmx_entry_exit_ctls_update(vcpu);
-       }
 
        if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
                        guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
@@ -7502,6 +7689,13 @@ static __init void vmx_set_cpu_caps(void)
                kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
        if (vmx_pt_mode_is_host_guest())
                kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+       if (vmx_pebs_supported()) {
+               kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
+               kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
+       }
+
+       if (!enable_pmu)
+               kvm_cpu_cap_clear(X86_FEATURE_PDCM);
 
        if (!enable_sgx) {
                kvm_cpu_cap_clear(X86_FEATURE_SGX);
@@ -7514,7 +7708,7 @@ static __init void vmx_set_cpu_caps(void)
                kvm_cpu_cap_set(X86_FEATURE_UMIP);
 
        /* CPUID 0xD.1 */
-       supported_xss = 0;
+       kvm_caps.supported_xss = 0;
        if (!cpu_has_vmx_xsaves())
                kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
 
@@ -7655,9 +7849,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
                delta_tsc = 0;
 
        /* Convert to host delta tsc if tsc scaling is enabled */
-       if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+       if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
            delta_tsc && u64_shl_div_u64(delta_tsc,
-                               kvm_tsc_scaling_ratio_frac_bits,
+                               kvm_caps.tsc_scaling_ratio_frac_bits,
                                vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
                return -ERANGE;
 
@@ -7729,6 +7923,13 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+       /*
+        * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
+        * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is wrong
+        * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is wrong;
+        * E.g. most MSRs are left untouched, but many are modified by VM-Exit
+        * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
+        */
        vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
        if (vmx->nested.smm.guest_mode)
                nested_vmx_vmexit(vcpu, -1, 0, 0);
@@ -7802,6 +8003,13 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
        return supported & BIT(reason);
 }
 
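+/* Free the PID-pointer table allocated for IPI virtualization, if any. */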
+static void vmx_vm_destroy(struct kvm *kvm)
+{
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+       free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
+}
+
 static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .name = "kvm_intel",
 
@@ -7813,7 +8021,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
        .vm_size = sizeof(struct kvm_vmx),
        .vm_init = vmx_vm_init,
+       .vm_destroy = vmx_vm_destroy,
 
+       .vcpu_precreate = vmx_vcpu_precreate,
        .vcpu_create = vmx_vcpu_create,
        .vcpu_free = vmx_vcpu_free,
        .vcpu_reset = vmx_vcpu_reset,
@@ -8027,8 +8237,8 @@ static __init int hardware_setup(void)
        }
 
        if (!cpu_has_vmx_mpx())
-               supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
-                                   XFEATURE_MASK_BNDCSR);
+               kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+                                            XFEATURE_MASK_BNDCSR);
 
        if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
            !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
@@ -8091,12 +8301,16 @@ static __init int hardware_setup(void)
        if (!enable_apicv)
                vmx_x86_ops.sync_pir_to_irr = NULL;
 
+       if (!enable_apicv || !cpu_has_vmx_ipiv())
+               enable_ipiv = false;
+
        if (cpu_has_vmx_tsc_scaling())
-               kvm_has_tsc_control = true;
+               kvm_caps.has_tsc_control = true;
 
-       kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
-       kvm_tsc_scaling_ratio_frac_bits = 48;
-       kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+       kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+       kvm_caps.tsc_scaling_ratio_frac_bits = 48;
+       kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+       kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
 
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
@@ -8153,11 +8367,12 @@ static __init int hardware_setup(void)
                vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
        }
 
-       kvm_mce_cap_supported |= MCG_LMCE_P;
+       kvm_caps.supported_mce_cap |= MCG_LMCE_P;
+       kvm_caps.supported_mce_cap |= MCG_CMCI_P;
 
        if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
                return -EINVAL;
-       if (!enable_ept || !cpu_has_vmx_intel_pt())
+       if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
                pt_mode = PT_MODE_SYSTEM;
        if (pt_mode == PT_MODE_HOST_GUEST)
                vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;