Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Sep 2017 22:43:55 +0000 (15:43 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Sep 2017 22:43:55 +0000 (15:43 -0700)
Pull more KVM updates from Paolo Bonzini:
 - PPC bugfixes
 - RCU splat fix
 - swait races fix
 - pointless userspace-triggerable BUG() fix
 - misc fixes for KVM_RUN corner cases
 - nested virt correctness fixes + one host DoS
 - some cleanups
 - clang build fix
 - fix AMD AVIC with default QEMU command line options
 - x86 bugfixes

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
  kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly
  kvm: vmx: Handle VMLAUNCH/VMRESUME failure properly
  kvm: nVMX: Remove nested_vmx_succeed after successful VM-entry
  kvm,mips: Fix potential swait_active() races
  kvm,powerpc: Serialize wq active checks in ops->vcpu_kick
  kvm: Serialize wq active checks in kvm_vcpu_wake_up()
  kvm,x86: Fix apf_task_wake_one() wq serialization
  kvm,lapic: Justify use of swait_active()
  kvm,async_pf: Use swq_has_sleeper()
  sched/wait: Add swq_has_sleeper()
  KVM: VMX: Do not BUG() on out-of-bounds guest IRQ
  KVM: Don't accept obviously wrong gsi values via KVM_IRQFD
  kvm: nVMX: Don't allow L2 to access the hardware CR8
  KVM: trace events: update list of exit reasons
  KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously
  KVM: X86: Don't block vCPU if there is pending exception
  KVM: SVM: Add irqchip_split() checks before enabling AVIC
  KVM: Add struct kvm_vcpu pointer parameter to get_enable_apicv()
  KVM: SVM: Refactor AVIC vcpu initialization into avic_init_vcpu()
  KVM: x86: fix clang build
  ...

18 files changed:
arch/mips/kvm/mips.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_rm_xive.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_xive.c
arch/powerpc/kvm/book3s_xive_template.c
arch/x86/include/asm/kvm_host.h
arch/x86/kernel/kvm.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/lapic.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/linux/swait.h
include/trace/events/kvm.h
virt/kvm/async_pf.c
virt/kvm/eventfd.c
virt/kvm/kvm_main.c

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index bce2a6431430366ee626ddc4f6282665caeb7f06..d535edc01434117a8809fc21fb152226e0b46521 100644
@@ -514,7 +514,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 
        dvcpu->arch.wait = 0;
 
-       if (swait_active(&dvcpu->wq))
+       if (swq_has_sleeper(&dvcpu->wq))
                swake_up(&dvcpu->wq);
 
        return 0;
@@ -1179,7 +1179,7 @@ static void kvm_mips_comparecount_func(unsigned long data)
        kvm_mips_callbacks->queue_timer_int(vcpu);
 
        vcpu->arch.wait = 0;
-       if (swait_active(&vcpu->wq))
+       if (swq_has_sleeper(&vcpu->wq))
                swake_up(&vcpu->wq);
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 18e974a34fced58328ecb062fee539279f3df7cd..73bf1ebfa78fcc7ef74dd51713d800ea12e9e745 100644
@@ -181,7 +181,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
        struct swait_queue_head *wqp;
 
        wqp = kvm_arch_vcpu_wq(vcpu);
-       if (swait_active(wqp)) {
+       if (swq_has_sleeper(wqp)) {
                swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
@@ -4212,11 +4212,13 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
        if ((cfg->process_table & PRTS_MASK) > 24)
                return -EINVAL;
 
+       mutex_lock(&kvm->lock);
        kvm->arch.process_table = cfg->process_table;
        kvmppc_setup_partition_table(kvm);
 
        lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
        kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
+       mutex_unlock(&kvm->lock);
 
        return 0;
 }
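
The mutex_lock/mutex_unlock pair above serializes kvmhv_configure_mmu() against concurrent callers, presumably because the process-table write, kvmppc_setup_partition_table() and kvmppc_update_lpcr() must be observed as one atomic configuration update under kvm->lock. A userspace sketch of the same pattern (pthread stand-ins and illustrative names, not kernel code):

    #include <pthread.h>

    struct vm_mmu_cfg {
            pthread_mutex_t lock;           /* plays the role of kvm->lock */
            unsigned long process_table;
            unsigned long lpcr;
    };

    static void configure_mmu(struct vm_mmu_cfg *vm,
                              unsigned long ptbl, unsigned long lpcr_bits)
    {
            pthread_mutex_lock(&vm->lock);
            vm->process_table = ptbl;       /* paired update #1 */
            vm->lpcr = lpcr_bits;           /* paired update #2 */
            pthread_mutex_unlock(&vm->lock);
    }
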
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c
index abf5f01b6eb1f04b05d0643ddc46bb8ded237d8f..5b81a807d742c8b11018ead176b60590064f5d7f 100644
@@ -38,7 +38,6 @@ static inline void __iomem *get_tima_phys(void)
 #define __x_tima               get_tima_phys()
 #define __x_eoi_page(xd)       ((void __iomem *)((xd)->eoi_page))
 #define __x_trig_page(xd)      ((void __iomem *)((xd)->trig_page))
-#define __x_readb      __raw_rm_readb
 #define __x_writeb     __raw_rm_writeb
 #define __x_readw      __raw_rm_readw
 #define __x_readq      __raw_rm_readq
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 663a4a861e7f3600903c61df65240fe32e58e5d3..17936f82d3c787c38327f41186f634d043d843a9 100644
@@ -771,6 +771,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+       /*
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        */
        bl      kvmppc_restore_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
@@ -1630,6 +1633,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+       /*
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        */
        bl      kvmppc_save_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
@@ -1749,7 +1755,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
        /*
         * Are we running hash or radix ?
         */
-       beq     cr2,3f
+       ld      r5, VCPU_KVM(r9)
+       lbz     r0, KVM_RADIX(r5)
+       cmpwi   cr2, r0, 0
+       beq     cr2, 3f
 
        /* Radix: Handle the case where the guest used an illegal PID */
        LOAD_REG_ADDR(r4, mmu_base_pid)
@@ -2466,6 +2475,9 @@ _GLOBAL(kvmppc_h_cede)            /* r3 = vcpu pointer, r11 = msr, r13 = paca */
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+       /*
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        */
        ld      r9, HSTATE_KVM_VCPU(r13)
        bl      kvmppc_save_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
@@ -2578,6 +2590,9 @@ kvm_end_cede:
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+       /*
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        */
        bl      kvmppc_restore_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 08b200a0bbcec8c47a3128f4685ffcbfad261552..13304622ab1c78682fa18f06120f5eb0970f57fb 100644
@@ -48,7 +48,6 @@
 #define __x_tima               xive_tima
 #define __x_eoi_page(xd)       ((void __iomem *)((xd)->eoi_mmio))
 #define __x_trig_page(xd)      ((void __iomem *)((xd)->trig_mmio))
-#define __x_readb      __raw_readb
 #define __x_writeb     __raw_writeb
 #define __x_readw      __raw_readw
 #define __x_readq      __raw_readq
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index d1ed2c41b5d246dde6d53c34530bc30dc63656b9..c7a5deadd1cc782ddd45667c9fc29f96166e85e7 100644
@@ -28,7 +28,8 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
         * bit.
         */
        if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
-               u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
+               __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS);
+               u8 pipr = be64_to_cpu(qw1) & 0xff;
                if (pipr >= xc->hw_cppr)
                        return;
        }
@@ -336,7 +337,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        u8 pending = xc->pending;
        u32 hirq;
-       u8 pipr;
 
        pr_devel("H_IPOLL(server=%ld)\n", server);
 
@@ -353,7 +353,8 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long
                pending = 0xff;
        } else {
                /* Grab pending interrupt if any */
-               pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
+               __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS);
+               u8 pipr = be64_to_cpu(qw1) & 0xff;
                if (pipr < 8)
                        pending |= 1 << pipr;
        }
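
Both hunks above replace a byte-sized TIMA load with a 64-bit read of the whole TM_QW1_OS word (note the matching removal of __x_readb from the two headers earlier), extracting PIPR from the low byte once the value is byte-swapped, which avoids byte-sized MMIO loads on POWER9 DD1. A standalone sketch of the extraction, assuming PIPR is the last byte of the big-endian qword, which is what the `& 0xff` implies:

    #include <stdint.h>
    #include <stdio.h>

    /* Portable stand-in for the kernel's be64_to_cpu(): fold the 8 bytes
     * MSB-first so the qword's final byte ends up in bits 7:0. */
    static uint64_t be64_to_cpu_demo(const uint8_t b[8])
    {
            uint64_t v = 0;
            for (int i = 0; i < 8; i++)
                    v = (v << 8) | b[i];
            return v;
    }

    int main(void)
    {
            /* Fake TM_QW1_OS contents with PIPR = 5 in the final byte. */
            uint8_t qw1[8] = { 0, 0, 0, 0, 0, 0, 0, 5 };
            uint8_t pipr = be64_to_cpu_demo(qw1) & 0xff;
            printf("pipr = %u\n", pipr);    /* prints 5 */
            return 0;
    }
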
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8844eee290b268b11a8404b1d3305551c6b98203..c73e493adf0748108c31389ca62437267ea877e2 100644
@@ -951,7 +951,6 @@ struct kvm_x86_ops {
        void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
        unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
        void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
-       u32 (*get_pkru)(struct kvm_vcpu *vcpu);
 
        void (*tlb_flush)(struct kvm_vcpu *vcpu);
 
@@ -973,7 +972,7 @@ struct kvm_x86_ops {
        void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
        void (*enable_irq_window)(struct kvm_vcpu *vcpu);
        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
-       bool (*get_enable_apicv)(void);
+       bool (*get_enable_apicv)(struct kvm_vcpu *vcpu);
        void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
        void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
        void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 874827b0d7ca2e3a0f79d40ee7c2b9e84e9bb583..aa60a08b65b1090392b542ec7dc642e9827b7606 100644
@@ -180,7 +180,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
        hlist_del_init(&n->link);
        if (n->halted)
                smp_send_reschedule(n->cpu);
-       else if (swait_active(&n->wq))
+       else if (swq_has_sleeper(&n->wq))
                swake_up(&n->wq);
 }
 
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 1ea3c0e1e3a9a3df6d6061e1e367abf632c50b79..0bc5c1315708e6aad8b409d377be41e2291efb40 100644
@@ -59,7 +59,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
 {
        unsigned x86_leaf = x86_feature / 32;
 
-       BUILD_BUG_ON(!__builtin_constant_p(x86_leaf));
        BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
        BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index aaf10b6f5380d6d41531b25365b433eac9d42c48..69c5612be786fbc7909b6f81a8d0a29f2cd324a6 100644
@@ -1324,6 +1324,10 @@ static void apic_timer_expired(struct kvm_lapic *apic)
        atomic_inc(&apic->lapic_timer.pending);
        kvm_set_pending_timer(vcpu);
 
+       /*
+        * For x86, the atomic_inc() is serialized, thus
+        * using swait_active() is safe.
+        */
        if (swait_active(q))
                swake_up(q);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2c1cfe68a9af1e11761f1938deea2d6d1fe51ab5..0e68f0b3cbf72064f36bfedcb997305fde124911 100644
@@ -1200,7 +1200,6 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
        vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
        vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
        vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
-       svm->vcpu.arch.apicv_active = true;
 }
 
 static void init_vmcb(struct vcpu_svm *svm)
@@ -1316,7 +1315,7 @@ static void init_vmcb(struct vcpu_svm *svm)
                set_intercept(svm, INTERCEPT_PAUSE);
        }
 
-       if (avic)
+       if (kvm_vcpu_apicv_active(&svm->vcpu))
                avic_init_vmcb(svm);
 
        /*
@@ -1600,6 +1599,23 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
+static int avic_init_vcpu(struct vcpu_svm *svm)
+{
+       int ret;
+
+       if (!kvm_vcpu_apicv_active(&svm->vcpu))
+               return 0;
+
+       ret = avic_init_backing_page(&svm->vcpu);
+       if (ret)
+               return ret;
+
+       INIT_LIST_HEAD(&svm->ir_list);
+       spin_lock_init(&svm->ir_list_lock);
+
+       return ret;
+}
+
 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 {
        struct vcpu_svm *svm;
@@ -1636,14 +1652,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
        if (!hsave_page)
                goto free_page3;
 
-       if (avic) {
-               err = avic_init_backing_page(&svm->vcpu);
-               if (err)
-                       goto free_page4;
-
-               INIT_LIST_HEAD(&svm->ir_list);
-               spin_lock_init(&svm->ir_list_lock);
-       }
+       err = avic_init_vcpu(svm);
+       if (err)
+               goto free_page4;
 
        /* We initialize this flag to true to make sure that the is_running
         * bit would be set the first time the vcpu is loaded.
@@ -4395,9 +4406,9 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
        return;
 }
 
-static bool svm_get_enable_apicv(void)
+static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
 {
-       return avic;
+       return avic && irqchip_split(vcpu->kvm);
 }
 
 static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
@@ -4414,7 +4425,7 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;
 
-       if (!avic)
+       if (!kvm_vcpu_apicv_active(&svm->vcpu))
                return;
 
        vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
@@ -5302,6 +5313,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
                 */
                if (info->rep_prefix != REPE_PREFIX)
                        goto out;
+               break;
        case SVM_EXIT_IOIO: {
                u64 exit_info;
                u32 bytes;
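
The one-line `break;` above cures an unintended switch fall-through: when the REP-prefix check passed, control previously fell straight into the SVM_EXIT_IOIO case below it. A minimal illustration of the bug class (illustrative names, not the kernel's logic):

    #include <stdio.h>

    static const char *classify(int exit_code)
    {
            switch (exit_code) {
            case 1:
                    if (exit_code != 1)
                            return "skipped";
                    break;          /* without this, falls into case 2 */
            case 2:
                    return "io";
            }
            return "checked";
    }

    int main(void)
    {
            /* With the break this prints "checked"; without it, "io". */
            printf("%s\n", classify(1));
            return 0;
    }
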
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 699704d4bc9e7716e04da7298ec7f4491b436af1..06c0c6d0541e9bf95eabbcaa8d20c8ec45f19496 100644
@@ -5012,7 +5012,7 @@ static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_activ
        }
 }
 
-static bool vmx_get_enable_apicv(void)
+static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
 {
        return enable_apicv;
 }
@@ -8344,12 +8344,14 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-       trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
-                               vmcs_readl(EXIT_QUALIFICATION),
-                               vmx->idt_vectoring_info,
-                               intr_info,
-                               vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
-                               KVM_ISA_VMX);
+       if (vmx->nested.nested_run_pending)
+               return false;
+
+       if (unlikely(vmx->fail)) {
+               pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+                                   vmcs_read32(VM_INSTRUCTION_ERROR));
+               return true;
+       }
 
        /*
         * The host physical addresses of some pages of guest memory
@@ -8363,14 +8365,12 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
         */
        nested_mark_vmcs12_pages_dirty(vcpu);
 
-       if (vmx->nested.nested_run_pending)
-               return false;
-
-       if (unlikely(vmx->fail)) {
-               pr_info_ratelimited("%s failed vm entry %x\n", __func__,
-                                   vmcs_read32(VM_INSTRUCTION_ERROR));
-               return true;
-       }
+       trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
+                               vmcs_readl(EXIT_QUALIFICATION),
+                               vmx->idt_vectoring_info,
+                               intr_info,
+                               vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+                               KVM_ISA_VMX);
 
        switch (exit_reason) {
        case EXIT_REASON_EXCEPTION_NMI:
@@ -9424,12 +9424,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                                  | (1 << VCPU_EXREG_CR3));
        vcpu->arch.regs_dirty = 0;
 
-       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
-       vmx->loaded_vmcs->launched = 1;
-
-       vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
-
        /*
         * eager fpu is enabled if PKEY is supported and CR4 is switched
         * back on host, so it is safe to read guest PKRU from current
@@ -9451,6 +9445,14 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        vmx->nested.nested_run_pending = 0;
+       vmx->idt_vectoring_info = 0;
+
+       vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
+       if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+               return;
+
+       vmx->loaded_vmcs->launched = 1;
+       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
        vmx_complete_atomic_exit(vmx);
        vmx_recover_nmi_blocking(vmx);
@@ -10525,6 +10527,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        if (exec_control & CPU_BASED_TPR_SHADOW) {
                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+       } else {
+#ifdef CONFIG_X86_64
+               exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+                               CPU_BASED_CR8_STORE_EXITING;
+#endif
        }
 
        /*
@@ -11388,46 +11395,30 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       u32 vm_inst_error = 0;
 
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
+       /*
+        * The only expected VM-instruction error is "VM entry with
+        * invalid control field(s)." Anything else indicates a
+        * problem with L0.
+        */
+       WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
+                                  VMXERR_ENTRY_INVALID_CONTROL_FIELD));
+
        leave_guest_mode(vcpu);
-       prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
-                      exit_qualification);
 
-       if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
-                                vmcs12->vm_exit_msr_store_count))
-               nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+       if (likely(!vmx->fail)) {
+               prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+                              exit_qualification);
 
-       if (unlikely(vmx->fail))
-               vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+               if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+                                        vmcs12->vm_exit_msr_store_count))
+                       nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+       }
 
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-
-       /*
-        * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
-        * the VM-exit interrupt information (valid interrupt) is always set to
-        * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
-        * kvm_cpu_has_interrupt().  See the commit message for details.
-        */
-       if (nested_exit_intr_ack_set(vcpu) &&
-           exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
-           kvm_cpu_has_interrupt(vcpu)) {
-               int irq = kvm_cpu_get_interrupt(vcpu);
-               WARN_ON(irq < 0);
-               vmcs12->vm_exit_intr_info = irq |
-                       INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
-       }
-
-       trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
-                                      vmcs12->exit_qualification,
-                                      vmcs12->idt_vectoring_info_field,
-                                      vmcs12->vm_exit_intr_info,
-                                      vmcs12->vm_exit_intr_error_code,
-                                      KVM_ISA_VMX);
-
        vm_entry_controls_reset_shadow(vmx);
        vm_exit_controls_reset_shadow(vmx);
        vmx_segment_cache_clear(vmx);
@@ -11436,8 +11427,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        if (VMCS02_POOL_SIZE == 0)
                nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
 
-       load_vmcs12_host_state(vcpu, vmcs12);
-
        /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
@@ -11486,21 +11475,57 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
         */
        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
-       /*
-        * Exiting from L2 to L1, we're now back to L1 which thinks it just
-        * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
-        * success or failure flag accordingly.
-        */
-       if (unlikely(vmx->fail)) {
-               vmx->fail = 0;
-               nested_vmx_failValid(vcpu, vm_inst_error);
-       } else
-               nested_vmx_succeed(vcpu);
        if (enable_shadow_vmcs)
                vmx->nested.sync_shadow_vmcs = true;
 
        /* in case we halted in L2 */
        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+       if (likely(!vmx->fail)) {
+               /*
+                * TODO: SDM says that with acknowledge interrupt on
+                * exit, bit 31 of the VM-exit interrupt information
+                * (valid interrupt) is always set to 1 on
+                * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
+                * need kvm_cpu_has_interrupt().  See the commit
+                * message for details.
+                */
+               if (nested_exit_intr_ack_set(vcpu) &&
+                   exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+                   kvm_cpu_has_interrupt(vcpu)) {
+                       int irq = kvm_cpu_get_interrupt(vcpu);
+                       WARN_ON(irq < 0);
+                       vmcs12->vm_exit_intr_info = irq |
+                               INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
+               }
+
+               trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+                                              vmcs12->exit_qualification,
+                                              vmcs12->idt_vectoring_info_field,
+                                              vmcs12->vm_exit_intr_info,
+                                              vmcs12->vm_exit_intr_error_code,
+                                              KVM_ISA_VMX);
+
+               load_vmcs12_host_state(vcpu, vmcs12);
+
+               return;
+       }
+
+       /*
+        * After an early L2 VM-entry failure, we're now back
+        * in L1 which thinks it just finished a VMLAUNCH or
+        * VMRESUME instruction, so we need to set the failure
+        * flag and the VM-instruction error field of the VMCS
+        * accordingly.
+        */
+       nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+       /*
+        * The emulated instruction was already skipped in
+        * nested_vmx_run, but the updated RIP was never
+        * written back to the vmcs01.
+        */
+       skip_emulated_instruction(vcpu);
+       vmx->fail = 0;
 }
 
 /*
@@ -11829,7 +11854,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
        struct kvm_lapic_irq irq;
        struct kvm_vcpu *vcpu;
        struct vcpu_data vcpu_info;
-       int idx, ret = -EINVAL;
+       int idx, ret = 0;
 
        if (!kvm_arch_has_assigned_device(kvm) ||
                !irq_remapping_cap(IRQ_POSTING_CAP) ||
@@ -11838,7 +11863,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 
        idx = srcu_read_lock(&kvm->irq_srcu);
        irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-       BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+       if (guest_irq >= irq_rt->nr_rt_entries ||
+           hlist_empty(&irq_rt->map[guest_irq])) {
+               pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+                            guest_irq, irq_rt->nr_rt_entries);
+               goto out;
+       }
 
        hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
                if (e->type != KVM_IRQ_ROUTING_MSI)
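
The final vmx.c hunk is the host-DoS fix from the list above ("KVM: VMX: Do not BUG() on out-of-bounds guest IRQ"): guest_irq is under userspace control, so an out-of-range or unmapped value now warns once and bails out (with ret defaulting to 0 rather than -EINVAL) instead of taking the host down. The same defensive pattern as a standalone sketch, with illustrative names:

    #include <stdio.h>

    #define NR_ROUTES 16

    /* Treat an index supplied by userspace as untrusted: report it and fail
     * the request, never BUG()/abort() on bad input. */
    static int lookup_route(const int *table, unsigned int nr,
                            unsigned int idx, int *out)
    {
            if (idx >= nr) {
                    fprintf(stderr, "no route for %u/%u (broken user space?)\n",
                            idx, nr);
                    return -1;
            }
            *out = table[idx];
            return 0;
    }

    int main(void)
    {
            int table[NR_ROUTES] = { 0 }, val;
            return lookup_route(table, NR_ROUTES, 1000, &val) ? 1 : 0;
    }
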
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6069af86da3b95884861f1e3a9f7d209ad729b3c..cd17b7d9a1076c1d28904bc4cd1ea06af5e0f2c0 100644
@@ -7231,10 +7231,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
+               if (kvm_run->immediate_exit) {
+                       r = -EINTR;
+                       goto out;
+               }
                kvm_vcpu_block(vcpu);
                kvm_apic_accept_events(vcpu);
                kvm_clear_request(KVM_REQ_UNHALT, vcpu);
                r = -EAGAIN;
+               if (signal_pending(current)) {
+                       r = -EINTR;
+                       vcpu->run->exit_reason = KVM_EXIT_INTR;
+                       ++vcpu->stat.signal_exits;
+               }
                goto out;
        }
 
@@ -7971,7 +7980,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        BUG_ON(vcpu->kvm == NULL);
        kvm = vcpu->kvm;
 
-       vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv();
+       vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
        vcpu->arch.pv.pv_unhalted = false;
        vcpu->arch.emulate_ctxt.ops = &emulate_ops;
        if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
@@ -8452,6 +8461,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
        if (vcpu->arch.pv.pv_unhalted)
                return true;
 
+       if (vcpu->arch.exception.pending)
+               return true;
+
        if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
            (vcpu->arch.nmi_pending &&
             kvm_x86_ops->nmi_allowed(vcpu)))
@@ -8619,6 +8631,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
                                      sizeof(val));
 }
 
+static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
+{
+
+       return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
+                                     sizeof(u32));
+}
+
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
                                     struct kvm_async_pf *work)
 {
@@ -8646,6 +8665,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
                                 struct kvm_async_pf *work)
 {
        struct x86_exception fault;
+       u32 val;
 
        if (work->wakeup_all)
                work->arch.token = ~0; /* broadcast wakeup */
@@ -8653,15 +8673,26 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
                kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
        trace_kvm_async_pf_ready(work->arch.token, work->gva);
 
-       if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
-           !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
-               fault.vector = PF_VECTOR;
-               fault.error_code_valid = true;
-               fault.error_code = 0;
-               fault.nested_page_fault = false;
-               fault.address = work->arch.token;
-               fault.async_page_fault = true;
-               kvm_inject_page_fault(vcpu, &fault);
+       if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
+           !apf_get_user(vcpu, &val)) {
+               if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
+                   vcpu->arch.exception.pending &&
+                   vcpu->arch.exception.nr == PF_VECTOR &&
+                   !apf_put_user(vcpu, 0)) {
+                       vcpu->arch.exception.injected = false;
+                       vcpu->arch.exception.pending = false;
+                       vcpu->arch.exception.nr = 0;
+                       vcpu->arch.exception.has_error_code = false;
+                       vcpu->arch.exception.error_code = 0;
+               } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+                       fault.vector = PF_VECTOR;
+                       fault.error_code_valid = true;
+                       fault.error_code = 0;
+                       fault.nested_page_fault = false;
+                       fault.address = work->arch.token;
+                       fault.async_page_fault = true;
+                       kvm_inject_page_fault(vcpu, &fault);
+               }
        }
        vcpu->arch.apf.halted = false;
        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
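
The first x86.c hunk covers two KVM_RUN corner cases for a vCPU still in KVM_MP_STATE_UNINITIALIZED: kvm_run->immediate_exit now yields -EINTR without blocking, and a signal arriving while the vCPU blocks is surfaced as KVM_EXIT_INTR with the signal_exits stat bumped. A hedged userspace sketch of the immediate_exit side (mmap_size comes from KVM_GET_VCPU_MMAP_SIZE; the function name is illustrative and error handling is trimmed):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>

    /* Expect ioctl() == -1 with errno == EINTR, without entering the guest --
     * after the hunk above, even for a not-yet-initialized AP. */
    int run_with_immediate_exit(int vcpu_fd, size_t mmap_size)
    {
            struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
                                       MAP_SHARED, vcpu_fd, 0);
            if (run == MAP_FAILED)
                    return -1;

            run->immediate_exit = 1;
            int ret = ioctl(vcpu_fd, KVM_RUN, 0);
            munmap(run, mmap_size);
            return ret;
    }
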
diff --git a/include/linux/swait.h b/include/linux/swait.h
index 4a4e180d0a3572fbb0719677db50d644a40c8f63..73e97a08d3d0a9be1e5467b11d1b0a0ad235b063 100644
@@ -79,9 +79,63 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name
        DECLARE_SWAIT_QUEUE_HEAD(name)
 #endif
 
-static inline int swait_active(struct swait_queue_head *q)
+/**
+ * swait_active -- locklessly test for waiters on the queue
+ * @wq: the waitqueue to test for waiters
+ *
+ * returns true if the wait list is not empty
+ *
+ * NOTE: this function is lockless and requires care, incorrect usage _will_
+ * lead to sporadic and non-obvious failure.
+ *
+ * NOTE2: this function has the same above implications as regular waitqueues.
+ *
+ * Use either while holding swait_queue_head::lock or when used for wakeups
+ * with an extra smp_mb() like:
+ *
+ *      CPU0 - waker                    CPU1 - waiter
+ *
+ *                                      for (;;) {
+ *      @cond = true;                     prepare_to_swait(&wq_head, &wait, state);
+ *      smp_mb();                         // smp_mb() from set_current_state()
+ *      if (swait_active(wq_head))        if (@cond)
+ *        wake_up(wq_head);                      break;
+ *                                        schedule();
+ *                                      }
+ *                                      finish_swait(&wq_head, &wait);
+ *
+ * Because without the explicit smp_mb() it's possible for the
+ * swait_active() load to get hoisted over the @cond store such that we'll
+ * observe an empty wait list while the waiter might not observe @cond.
+ * This, in turn, can trigger missing wakeups.
+ *
+ * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
+ * which (when the lock is uncontended) are of roughly equal cost.
+ */
+static inline int swait_active(struct swait_queue_head *wq)
+{
+       return !list_empty(&wq->task_list);
+}
+
+/**
+ * swq_has_sleeper - check if there are any waiting processes
+ * @wq: the waitqueue to test for waiters
+ *
+ * Returns true if @wq has waiting processes
+ *
+ * Please refer to the comment for swait_active.
+ */
+static inline bool swq_has_sleeper(struct swait_queue_head *wq)
 {
-       return !list_empty(&q->task_list);
+       /*
+        * We need to be sure we are in sync with the list_add()
+        * modifications to the wait queue (task_list).
+        *
+        * This memory barrier should be paired with one on the
+        * waiting side.
+        */
+       smp_mb();
+       return swait_active(wq);
 }
 
 extern void swake_up(struct swait_queue_head *q);
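
The CPU0/CPU1 diagram in the new comment is the heart of the swait fixes in this pull: the waker's "is anyone sleeping?" load must not be reordered before its condition store. A C11 userspace analog, with atomics standing in for the wait list and for smp_mb() (illustrative only, not a model of the scheduler):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool cond_flag;
    static atomic_bool wait_list_empty = true;  /* list_empty(&wq->task_list) */

    static void waker(void)
    {
            atomic_store_explicit(&cond_flag, true, memory_order_relaxed);
            /* The smp_mb() inside swq_has_sleeper(): without it the load
             * below can be hoisted above the store and the wakeup is lost. */
            atomic_thread_fence(memory_order_seq_cst);
            if (!atomic_load_explicit(&wait_list_empty, memory_order_relaxed))
                    ;  /* swake_up() */
    }

    static void waiter(void)
    {
            atomic_store_explicit(&wait_list_empty, false, memory_order_relaxed);
            /* The smp_mb() implied by set_current_state() in prepare_to_swait(). */
            atomic_thread_fence(memory_order_seq_cst);
            while (!atomic_load_explicit(&cond_flag, memory_order_relaxed))
                    ;  /* schedule() */
    }
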
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 8ade3eb6c640e0725e2c1fe0d930e6309cd0801f..dcffedfac43109ae4c898bb57b3c161b49405e57 100644
@@ -14,7 +14,9 @@
        ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR),    \
        ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
        ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL),   \
-       ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH)
+       ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\
+       ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI),          \
+       ERSN(HYPERV)
 
 TRACE_EVENT(kvm_userspace_exit,
            TP_PROTO(__u32 reason, int errno),
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index bb298a200cd3f212d13dd509303eef211b9346de..57bcb27dcf30f61e14361c675617f08a0e995600 100644
@@ -106,11 +106,7 @@ static void async_pf_execute(struct work_struct *work)
 
        trace_kvm_async_pf_completed(addr, gva);
 
-       /*
-        * This memory barrier pairs with prepare_to_wait's set_current_state()
-        */
-       smp_mb();
-       if (swait_active(&vcpu->wq))
+       if (swq_has_sleeper(&vcpu->wq))
                swake_up(&vcpu->wq);
 
        mmput(mm);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index f2ac53ab82438f0b473ecd8ed91b1e2548af7ca2..c608ab495282ddb63ace657ce0a6deeea18a2240 100644
@@ -565,6 +565,8 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
 {
        if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
                return -EINVAL;
+       if (args->gsi >= KVM_MAX_IRQ_ROUTES)
+               return -EINVAL;
 
        if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
                return kvm_irqfd_deassign(kvm, args);
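
The added check rejects nonsensical gsi values at the ioctl boundary ("KVM: Don't accept obviously wrong gsi values via KVM_IRQFD" above), before any irqfd state is built. For reference, a sketch of the userspace call the check guards (function name illustrative, error handling trimmed):

    #include <linux/kvm.h>
    #include <sys/eventfd.h>
    #include <sys/ioctl.h>

    /* A gsi >= KVM_MAX_IRQ_ROUTES now fails fast with -EINVAL. */
    int attach_irqfd(int vm_fd, unsigned int gsi)
    {
            struct kvm_irqfd irqfd = {
                    .fd  = eventfd(0, 0),
                    .gsi = gsi,
            };
            return ioctl(vm_fd, KVM_IRQFD, &irqfd);
    }
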
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6ed1c2021198d53a6a8f3153ec1605dba57a2692..9deb5a245b83032ffaa960d42cd6f01ecb614b1b 100644
@@ -674,6 +674,7 @@ out_err_no_irq_srcu:
 out_err_no_srcu:
        hardware_disable_all();
 out_err_no_disable:
+       refcount_set(&kvm->users_count, 0);
        for (i = 0; i < KVM_NR_BUSES; i++)
                kfree(kvm_get_bus(kvm, i));
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
@@ -2186,7 +2187,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
        struct swait_queue_head *wqp;
 
        wqp = kvm_arch_vcpu_wq(vcpu);
-       if (swait_active(wqp)) {
+       if (swq_has_sleeper(wqp)) {
                swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
                return true;