Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Sep 2017 22:43:55 +0000 (15:43 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Sep 2017 22:43:55 +0000 (15:43 -0700)
Pull more KVM updates from Paolo Bonzini:
 - PPC bugfixes
 - RCU splat fix
 - swait races fix
 - pointless userspace-triggerable BUG() fix
 - misc fixes for KVM_RUN corner cases
 - nested virt correctness fixes + one host DoS
 - some cleanups
 - clang build fix
 - fix AMD AVIC with default QEMU command line options
 - x86 bugfixes

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
  kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly
  kvm: vmx: Handle VMLAUNCH/VMRESUME failure properly
  kvm: nVMX: Remove nested_vmx_succeed after successful VM-entry
  kvm,mips: Fix potential swait_active() races
  kvm,powerpc: Serialize wq active checks in ops->vcpu_kick
  kvm: Serialize wq active checks in kvm_vcpu_wake_up()
  kvm,x86: Fix apf_task_wake_one() wq serialization
  kvm,lapic: Justify use of swait_active()
  kvm,async_pf: Use swq_has_sleeper()
  sched/wait: Add swq_has_sleeper()
  KVM: VMX: Do not BUG() on out-of-bounds guest IRQ
  KVM: Don't accept obviously wrong gsi values via KVM_IRQFD
  kvm: nVMX: Don't allow L2 to access the hardware CR8
  KVM: trace events: update list of exit reasons
  KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously
  KVM: X86: Don't block vCPU if there is pending exception
  KVM: SVM: Add irqchip_split() checks before enabling AVIC
  KVM: Add struct kvm_vcpu pointer parameter to get_enable_apicv()
  KVM: SVM: Refactor AVIC vcpu initialization into avic_init_vcpu()
  KVM: x86: fix clang build
  ...

1  2 
arch/x86/kvm/vmx.c

diff --combined arch/x86/kvm/vmx.c
index 699704d4bc9e7716e04da7298ec7f4491b436af1,a406afbb6d211ebfbfa4691cfc716ba6884b4334..06c0c6d0541e9bf95eabbcaa8d20c8ec45f19496
@@@ -5012,7 -5012,7 +5012,7 @@@ static void vmx_disable_intercept_msr_x
        }
  }
  
- static bool vmx_get_enable_apicv(void)
+ static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
  {
        return enable_apicv;
  }
@@@ -5192,7 -5192,7 +5192,7 @@@ static void vmx_set_constant_host_state
        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
  
 -      native_store_idt(&dt);
 +      store_idt(&dt);
        vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
        vmx->host_idt_base = dt.address;
  
@@@ -8344,12 -8344,14 +8344,14 @@@ static bool nested_vmx_exit_reflected(s
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  
-       trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
-                               vmcs_readl(EXIT_QUALIFICATION),
-                               vmx->idt_vectoring_info,
-                               intr_info,
-                               vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
-                               KVM_ISA_VMX);
+       if (vmx->nested.nested_run_pending)
+               return false;
+       if (unlikely(vmx->fail)) {
+               pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+                                   vmcs_read32(VM_INSTRUCTION_ERROR));
+               return true;
+       }
  
        /*
         * The host physical addresses of some pages of guest memory
         */
        nested_mark_vmcs12_pages_dirty(vcpu);
  
-       if (vmx->nested.nested_run_pending)
-               return false;
-       if (unlikely(vmx->fail)) {
-               pr_info_ratelimited("%s failed vm entry %x\n", __func__,
-                                   vmcs_read32(VM_INSTRUCTION_ERROR));
-               return true;
-       }
+       trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
+                               vmcs_readl(EXIT_QUALIFICATION),
+                               vmx->idt_vectoring_info,
+                               intr_info,
+                               vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+                               KVM_ISA_VMX);
  
        switch (exit_reason) {
        case EXIT_REASON_EXCEPTION_NMI:
@@@ -9424,12 -9424,6 +9424,6 @@@ static void __noclone vmx_vcpu_run(stru
                                  | (1 << VCPU_EXREG_CR3));
        vcpu->arch.regs_dirty = 0;
  
-       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-       vmx->loaded_vmcs->launched = 1;
-       vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
        /*
         * eager fpu is enabled if PKEY is supported and CR4 is switched
         * back on host, so it is safe to read guest PKRU from current
                kvm_make_request(KVM_REQ_EVENT, vcpu);
  
        vmx->nested.nested_run_pending = 0;
+       vmx->idt_vectoring_info = 0;
+       vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
+       if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+               return;
+       vmx->loaded_vmcs->launched = 1;
+       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
  
        vmx_complete_atomic_exit(vmx);
        vmx_recover_nmi_blocking(vmx);
@@@ -10525,6 -10527,11 +10527,11 @@@ static int prepare_vmcs02(struct kvm_vc
        if (exec_control & CPU_BASED_TPR_SHADOW) {
                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+       } else {
+ #ifdef CONFIG_X86_64
+               exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+                               CPU_BASED_CR8_STORE_EXITING;
+ #endif
        }
  
        /*
@@@ -11388,46 -11395,30 +11395,30 @@@ static void nested_vmx_vmexit(struct kv
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       u32 vm_inst_error = 0;
  
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
  
+       /*
+        * The only expected VM-instruction error is "VM entry with
+        * invalid control field(s)." Anything else indicates a
+        * problem with L0.
+        */
+       WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
+                                  VMXERR_ENTRY_INVALID_CONTROL_FIELD));
        leave_guest_mode(vcpu);
-       prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
-                      exit_qualification);
  
-       if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
-                                vmcs12->vm_exit_msr_store_count))
-               nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+       if (likely(!vmx->fail)) {
+               prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+                              exit_qualification);
  
-       if (unlikely(vmx->fail))
-               vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+               if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+                                        vmcs12->vm_exit_msr_store_count))
+                       nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+       }
  
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-       /*
-        * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
-        * the VM-exit interrupt information (valid interrupt) is always set to
-        * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
-        * kvm_cpu_has_interrupt().  See the commit message for details.
-        */
-       if (nested_exit_intr_ack_set(vcpu) &&
-           exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
-           kvm_cpu_has_interrupt(vcpu)) {
-               int irq = kvm_cpu_get_interrupt(vcpu);
-               WARN_ON(irq < 0);
-               vmcs12->vm_exit_intr_info = irq |
-                       INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
-       }
-       trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
-                                      vmcs12->exit_qualification,
-                                      vmcs12->idt_vectoring_info_field,
-                                      vmcs12->vm_exit_intr_info,
-                                      vmcs12->vm_exit_intr_error_code,
-                                      KVM_ISA_VMX);
        vm_entry_controls_reset_shadow(vmx);
        vm_exit_controls_reset_shadow(vmx);
        vmx_segment_cache_clear(vmx);
        if (VMCS02_POOL_SIZE == 0)
                nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
  
-       load_vmcs12_host_state(vcpu, vmcs12);
        /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
         */
        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
  
-       /*
-        * Exiting from L2 to L1, we're now back to L1 which thinks it just
-        * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
-        * success or failure flag accordingly.
-        */
-       if (unlikely(vmx->fail)) {
-               vmx->fail = 0;
-               nested_vmx_failValid(vcpu, vm_inst_error);
-       } else
-               nested_vmx_succeed(vcpu);
        if (enable_shadow_vmcs)
                vmx->nested.sync_shadow_vmcs = true;
  
        /* in case we halted in L2 */
        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+       if (likely(!vmx->fail)) {
+               /*
+                * TODO: SDM says that with acknowledge interrupt on
+                * exit, bit 31 of the VM-exit interrupt information
+                * (valid interrupt) is always set to 1 on
+                * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
+                * need kvm_cpu_has_interrupt().  See the commit
+                * message for details.
+                */
+               if (nested_exit_intr_ack_set(vcpu) &&
+                   exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+                   kvm_cpu_has_interrupt(vcpu)) {
+                       int irq = kvm_cpu_get_interrupt(vcpu);
+                       WARN_ON(irq < 0);
+                       vmcs12->vm_exit_intr_info = irq |
+                               INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
+               }
+               trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+                                              vmcs12->exit_qualification,
+                                              vmcs12->idt_vectoring_info_field,
+                                              vmcs12->vm_exit_intr_info,
+                                              vmcs12->vm_exit_intr_error_code,
+                                              KVM_ISA_VMX);
+               load_vmcs12_host_state(vcpu, vmcs12);
+               return;
+       }
+       
+       /*
+        * After an early L2 VM-entry failure, we're now back
+        * in L1 which thinks it just finished a VMLAUNCH or
+        * VMRESUME instruction, so we need to set the failure
+        * flag and the VM-instruction error field of the VMCS
+        * accordingly.
+        */
+       nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+       /*
+        * The emulated instruction was already skipped in
+        * nested_vmx_run, but the updated RIP was never
+        * written back to the vmcs01.
+        */
+       skip_emulated_instruction(vcpu);
+       vmx->fail = 0;
  }
  
  /*
@@@ -11829,7 -11854,7 +11854,7 @@@ static int vmx_update_pi_irte(struct kv
        struct kvm_lapic_irq irq;
        struct kvm_vcpu *vcpu;
        struct vcpu_data vcpu_info;
-       int idx, ret = -EINVAL;
+       int idx, ret = 0;
  
        if (!kvm_arch_has_assigned_device(kvm) ||
                !irq_remapping_cap(IRQ_POSTING_CAP) ||
  
        idx = srcu_read_lock(&kvm->irq_srcu);
        irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-       BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+       if (guest_irq >= irq_rt->nr_rt_entries ||
+           hlist_empty(&irq_rt->map[guest_irq])) {
+               pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+                            guest_irq, irq_rt->nr_rt_entries);
+               goto out;
+       }
  
        hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
                if (e->type != KVM_IRQ_ROUTING_MSI)