KVM: x86: improve the usability of the 'kvm_pio' tracepoint
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b8578432d5bccd296fa6d5859e3575c0fe4aa02..de0931cb3f58c01a447e80f7fd7a7f797140b8bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -595,13 +595,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
 
 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
-       u64 xcr0;
+       u64 xcr0 = xcr;
+       u64 old_xcr0 = vcpu->arch.xcr0;
        u64 valid_bits;
 
        /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
        if (index != XCR_XFEATURE_ENABLED_MASK)
                return 1;
-       xcr0 = xcr;
        if (!(xcr0 & XSTATE_FP))
                return 1;
        if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
@@ -616,8 +616,14 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
        if (xcr0 & ~valid_bits)
                return 1;
 
+       if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
+               return 1;
+
        kvm_put_guest_xcr0(vcpu);
        vcpu->arch.xcr0 = xcr0;
+
+       if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
+               kvm_update_cpuid(vcpu);
        return 0;
 }
 
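
Note, not part of the patch: the new MPX check relies on the '!(x & A) != !(x & B)' idiom, which rejects exactly the states where XSTATE_BNDREGS and XSTATE_BNDCSR are not enabled together. A minimal userspace sketch of that idiom; the bit positions are assumed to match XCR0's layout (bits 3 and 4), everything else is illustrative.

/* Standalone illustration of the "both or neither" check used above for
 * the MPX state bits; the bit positions mirror XCR0 (BNDREGS = bit 3,
 * BNDCSR = bit 4), but the program itself is just a truth-table demo. */
#include <stdio.h>
#include <stdint.h>

#define XSTATE_BNDREGS (1ULL << 3)
#define XSTATE_BNDCSR  (1ULL << 4)

static int mpx_bits_consistent(uint64_t xcr0)
{
	/* !(x & A) != !(x & B) is true exactly when one bit is set
	 * without the other, i.e. the invalid combination. */
	return (!(xcr0 & XSTATE_BNDREGS)) == (!(xcr0 & XSTATE_BNDCSR));
}

int main(void)
{
	uint64_t cases[] = { 0, XSTATE_BNDREGS, XSTATE_BNDCSR,
			     XSTATE_BNDREGS | XSTATE_BNDCSR };
	unsigned i;

	for (i = 0; i < 4; i++)
		printf("xcr0=%#llx consistent=%d\n",
		       (unsigned long long)cases[i],
		       mpx_bits_consistent(cases[i]));
	return 0;
}

The same shape of check works for any pair of feature bits that must be toggled as a unit.
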
@@ -646,6 +652,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
                return 1;
 
+       if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP))
+               return 1;
+
        if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
                return 1;
 
@@ -674,6 +683,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                kvm_mmu_reset_context(vcpu);
 
+       if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
+               update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
+
        if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
                kvm_update_cpuid(vcpu);
 
@@ -689,26 +701,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                return 0;
        }
 
-       if (is_long_mode(vcpu)) {
-               if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
-                       if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
-                               return 1;
-               } else
-                       if (cr3 & CR3_L_MODE_RESERVED_BITS)
-                               return 1;
-       } else {
-               if (is_pae(vcpu)) {
-                       if (cr3 & CR3_PAE_RESERVED_BITS)
-                               return 1;
-                       if (is_paging(vcpu) &&
-                           !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
-                               return 1;
-               }
-               /*
-                * We don't check reserved bits in nonpae mode, because
-                * this isn't enforced, and VMware depends on this.
-                */
-       }
+       if (is_long_mode(vcpu) && (cr3 & CR3_L_MODE_RESERVED_BITS))
+               return 1;
+       if (is_pae(vcpu) && is_paging(vcpu) &&
+           !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+               return 1;
 
        vcpu->arch.cr3 = cr3;
        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
@@ -753,7 +750,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
        else
                dr7 = vcpu->arch.dr7;
        kvm_x86_ops->set_dr7(vcpu, dr7);
-       vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+       vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+       if (dr7 & DR7_BP_EN_MASK)
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
 }
 
 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
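
Note, not part of the patch: switch_db_regs turns from an effectively boolean value into a flags word here, so kvm_update_dr7() clears and conditionally re-sets only KVM_DEBUGREG_BP_ENABLED instead of assigning the whole field, presumably so that other bits (such as KVM_DEBUGREG_WONT_EXIT, used further down in this diff) are not clobbered. A standalone sketch of that update pattern, with made-up flag values:

/* Clear-then-conditionally-set of a single flag in a shared flags word.
 * Flag values are illustrative, not the kernel's. */
#include <stdio.h>

#define FLAG_BP_ENABLED  (1u << 0)
#define FLAG_WONT_EXIT   (1u << 1)

static unsigned update_bp_flag(unsigned flags, int bp_enabled)
{
	flags &= ~FLAG_BP_ENABLED;      /* touch only our bit ... */
	if (bp_enabled)
		flags |= FLAG_BP_ENABLED;
	return flags;                   /* ... so FLAG_WONT_EXIT survives */
}

int main(void)
{
	unsigned flags = FLAG_WONT_EXIT;

	flags = update_bp_flag(flags, 1);
	printf("flags=%#x\n", flags);   /* 0x3: both bits set */
	flags = update_bp_flag(flags, 0);
	printf("flags=%#x\n", flags);   /* 0x2: WONT_EXIT preserved */
	return 0;
}
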
@@ -879,7 +878,7 @@ static u32 msrs_to_save[] = {
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
        MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
-       MSR_IA32_FEATURE_CONTROL
+       MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
 };
 
 static unsigned num_msrs_to_save;
@@ -1109,7 +1108,6 @@ static inline u64 get_kernel_ns(void)
 {
        struct timespec ts;
 
-       WARN_ON(preemptible());
        ktime_get_ts(&ts);
        monotonic_to_bootbased(&ts);
        return timespec_to_ns(&ts);
@@ -1581,7 +1579,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        /* With all the info we got, fill in the values */
        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
-       vcpu->last_kernel_ns = kernel_ns;
        vcpu->last_guest_tsc = tsc_timestamp;
 
        /*
@@ -1623,14 +1620,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
  * the others.
  *
  * So in those cases, request a kvmclock update for all vcpus.
- * The worst case for a remote vcpu to update its kvmclock
- * is then bounded by maximum nohz sleep latency.
+ * We need to rate-limit these requests though, as they can
+ * considerably slow guests that have a large number of vcpus.
+ * The time for a remote vcpu to update its kvmclock is bound
+ * by the delay we use to rate-limit the updates.
  */
 
-static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
+
+static void kvmclock_update_fn(struct work_struct *work)
 {
        int i;
-       struct kvm *kvm = v->kvm;
+       struct delayed_work *dwork = to_delayed_work(work);
+       struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+                                          kvmclock_update_work);
+       struct kvm *kvm = container_of(ka, struct kvm, arch);
        struct kvm_vcpu *vcpu;
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1639,6 +1643,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
        }
 }
 
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+       struct kvm *kvm = v->kvm;
+
+       set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests);
+       schedule_delayed_work(&kvm->arch.kvmclock_update_work,
+                                       KVMCLOCK_UPDATE_DELAY);
+}
+
+#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
+
+static void kvmclock_sync_fn(struct work_struct *work)
+{
+       struct delayed_work *dwork = to_delayed_work(work);
+       struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+                                          kvmclock_sync_work);
+       struct kvm *kvm = container_of(ka, struct kvm, arch);
+
+       schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
+       schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+                                       KVMCLOCK_SYNC_PERIOD);
+}
+
 static bool msr_mtrr_valid(unsigned msr)
 {
        switch (msr) {
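
Note, not part of the patch: the rate limiting described in the comment works because schedule_delayed_work() does nothing while the work item is still pending, so any number of kvm_gen_kvmclock_update() calls inside the 100 ms window collapse into a single run of kvmclock_update_fn(). A single-threaded userspace simulation of that coalescing behaviour (illustrative only; no workqueues or jiffies involved):

/* Userspace sketch of the coalescing relied on above: a request is only
 * armed if none is pending, so requests inside the window merge into
 * one update.  Single-threaded simulation, not the kernel mechanism. */
#include <stdio.h>
#include <stdbool.h>

#define UPDATE_DELAY 100                /* "milliseconds" */

static bool pending;
static long deadline;

static void request_update(long now)
{
	if (pending)                    /* already queued: coalesce */
		return;
	pending = true;
	deadline = now + UPDATE_DELAY;
}

static void tick(long now)
{
	if (pending && now >= deadline) {
		pending = false;
		printf("t=%ld: update all vcpu clocks\n", now);
	}
}

int main(void)
{
	long t;

	for (t = 0; t < 400; t++) {
		if (t == 10 || t == 20 || t == 30 || t == 250)
			request_update(t);   /* four requests ... */
		tick(t);                     /* ... only two updates run */
	}
	return 0;
}
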
@@ -2323,9 +2350,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case HV_X64_MSR_VP_INDEX: {
                int r;
                struct kvm_vcpu *v;
-               kvm_for_each_vcpu(r, v, vcpu->kvm)
-                       if (v == vcpu)
+               kvm_for_each_vcpu(r, v, vcpu->kvm) {
+                       if (v == vcpu) {
                                data = r;
+                               break;
+                       }
+               }
                break;
        }
        case HV_X64_MSR_EOI:
@@ -2599,6 +2629,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_IRQ_INJECT_STATUS:
        case KVM_CAP_IRQFD:
        case KVM_CAP_IOEVENTFD:
+       case KVM_CAP_IOEVENTFD_NO_LENGTH:
        case KVM_CAP_PIT2:
        case KVM_CAP_PIT_STATE2:
        case KVM_CAP_SET_IDENTITY_MAP_ADDR:
@@ -2617,6 +2648,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_KVMCLOCK_CTRL:
        case KVM_CAP_READONLY_MEM:
        case KVM_CAP_HYPERV_TIME:
+       case KVM_CAP_IOAPIC_POLARITY_IGNORED:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
        case KVM_CAP_ASSIGN_DEV_IRQ:
        case KVM_CAP_PCI_2_3:
@@ -3043,9 +3075,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
                 * with old userspace.
                 */
-               if (xstate_bv & ~KVM_SUPPORTED_XCR0)
-                       return -EINVAL;
-               if (xstate_bv & ~host_xcr0)
+               if (xstate_bv & ~kvm_supported_xcr0())
                        return -EINVAL;
                memcpy(&vcpu->arch.guest_fpu.state->xsave,
                        guest_xsave->region, vcpu->arch.guest_xstate_size);
@@ -3602,11 +3632,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
                offset = i * BITS_PER_LONG;
                kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
        }
-       if (is_dirty)
-               kvm_flush_remote_tlbs(kvm);
 
        spin_unlock(&kvm->mmu_lock);
 
+       /* See the comments in kvm_mmu_slot_remove_write_access(). */
+       lockdep_assert_held(&kvm->slots_lock);
+
+       /*
+        * All the TLBs can be flushed out of mmu lock, see the comments in
+        * kvm_mmu_slot_remove_write_access().
+        */
+       if (is_dirty)
+               kvm_flush_remote_tlbs(kvm);
+
        r = -EFAULT;
        if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
                goto out;
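
Note, not part of the patch: the hunk records the decision (is_dirty) under mmu_lock but performs the expensive remote TLB flush only after the lock is dropped; the argument for why that is safe lives in the comments of kvm_mmu_slot_remove_write_access(), as referenced above. A userspace sketch of the same "decide under the lock, act after unlock" shape, with illustrative names (build with -pthread):

/* Sketch of the locking pattern above: do the bookkeeping while holding
 * the lock, drop it, then run the expensive part (a stand-in for
 * kvm_flush_remote_tlbs).  Names are illustrative. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t mmu_lock = PTHREAD_MUTEX_INITIALIZER;
static int write_protected_pages;

static void expensive_remote_flush(void)
{
	/* In the kernel this ends up IPI'ing every vcpu; keeping it
	 * outside the lock shortens the critical section. */
	printf("flushing remote TLBs for %d pages\n", write_protected_pages);
}

static void get_dirty_log(int dirty_pages)
{
	bool is_dirty = false;

	pthread_mutex_lock(&mmu_lock);
	if (dirty_pages > 0) {
		write_protected_pages += dirty_pages;   /* bookkeeping under the lock */
		is_dirty = true;
	}
	pthread_mutex_unlock(&mmu_lock);

	if (is_dirty)                   /* heavy work after unlock */
		expensive_remote_flush();
}

int main(void)
{
	get_dirty_log(0);
	get_dirty_log(3);
	return 0;
}
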
@@ -3898,6 +3936,23 @@ static void kvm_init_msr_list(void)
        for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
                        continue;
+
+               /*
+                * Even MSRs that are valid in the host may not be exposed
+                * to the guests in some cases.  We could work around this
+                * in VMX with the generic MSR save/load machinery, but it
+                * is not really worthwhile since it will really only
+                * happen with nested virtualization.
+                */
+               switch (msrs_to_save[i]) {
+               case MSR_IA32_BNDCFGS:
+                       if (!kvm_x86_ops->mpx_supported())
+                               continue;
+                       break;
+               default:
+                       break;
+               }
+
                if (j < i)
                        msrs_to_save[j] = msrs_to_save[i];
                j++;
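
Note, not part of the patch: the surrounding loop filters msrs_to_save in place with the classic two-index compaction, copying an element only once a gap has opened. A standalone version of the pattern with a dummy predicate:

/* In-place, two-index filtering as used for msrs_to_save: entries that
 * fail the predicate are dropped, the rest are packed to the front. */
#include <stdio.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

static int keep(unsigned v)
{
	return v % 2 == 0;              /* stand-in for rdmsr_safe()/feature checks */
}

int main(void)
{
	unsigned msrs[] = { 2, 3, 4, 7, 8, 10 };
	unsigned i, j, num;

	for (i = j = 0; i < ARRAY_SIZE(msrs); i++) {
		if (!keep(msrs[i]))
			continue;
		if (j < i)              /* only copy once a gap has opened */
			msrs[j] = msrs[i];
		j++;
	}
	num = j;

	for (i = 0; i < num; i++)
		printf("%u ", msrs[i]);
	printf("\n");                   /* prints: 2 4 8 10 */
	return 0;
}
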
@@ -4108,7 +4163,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
                | (write ? PFERR_WRITE_MASK : 0);
 
        if (vcpu_match_mmio_gva(vcpu, gva)
-           && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
+           && !permission_fault(vcpu, vcpu->arch.walk_mmu,
+                                vcpu->arch.access, access)) {
                *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
                                        (gva & (PAGE_SIZE - 1));
                trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -4394,6 +4450,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
        if (!exchanged)
                return X86EMUL_CMPXCHG_FAILED;
 
+       mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
        kvm_mmu_pte_write(vcpu, gpa, new, bytes);
 
        return X86EMUL_CONTINUE;
@@ -4423,8 +4480,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
                               unsigned short port, void *val,
                               unsigned int count, bool in)
 {
-       trace_kvm_pio(!in, port, size, count);
-
        vcpu->arch.pio.port = port;
        vcpu->arch.pio.in = in;
        vcpu->arch.pio.count  = count;
@@ -4459,6 +4514,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
        if (ret) {
 data_avail:
                memcpy(val, vcpu->arch.pio_data, size * count);
+               trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
                vcpu->arch.pio.count = 0;
                return 1;
        }
@@ -4473,6 +4529,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
        memcpy(vcpu->arch.pio_data, val, size * count);
+       trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
        return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
 }
 
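
Note, not part of the patch: with the tracepoint moved into the emulated in/out paths it now fires once the data is actually in vcpu->arch.pio_data, and it is handed a pointer to that buffer. One rough way to watch the event from userspace is via ftrace; the sketch below assumes tracefs is reachable at /sys/kernel/debug/tracing and root privileges, and simply streams matching lines from trace_pipe:

/* Rough consumer for the kvm_pio event: enable it and stream trace_pipe,
 * printing only kvm_pio lines.  Assumes tracefs is mounted under
 * /sys/kernel/debug/tracing and that the program runs as root. */
#include <stdio.h>
#include <string.h>

#define TRACE_DIR "/sys/kernel/debug/tracing"

int main(void)
{
	FILE *enable = fopen(TRACE_DIR "/events/kvm/kvm_pio/enable", "w");
	FILE *pipe;
	char line[512];

	if (!enable) {
		perror("enable kvm_pio");
		return 1;
	}
	fputs("1\n", enable);
	fclose(enable);

	pipe = fopen(TRACE_DIR "/trace_pipe", "r");
	if (!pipe) {
		perror("trace_pipe");
		return 1;
	}
	while (fgets(line, sizeof(line), pipe))
		if (strstr(line, "kvm_pio"))
			fputs(line, stdout);
	return 0;
}
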
@@ -4839,7 +4896,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
        ctxt->eip = kvm_rip_read(vcpu);
        ctxt->mode = (!is_protmode(vcpu))               ? X86EMUL_MODE_REAL :
                     (ctxt->eflags & X86_EFLAGS_VM)     ? X86EMUL_MODE_VM86 :
-                    cs_l                               ? X86EMUL_MODE_PROT64 :
+                    (cs_l && is_long_mode(vcpu))       ? X86EMUL_MODE_PROT64 :
                     cs_db                              ? X86EMUL_MODE_PROT32 :
                                                          X86EMUL_MODE_PROT16;
        ctxt->guest_mode = is_guest_mode(vcpu);
@@ -5365,7 +5422,8 @@ static void kvm_timer_init(void)
        int cpu;
 
        max_tsc_khz = tsc_khz;
-       register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+
+       cpu_notifier_register_begin();
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
 #ifdef CONFIG_CPU_FREQ
                struct cpufreq_policy policy;
@@ -5382,6 +5440,10 @@ static void kvm_timer_init(void)
        pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
        for_each_online_cpu(cpu)
                smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+
+       __register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+       cpu_notifier_register_done();
+
 }
 
 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -5537,9 +5599,10 @@ int kvm_arch_init(void *opaque)
                goto out_free_percpu;
 
        kvm_set_mmio_spte_mask();
-       kvm_init_msr_list();
 
        kvm_x86_ops = ops;
+       kvm_init_msr_list();
+
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
@@ -5782,8 +5845,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static void inject_pending_event(struct kvm_vcpu *vcpu)
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 {
+       int r;
+
        /* try to reinject previous events if any */
        if (vcpu->arch.exception.pending) {
                trace_kvm_inj_exception(vcpu->arch.exception.nr,
@@ -5793,17 +5858,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
                                          vcpu->arch.exception.has_error_code,
                                          vcpu->arch.exception.error_code,
                                          vcpu->arch.exception.reinject);
-               return;
+               return 0;
        }
 
        if (vcpu->arch.nmi_injected) {
                kvm_x86_ops->set_nmi(vcpu);
-               return;
+               return 0;
        }
 
        if (vcpu->arch.interrupt.pending) {
                kvm_x86_ops->set_irq(vcpu);
-               return;
+               return 0;
+       }
+
+       if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+               r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+               if (r != 0)
+                       return r;
        }
 
        /* try to inject new event if pending */
@@ -5820,6 +5891,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
                        kvm_x86_ops->set_irq(vcpu);
                }
        }
+       return 0;
 }
 
 static void process_nmi(struct kvm_vcpu *vcpu)
@@ -5924,15 +5996,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               inject_pending_event(vcpu);
-
+               if (inject_pending_event(vcpu, req_int_win) != 0)
+                       req_immediate_exit = true;
                /* enable NMI/IRQ window open exits if needed */
-               if (vcpu->arch.nmi_pending)
-                       req_immediate_exit =
-                               kvm_x86_ops->enable_nmi_window(vcpu) != 0;
+               else if (vcpu->arch.nmi_pending)
+                       kvm_x86_ops->enable_nmi_window(vcpu);
                else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-                       req_immediate_exit =
-                               kvm_x86_ops->enable_irq_window(vcpu) != 0;
+                       kvm_x86_ops->enable_irq_window(vcpu);
 
                if (kvm_lapic_enabled(vcpu)) {
                        /*
@@ -5992,11 +6062,27 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                set_debugreg(vcpu->arch.eff_db[1], 1);
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
+               set_debugreg(vcpu->arch.dr6, 6);
        }
 
        trace_kvm_entry(vcpu->vcpu_id);
        kvm_x86_ops->run(vcpu);
 
+       /*
+        * Do this here before restoring debug registers on the host.  And
+        * since we do this before handling the vmexit, a DR access vmexit
+        * can (a) read the correct value of the debug registers, (b) set
+        * KVM_DEBUGREG_WONT_EXIT again.
+        */
+       if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+               int i;
+
+               WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+               kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+               for (i = 0; i < KVM_NR_DB_REGS; i++)
+                       vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+       }
+
        /*
         * If the guest has used debug registers, at least dr7
         * will be disabled while returning to the host.
@@ -6711,6 +6797,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
        int r;
        struct msr_data msr;
+       struct kvm *kvm = vcpu->kvm;
 
        r = vcpu_load(vcpu);
        if (r)
@@ -6721,6 +6808,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        kvm_write_tsc(vcpu, &msr);
        vcpu_put(vcpu);
 
+       schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+                                       KVMCLOCK_SYNC_PERIOD);
+
        return r;
 }
 
@@ -7013,6 +7103,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        pvclock_update_vm_gtod_copy(kvm);
 
+       INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
+       INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+
        return 0;
 }
 
@@ -7050,6 +7143,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_sync_events(struct kvm *kvm)
 {
+       cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
+       cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
        kvm_free_all_assigned_devices(kvm);
        kvm_free_pit(kvm);
 }
@@ -7228,8 +7323,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
        /*
         * Write protect all pages for dirty logging.
-        * Existing largepage mappings are destroyed here and new ones will
-        * not be created until the end of the logging.
+        *
+        * All the sptes including the large sptes which point to this
+        * slot are set to readonly. We can not create any new large
+        * spte on this slot until the end of the logging.
+        *
+        * See the comments in fast_page_fault().
         */
        if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
                kvm_mmu_slot_remove_write_access(kvm, mem->slot);
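
Note, not part of the patch: the rewritten comment describes dirty logging by write protection, where faults on read-only mappings are what record (and re-enable) writes. Purely as an analogy, the userspace program below does the same thing with mprotect() and a SIGSEGV handler; it illustrates the idea only and is not how KVM implements it on sptes:

/* Userspace analogue of dirty logging via write protection: map pages,
 * write-protect them, mark a page dirty in the fault handler and reopen
 * it for writing so the faulting store can retry.  Demo only; mprotect()
 * in a signal handler is not formally async-signal-safe. */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define NPAGES 4

static char *region;
static long page_size;
static int dirty[NPAGES];

static void segv_handler(int sig, siginfo_t *si, void *uc)
{
	long pfn = ((char *)si->si_addr - region) / page_size;

	(void)sig; (void)uc;
	if (pfn < 0 || pfn >= NPAGES)
		_exit(1);               /* a genuine stray fault */
	dirty[pfn] = 1;                 /* log the page ... */
	mprotect(region + pfn * page_size, page_size,
		 PROT_READ | PROT_WRITE);   /* ... and let the write retry */
}

int main(void)
{
	struct sigaction sa;
	int i;

	page_size = sysconf(_SC_PAGESIZE);
	region = mmap(NULL, NPAGES * page_size, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED)
		return 1;

	memset(&sa, 0, sizeof(sa));
	sigemptyset(&sa.sa_mask);
	sa.sa_sigaction = segv_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	/* start logging: write-protect everything */
	mprotect(region, NPAGES * page_size, PROT_READ);

	region[0] = 1;                  /* dirties page 0 */
	region[2 * page_size] = 1;      /* dirties page 2 */

	for (i = 0; i < NPAGES; i++)
		printf("page %d dirty=%d\n", i, dirty[i]);
	return 0;
}
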
@@ -7248,6 +7347,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
+       if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+               kvm_x86_ops->check_nested_events(vcpu, false);
+
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted)
                || !list_empty_careful(&vcpu->async_pf.done)