1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *
14  * This work is licensed under the terms of the GNU GPL, version 2.  See
15  * the COPYING file in the top-level directory.
16  *
17  */
18
19 #include <linux/frame.h>
20 #include <linux/highmem.h>
21 #include <linux/hrtimer.h>
22 #include <linux/kernel.h>
23 #include <linux/kvm_host.h>
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/mod_devicetable.h>
27 #include <linux/mm.h>
28 #include <linux/sched.h>
29 #include <linux/slab.h>
30 #include <linux/tboot.h>
31 #include <linux/trace_events.h>
32
33 #include <asm/apic.h>
34 #include <asm/asm.h>
35 #include <asm/cpu.h>
36 #include <asm/debugreg.h>
37 #include <asm/desc.h>
38 #include <asm/fpu/internal.h>
39 #include <asm/io.h>
40 #include <asm/irq_remapping.h>
41 #include <asm/kexec.h>
42 #include <asm/perf_event.h>
43 #include <asm/mce.h>
44 #include <asm/mmu_context.h>
45 #include <asm/mshyperv.h>
46 #include <asm/spec-ctrl.h>
47 #include <asm/virtext.h>
48 #include <asm/vmx.h>
49
50 #include "capabilities.h"
51 #include "cpuid.h"
52 #include "evmcs.h"
53 #include "irq.h"
54 #include "kvm_cache_regs.h"
55 #include "lapic.h"
56 #include "mmu.h"
57 #include "nested.h"
58 #include "ops.h"
59 #include "pmu.h"
60 #include "trace.h"
61 #include "vmcs.h"
62 #include "vmcs12.h"
63 #include "vmx.h"
64 #include "x86.h"
65
66 MODULE_AUTHOR("Qumranet");
67 MODULE_LICENSE("GPL");
68
69 static const struct x86_cpu_id vmx_cpu_id[] = {
70         X86_FEATURE_MATCH(X86_FEATURE_VMX),
71         {}
72 };
73 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
74
75 bool __read_mostly enable_vpid = 1;
76 module_param_named(vpid, enable_vpid, bool, 0444);
77
78 static bool __read_mostly enable_vnmi = 1;
79 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
80
81 bool __read_mostly flexpriority_enabled = 1;
82 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
83
84 bool __read_mostly enable_ept = 1;
85 module_param_named(ept, enable_ept, bool, S_IRUGO);
86
87 bool __read_mostly enable_unrestricted_guest = 1;
88 module_param_named(unrestricted_guest,
89                         enable_unrestricted_guest, bool, S_IRUGO);
90
91 bool __read_mostly enable_ept_ad_bits = 1;
92 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
93
94 static bool __read_mostly emulate_invalid_guest_state = true;
95 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
96
97 static bool __read_mostly fasteoi = 1;
98 module_param(fasteoi, bool, S_IRUGO);
99
100 static bool __read_mostly enable_apicv = 1;
101 module_param(enable_apicv, bool, S_IRUGO);
102
103 /*
104  * If nested=1, nested virtualization is supported, i.e., guests may use
105  * VMX and act as hypervisors for their own guests. If nested=0, guests
106  * may not use VMX instructions.
107  */
108 static bool __read_mostly nested = 1;
109 module_param(nested, bool, S_IRUGO);
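/*
 * Illustrative usage (added comment, not from the original source): nested
 * VMX can be requested at module load time, e.g. "modprobe kvm_intel
 * nested=1"; the S_IRUGO (0444) permission makes the value read-only once
 * the module is loaded.
 */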
110
111 static u64 __read_mostly host_xss;
112
113 bool __read_mostly enable_pml = 1;
114 module_param_named(pml, enable_pml, bool, S_IRUGO);
115
116 #define MSR_BITMAP_MODE_X2APIC          1
117 #define MSR_BITMAP_MODE_X2APIC_APICV    2
118
119 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
120
121 /* Guest_tsc -> host_tsc conversion requires 64-bit division; hence the preemption timer module parameter is only exposed on 64-bit builds.  */
122 static int __read_mostly cpu_preemption_timer_multi;
123 static bool __read_mostly enable_preemption_timer = 1;
124 #ifdef CONFIG_X86_64
125 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
126 #endif
127
128 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
129 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
130 #define KVM_VM_CR0_ALWAYS_ON                            \
131         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |      \
132          X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
133 #define KVM_CR4_GUEST_OWNED_BITS                                      \
134         (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
135          | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
136
137 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
138 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
139 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
140
141 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
142
143 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
144         RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
145         RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
146         RTIT_STATUS_BYTECNT))
147
148 #define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
149         (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
150
151 /*
152  * These two parameters configure the controls for Pause-Loop Exiting:
153  * ple_gap:    upper bound on the amount of time between two successive
154  *             executions of PAUSE in a loop; also indicates whether PLE is
155  *             enabled. Tests show this time is usually below 128 cycles.
156  * ple_window: upper bound on the amount of time a guest is allowed to execute
157  *             in a PAUSE loop. Tests indicate that most spinlocks are held
158  *             for less than 2^12 cycles.
159  * Time is measured using a counter that runs at the same rate as the TSC;
160  * see SDM volume 3B, sections 21.6.13 and 22.1.3.
161  */
162 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
163 module_param(ple_gap, uint, 0444);
164
165 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
166 module_param(ple_window, uint, 0444);
167
168 /* Default doubles per-vcpu window every exit. */
169 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
170 module_param(ple_window_grow, uint, 0444);
171
172 /* Default resets per-vcpu window every exit to ple_window. */
173 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
174 module_param(ple_window_shrink, uint, 0444);
175
176 /* Default is to compute the maximum so we can never overflow. */
177 static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
178 module_param(ple_window_max, uint, 0444);
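/*
 * Illustrative usage (added comment, not from the original source): the PLE
 * knobs above are read-only at runtime (0444) but can be set at load time,
 * e.g. "modprobe kvm_intel ple_gap=128 ple_window=4096"; ple_gap=0 disables
 * Pause-Loop Exiting entirely.
 */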
179
180 /* Default is SYSTEM mode, 1 for host-guest mode */
181 int __read_mostly pt_mode = PT_MODE_SYSTEM;
182 module_param(pt_mode, int, S_IRUGO);
183
184 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
185 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
186 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
187
188 /* Storage for pre module init parameter parsing */
189 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
190
191 static const struct {
192         const char *option;
193         bool for_parse;
194 } vmentry_l1d_param[] = {
195         [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
196         [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
197         [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
198         [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
199         [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
200         [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
201 };
202
203 #define L1D_CACHE_ORDER 4
204 static void *vmx_l1d_flush_pages;
205
206 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
207 {
208         struct page *page;
209         unsigned int i;
210
211         if (!enable_ept) {
212                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
213                 return 0;
214         }
215
216         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
217                 u64 msr;
218
219                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
220                 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
221                         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
222                         return 0;
223                 }
224         }
225
226         /* If set to auto, use the default L1TF mitigation method */
227         if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
228                 switch (l1tf_mitigation) {
229                 case L1TF_MITIGATION_OFF:
230                         l1tf = VMENTER_L1D_FLUSH_NEVER;
231                         break;
232                 case L1TF_MITIGATION_FLUSH_NOWARN:
233                 case L1TF_MITIGATION_FLUSH:
234                 case L1TF_MITIGATION_FLUSH_NOSMT:
235                         l1tf = VMENTER_L1D_FLUSH_COND;
236                         break;
237                 case L1TF_MITIGATION_FULL:
238                 case L1TF_MITIGATION_FULL_FORCE:
239                         l1tf = VMENTER_L1D_FLUSH_ALWAYS;
240                         break;
241                 }
242         } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
243                 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
244         }
245
246         if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
247             !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
248                 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
249                 if (!page)
250                         return -ENOMEM;
251                 vmx_l1d_flush_pages = page_address(page);
252
253                 /*
254                  * Initialize each page with a different pattern in
255                  * order to protect against KSM in the nested
256                  * virtualization case.
257                  */
258                 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
259                         memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
260                                PAGE_SIZE);
261                 }
262         }
263
264         l1tf_vmx_mitigation = l1tf;
265
266         if (l1tf != VMENTER_L1D_FLUSH_NEVER)
267                 static_branch_enable(&vmx_l1d_should_flush);
268         else
269                 static_branch_disable(&vmx_l1d_should_flush);
270
271         if (l1tf == VMENTER_L1D_FLUSH_COND)
272                 static_branch_enable(&vmx_l1d_flush_cond);
273         else
274                 static_branch_disable(&vmx_l1d_flush_cond);
275         return 0;
276 }
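/*
 * Informational summary (added comment, not from the original source): after
 * vmx_setup_l1d_flush() runs, vmx_l1d_should_flush is enabled for the "cond"
 * and "always" modes, and vmx_l1d_flush_cond is additionally enabled only for
 * "cond"; "never", "EPT disabled" and "not required" leave both static keys
 * disabled.
 */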
277
278 static int vmentry_l1d_flush_parse(const char *s)
279 {
280         unsigned int i;
281
282         if (s) {
283                 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
284                         if (vmentry_l1d_param[i].for_parse &&
285                             sysfs_streq(s, vmentry_l1d_param[i].option))
286                                 return i;
287                 }
288         }
289         return -EINVAL;
290 }
291
292 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
293 {
294         int l1tf, ret;
295
296         l1tf = vmentry_l1d_flush_parse(s);
297         if (l1tf < 0)
298                 return l1tf;
299
300         if (!boot_cpu_has(X86_BUG_L1TF))
301                 return 0;
302
303         /*
304          * Has vmx_init() run already? If not then this is the pre init
305          * parameter parsing. In that case just store the value and let
306          * vmx_init() do the proper setup after enable_ept has been
307          * established.
308          */
309         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
310                 vmentry_l1d_flush_param = l1tf;
311                 return 0;
312         }
313
314         mutex_lock(&vmx_l1d_flush_mutex);
315         ret = vmx_setup_l1d_flush(l1tf);
316         mutex_unlock(&vmx_l1d_flush_mutex);
317         return ret;
318 }
319
320 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
321 {
322         if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
323                 return sprintf(s, "???\n");
324
325         return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
326 }
327
328 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
329         .set = vmentry_l1d_flush_set,
330         .get = vmentry_l1d_flush_get,
331 };
332 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
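/*
 * Illustrative usage (added comment, not from the original source): because
 * the parameter is registered with mode 0644, root can re-select the
 * mitigation at runtime, e.g.
 * "echo always > /sys/module/kvm_intel/parameters/vmentry_l1d_flush",
 * which funnels through vmentry_l1d_flush_set() and vmx_setup_l1d_flush().
 */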
333
334 static bool guest_state_valid(struct kvm_vcpu *vcpu);
335 static u32 vmx_segment_access_rights(struct kvm_segment *var);
336 static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
337                                                           u32 msr, int type);
338
339 void vmx_vmexit(void);
340
341 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
342 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
343 /*
344  * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is
345  * needed when a CPU is brought down and we need to VMCLEAR all of them.
346  */
347 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
348
349 /*
350  * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
351  * can find which vCPU should be woken up.
352  */
353 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
354 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
355
356 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
357 static DEFINE_SPINLOCK(vmx_vpid_lock);
358
359 struct vmcs_config vmcs_config;
360 struct vmx_capability vmx_capability;
361
362 #define VMX_SEGMENT_FIELD(seg)                                  \
363         [VCPU_SREG_##seg] = {                                   \
364                 .selector = GUEST_##seg##_SELECTOR,             \
365                 .base = GUEST_##seg##_BASE,                     \
366                 .limit = GUEST_##seg##_LIMIT,                   \
367                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
368         }
369
370 static const struct kvm_vmx_segment_field {
371         unsigned selector;
372         unsigned base;
373         unsigned limit;
374         unsigned ar_bytes;
375 } kvm_vmx_segment_fields[] = {
376         VMX_SEGMENT_FIELD(CS),
377         VMX_SEGMENT_FIELD(DS),
378         VMX_SEGMENT_FIELD(ES),
379         VMX_SEGMENT_FIELD(FS),
380         VMX_SEGMENT_FIELD(GS),
381         VMX_SEGMENT_FIELD(SS),
382         VMX_SEGMENT_FIELD(TR),
383         VMX_SEGMENT_FIELD(LDTR),
384 };
385
386 u64 host_efer;
387
388 /*
389  * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
390  * will emulate SYSCALL in legacy mode if the vendor string in guest
391  * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
392  * support this emulation, IA32_STAR must always be included in
393  * vmx_msr_index[], even in i386 builds.
394  */
395 const u32 vmx_msr_index[] = {
396 #ifdef CONFIG_X86_64
397         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
398 #endif
399         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
400 };
401
402 #if IS_ENABLED(CONFIG_HYPERV)
403 static bool __read_mostly enlightened_vmcs = true;
404 module_param(enlightened_vmcs, bool, 0444);
405
406 /* check_ept_pointer_match() must be called with ept_pointer_lock held. */
407 static void check_ept_pointer_match(struct kvm *kvm)
408 {
409         struct kvm_vcpu *vcpu;
410         u64 tmp_eptp = INVALID_PAGE;
411         int i;
412
413         kvm_for_each_vcpu(i, vcpu, kvm) {
414                 if (!VALID_PAGE(tmp_eptp)) {
415                         tmp_eptp = to_vmx(vcpu)->ept_pointer;
416                 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
417                         to_kvm_vmx(kvm)->ept_pointers_match
418                                 = EPT_POINTERS_MISMATCH;
419                         return;
420                 }
421         }
422
423         to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
424 }
425
426 int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
427                 void *data)
428 {
429         struct kvm_tlb_range *range = data;
430
431         return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
432                         range->pages);
433 }
434
435 static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
436                 struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
437 {
438         u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
439
440         /*
441          * The FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the
442          * address of the base of the EPT PML4 table; strip off the EPT
443          * configuration information held in the low bits.
444          */
445         if (range)
446                 return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
447                                 kvm_fill_hv_flush_list_func, (void *)range);
448         else
449                 return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
450 }
451
452 static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
453                 struct kvm_tlb_range *range)
454 {
455         struct kvm_vcpu *vcpu;
456         int ret = -ENOTSUPP, i;
457
458         spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
459
460         if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
461                 check_ept_pointer_match(kvm);
462
463         if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
464                 kvm_for_each_vcpu(i, vcpu, kvm) {
465                         /* If ept_pointer is invalid, skip the flush request for this vCPU. */
466                         if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
467                                 ret |= __hv_remote_flush_tlb_with_range(
468                                         kvm, vcpu, range);
469                 }
470         } else {
471                 ret = __hv_remote_flush_tlb_with_range(kvm,
472                                 kvm_get_vcpu(kvm, 0), range);
473         }
474
475         spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
476         return ret;
477 }
478 static int hv_remote_flush_tlb(struct kvm *kvm)
479 {
480         return hv_remote_flush_tlb_with_range(kvm, NULL);
481 }
482
483 #endif /* IS_ENABLED(CONFIG_HYPERV) */
484
485 /*
486  * Comment format: document - errata name - stepping - processor name.
487  * Taken from
488  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
489  */
490 static u32 vmx_preemption_cpu_tfms[] = {
491 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
492 0x000206E6,
493 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
494 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
495 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
496 0x00020652,
497 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
498 0x00020655,
499 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
500 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
501 /*
502  * 320767.pdf - AAP86  - B1 -
503  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
504  */
505 0x000106E5,
506 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
507 0x000106A0,
508 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
509 0x000106A1,
510 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
511 0x000106A4,
512  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
513  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
514  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
515 0x000106A5,
516  /* Xeon E3-1220 V2 */
517 0x000306A8,
518 };
519
520 static inline bool cpu_has_broken_vmx_preemption_timer(void)
521 {
522         u32 eax = cpuid_eax(0x00000001), i;
523
524         /* Clear the reserved CPUID.1 EAX bits (15:14 and 31:28) */
525         eax &= ~(0x3U << 14 | 0xfU << 28);
526         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
527                 if (eax == vmx_preemption_cpu_tfms[i])
528                         return true;
529
530         return false;
531 }
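/*
 * Informational (added comment): a "true" return is expected to make the
 * hardware setup code disable use of the VMX preemption timer on the
 * affected steppings listed in vmx_preemption_cpu_tfms[] above.
 */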
532
533 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
534 {
535         return flexpriority_enabled && lapic_in_kernel(vcpu);
536 }
537
538 static inline bool report_flexpriority(void)
539 {
540         return flexpriority_enabled;
541 }
542
543 static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
544 {
545         int i;
546
547         for (i = 0; i < vmx->nmsrs; ++i)
548                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
549                         return i;
550         return -1;
551 }
552
553 struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
554 {
555         int i;
556
557         i = __find_msr_index(vmx, msr);
558         if (i >= 0)
559                 return &vmx->guest_msrs[i];
560         return NULL;
561 }
562
563 void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
564 {
565         vmcs_clear(loaded_vmcs->vmcs);
566         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
567                 vmcs_clear(loaded_vmcs->shadow_vmcs);
568         loaded_vmcs->cpu = -1;
569         loaded_vmcs->launched = 0;
570 }
571
572 #ifdef CONFIG_KEXEC_CORE
573 /*
574  * This bitmap indicates, per CPU, whether the crash-time vmclear
575  * operation is enabled. It is disabled on all CPUs by
576  * default.
577  */
578 static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
579
580 static inline void crash_enable_local_vmclear(int cpu)
581 {
582         cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
583 }
584
585 static inline void crash_disable_local_vmclear(int cpu)
586 {
587         cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
588 }
589
590 static inline int crash_local_vmclear_enabled(int cpu)
591 {
592         return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
593 }
594
595 static void crash_vmclear_local_loaded_vmcss(void)
596 {
597         int cpu = raw_smp_processor_id();
598         struct loaded_vmcs *v;
599
600         if (!crash_local_vmclear_enabled(cpu))
601                 return;
602
603         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
604                             loaded_vmcss_on_cpu_link)
605                 vmcs_clear(v->vmcs);
606 }
607 #else
608 static inline void crash_enable_local_vmclear(int cpu) { }
609 static inline void crash_disable_local_vmclear(int cpu) { }
610 #endif /* CONFIG_KEXEC_CORE */
611
612 static void __loaded_vmcs_clear(void *arg)
613 {
614         struct loaded_vmcs *loaded_vmcs = arg;
615         int cpu = raw_smp_processor_id();
616
617         if (loaded_vmcs->cpu != cpu)
618                 return; /* vcpu migration can race with cpu offline */
619         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
620                 per_cpu(current_vmcs, cpu) = NULL;
621         crash_disable_local_vmclear(cpu);
622         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
623
624         /*
625          * Ensure that the update of loaded_vmcs->loaded_vmcss_on_cpu_link
626          * happens before loaded_vmcs->cpu is set to -1 in
627          * loaded_vmcs_init(). Otherwise another CPU could see cpu == -1
628          * first and add the VMCS to its per-cpu list before it is deleted.
629          */
630         smp_wmb();
631
632         loaded_vmcs_init(loaded_vmcs);
633         crash_enable_local_vmclear(cpu);
634 }
635
636 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
637 {
638         int cpu = loaded_vmcs->cpu;
639
640         if (cpu != -1)
641                 smp_call_function_single(cpu,
642                          __loaded_vmcs_clear, loaded_vmcs, 1);
643 }
644
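/*
 * Informational (added comment): the segment cache below avoids redundant
 * VMREADs of guest segment state.  vmx_segment_cache_test_set() reports
 * whether the requested field is already cached and marks it as cached;
 * clearing the VCPU_EXREG_SEGMENTS bit in regs_avail invalidates the whole
 * cache on the next access.
 */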
645 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
646                                        unsigned field)
647 {
648         bool ret;
649         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
650
651         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
652                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
653                 vmx->segment_cache.bitmask = 0;
654         }
655         ret = vmx->segment_cache.bitmask & mask;
656         vmx->segment_cache.bitmask |= mask;
657         return ret;
658 }
659
660 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
661 {
662         u16 *p = &vmx->segment_cache.seg[seg].selector;
663
664         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
665                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
666         return *p;
667 }
668
669 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
670 {
671         ulong *p = &vmx->segment_cache.seg[seg].base;
672
673         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
674                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
675         return *p;
676 }
677
678 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
679 {
680         u32 *p = &vmx->segment_cache.seg[seg].limit;
681
682         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
683                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
684         return *p;
685 }
686
687 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
688 {
689         u32 *p = &vmx->segment_cache.seg[seg].ar;
690
691         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
692                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
693         return *p;
694 }
695
696 void update_exception_bitmap(struct kvm_vcpu *vcpu)
697 {
698         u32 eb;
699
700         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
701              (1u << DB_VECTOR) | (1u << AC_VECTOR);
702         /*
703          * Guest access to VMware backdoor ports could legitimately
704          * trigger #GP because of the TSS I/O permission bitmap.
705          * We intercept those #GPs and allow access to the ports
706          * anyway, as VMware does.
707          */
708         if (enable_vmware_backdoor)
709                 eb |= (1u << GP_VECTOR);
710         if ((vcpu->guest_debug &
711              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
712             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
713                 eb |= 1u << BP_VECTOR;
714         if (to_vmx(vcpu)->rmode.vm86_active)
715                 eb = ~0;
716         if (enable_ept)
717                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
718
719         /* When we are running a nested L2 guest and L1 specified for it a
720          * certain exception bitmap, we must trap the same exceptions and pass
721          * them to L1. When running L2, we will only handle the exceptions
722          * specified above if L1 did not want them.
723          */
724         if (is_guest_mode(vcpu))
725                 eb |= get_vmcs12(vcpu)->exception_bitmap;
726
727         vmcs_write32(EXCEPTION_BITMAP, eb);
728 }
729
730 /*
731  * Check whether a write to the given MSR is intercepted by the currently loaded MSR bitmap.
732  */
733 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
734 {
735         unsigned long *msr_bitmap;
736         int f = sizeof(unsigned long);
737
738         if (!cpu_has_vmx_msr_bitmap())
739                 return true;
740
741         msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
742
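        /*
         * Informational (added comment): per the VMX MSR-bitmap layout, the
         * write bitmaps occupy the upper half of the 4K page: offset 0x800
         * covers MSRs 0x00000000-0x00001fff and offset 0xc00 covers MSRs
         * 0xc0000000-0xc0001fff, one bit per MSR.
         */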
743         if (msr <= 0x1fff) {
744                 return !!test_bit(msr, msr_bitmap + 0x800 / f);
745         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
746                 msr &= 0x1fff;
747                 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
748         }
749
750         return true;
751 }
752
753 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
754                 unsigned long entry, unsigned long exit)
755 {
756         vm_entry_controls_clearbit(vmx, entry);
757         vm_exit_controls_clearbit(vmx, exit);
758 }
759
760 static int find_msr(struct vmx_msrs *m, unsigned int msr)
761 {
762         unsigned int i;
763
764         for (i = 0; i < m->nr; ++i) {
765                 if (m->val[i].index == msr)
766                         return i;
767         }
768         return -ENOENT;
769 }
770
771 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
772 {
773         int i;
774         struct msr_autoload *m = &vmx->msr_autoload;
775
776         switch (msr) {
777         case MSR_EFER:
778                 if (cpu_has_load_ia32_efer()) {
779                         clear_atomic_switch_msr_special(vmx,
780                                         VM_ENTRY_LOAD_IA32_EFER,
781                                         VM_EXIT_LOAD_IA32_EFER);
782                         return;
783                 }
784                 break;
785         case MSR_CORE_PERF_GLOBAL_CTRL:
786                 if (cpu_has_load_perf_global_ctrl()) {
787                         clear_atomic_switch_msr_special(vmx,
788                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
789                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
790                         return;
791                 }
792                 break;
793         }
794         i = find_msr(&m->guest, msr);
795         if (i < 0)
796                 goto skip_guest;
797         --m->guest.nr;
798         m->guest.val[i] = m->guest.val[m->guest.nr];
799         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
800
801 skip_guest:
802         i = find_msr(&m->host, msr);
803         if (i < 0)
804                 return;
805
806         --m->host.nr;
807         m->host.val[i] = m->host.val[m->host.nr];
808         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
809 }
810
811 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
812                 unsigned long entry, unsigned long exit,
813                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
814                 u64 guest_val, u64 host_val)
815 {
816         vmcs_write64(guest_val_vmcs, guest_val);
817         if (host_val_vmcs != HOST_IA32_EFER)
818                 vmcs_write64(host_val_vmcs, host_val);
819         vm_entry_controls_setbit(vmx, entry);
820         vm_exit_controls_setbit(vmx, exit);
821 }
822
823 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
824                                   u64 guest_val, u64 host_val, bool entry_only)
825 {
826         int i, j = 0;
827         struct msr_autoload *m = &vmx->msr_autoload;
828
829         switch (msr) {
830         case MSR_EFER:
831                 if (cpu_has_load_ia32_efer()) {
832                         add_atomic_switch_msr_special(vmx,
833                                         VM_ENTRY_LOAD_IA32_EFER,
834                                         VM_EXIT_LOAD_IA32_EFER,
835                                         GUEST_IA32_EFER,
836                                         HOST_IA32_EFER,
837                                         guest_val, host_val);
838                         return;
839                 }
840                 break;
841         case MSR_CORE_PERF_GLOBAL_CTRL:
842                 if (cpu_has_load_perf_global_ctrl()) {
843                         add_atomic_switch_msr_special(vmx,
844                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
845                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
846                                         GUEST_IA32_PERF_GLOBAL_CTRL,
847                                         HOST_IA32_PERF_GLOBAL_CTRL,
848                                         guest_val, host_val);
849                         return;
850                 }
851                 break;
852         case MSR_IA32_PEBS_ENABLE:
853                 /* PEBS needs a quiescent period after being disabled (to write
854                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
855                  * provide that period, so a CPU could write host's record into
856                  * guest's memory.
857                  */
858                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
859         }
860
861         i = find_msr(&m->guest, msr);
862         if (!entry_only)
863                 j = find_msr(&m->host, msr);
864
865         if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) || (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
866                 printk_once(KERN_WARNING "Not enough msr switch entries. "
867                                 "Can't add msr %x\n", msr);
868                 return;
869         }
870         if (i < 0) {
871                 i = m->guest.nr++;
872                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
873         }
874         m->guest.val[i].index = msr;
875         m->guest.val[i].value = guest_val;
876
877         if (entry_only)
878                 return;
879
880         if (j < 0) {
881                 j = m->host.nr++;
882                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
883         }
884         m->host.val[j].index = msr;
885         m->host.val[j].value = host_val;
886 }
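/*
 * Informational (added comment): update_transition_efer() below is a typical
 * caller, e.g. add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer,
 * false) when EFER has to be switched atomically across VM entry/exit.
 */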
887
888 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
889 {
890         u64 guest_efer = vmx->vcpu.arch.efer;
891         u64 ignore_bits = 0;
892
893         if (!enable_ept) {
894                 /*
895                  * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
896                  * host CPUID is more efficient than testing guest CPUID
897                  * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
898                  */
899                 if (boot_cpu_has(X86_FEATURE_SMEP))
900                         guest_efer |= EFER_NX;
901                 else if (!(guest_efer & EFER_NX))
902                         ignore_bits |= EFER_NX;
903         }
904
905         /*
906          * LMA and LME handled by hardware; SCE meaningless outside long mode.
907          */
908         ignore_bits |= EFER_SCE;
909 #ifdef CONFIG_X86_64
910         ignore_bits |= EFER_LMA | EFER_LME;
911         /* SCE is meaningful only in long mode on Intel */
912         if (guest_efer & EFER_LMA)
913                 ignore_bits &= ~(u64)EFER_SCE;
914 #endif
915
916         /*
917          * On EPT, we can't emulate NX, so we must switch EFER atomically.
918          * On CPUs that support "load IA32_EFER", always switch EFER
919          * atomically, since it's faster than switching it manually.
920          */
921         if (cpu_has_load_ia32_efer() ||
922             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
923                 if (!(guest_efer & EFER_LMA))
924                         guest_efer &= ~EFER_LME;
925                 if (guest_efer != host_efer)
926                         add_atomic_switch_msr(vmx, MSR_EFER,
927                                               guest_efer, host_efer, false);
928                 else
929                         clear_atomic_switch_msr(vmx, MSR_EFER);
930                 return false;
931         } else {
932                 clear_atomic_switch_msr(vmx, MSR_EFER);
933
934                 guest_efer &= ~ignore_bits;
935                 guest_efer |= host_efer & ignore_bits;
936
937                 vmx->guest_msrs[efer_offset].data = guest_efer;
938                 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
939
940                 return true;
941         }
942 }
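/*
 * Informational (added comment): a "true" return tells setup_msrs() to keep
 * EFER in the guest_msrs[] shared-MSR machinery, while "false" means EFER is
 * handled via the atomic VM-entry/VM-exit MSR switching set up above.
 */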
943
944 #ifdef CONFIG_X86_32
945 /*
946  * On 32-bit kernels, VM exits still load the FS and GS bases from the
947  * VMCS rather than the segment table.  KVM uses this helper to figure
948  * out the current bases to poke them into the VMCS before entry.
949  */
950 static unsigned long segment_base(u16 selector)
951 {
952         struct desc_struct *table;
953         unsigned long v;
954
955         if (!(selector & ~SEGMENT_RPL_MASK))
956                 return 0;
957
958         table = get_current_gdt_ro();
959
960         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
961                 u16 ldt_selector = kvm_read_ldt();
962
963                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
964                         return 0;
965
966                 table = (struct desc_struct *)segment_base(ldt_selector);
967         }
968         v = get_desc_base(&table[selector >> 3]);
969         return v;
970 }
971 #endif
972
973 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
974 {
975         u32 i;
976
977         wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
978         wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
979         wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
980         wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
981         for (i = 0; i < addr_range; i++) {
982                 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
983                 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
984         }
985 }
986
987 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
988 {
989         u32 i;
990
991         rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
992         rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
993         rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
994         rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
995         for (i = 0; i < addr_range; i++) {
996                 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
997                 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
998         }
999 }
1000
1001 static void pt_guest_enter(struct vcpu_vmx *vmx)
1002 {
1003         if (pt_mode == PT_MODE_SYSTEM)
1004                 return;
1005
1006         /*
1007          * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1008          * Save host state before VM entry.
1009          */
1010         rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1011         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1012                 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1013                 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1014                 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1015         }
1016 }
1017
1018 static void pt_guest_exit(struct vcpu_vmx *vmx)
1019 {
1020         if (pt_mode == PT_MODE_SYSTEM)
1021                 return;
1022
1023         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1024                 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1025                 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1026         }
1027
1028         /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
1029         wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1030 }
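/*
 * Informational (added comment): pt_guest_enter()/pt_guest_exit() only swap
 * the Intel PT MSR context when pt_mode selects host-guest mode; in the
 * default PT_MODE_SYSTEM mode both helpers return immediately.
 */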
1031
1032 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1033 {
1034         struct vcpu_vmx *vmx = to_vmx(vcpu);
1035         struct vmcs_host_state *host_state;
1036 #ifdef CONFIG_X86_64
1037         int cpu = raw_smp_processor_id();
1038 #endif
1039         unsigned long fs_base, gs_base;
1040         u16 fs_sel, gs_sel;
1041         int i;
1042
1043         vmx->req_immediate_exit = false;
1044
1045         /*
1046          * Note that guest MSRs to be saved/restored can also be changed
1047          * when guest state is loaded. This happens when guest transitions
1048          * to/from long-mode by setting MSR_EFER.LMA.
1049          */
1050         if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
1051                 vmx->guest_msrs_dirty = false;
1052                 for (i = 0; i < vmx->save_nmsrs; ++i)
1053                         kvm_set_shared_msr(vmx->guest_msrs[i].index,
1054                                            vmx->guest_msrs[i].data,
1055                                            vmx->guest_msrs[i].mask);
1056
1057         }
1058
1059         if (vmx->loaded_cpu_state)
1060                 return;
1061
1062         vmx->loaded_cpu_state = vmx->loaded_vmcs;
1063         host_state = &vmx->loaded_cpu_state->host_state;
1064
1065         /*
1066          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1067          * allow segment selectors with cpl > 0 or ti == 1.
1068          */
1069         host_state->ldt_sel = kvm_read_ldt();
1070
1071 #ifdef CONFIG_X86_64
1072         savesegment(ds, host_state->ds_sel);
1073         savesegment(es, host_state->es_sel);
1074
1075         gs_base = cpu_kernelmode_gs_base(cpu);
1076         if (likely(is_64bit_mm(current->mm))) {
1077                 save_fsgs_for_kvm();
1078                 fs_sel = current->thread.fsindex;
1079                 gs_sel = current->thread.gsindex;
1080                 fs_base = current->thread.fsbase;
1081                 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
1082         } else {
1083                 savesegment(fs, fs_sel);
1084                 savesegment(gs, gs_sel);
1085                 fs_base = read_msr(MSR_FS_BASE);
1086                 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1087         }
1088
1089         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1090 #else
1091         savesegment(fs, fs_sel);
1092         savesegment(gs, gs_sel);
1093         fs_base = segment_base(fs_sel);
1094         gs_base = segment_base(gs_sel);
1095 #endif
1096
1097         if (unlikely(fs_sel != host_state->fs_sel)) {
1098                 if (!(fs_sel & 7))
1099                         vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1100                 else
1101                         vmcs_write16(HOST_FS_SELECTOR, 0);
1102                 host_state->fs_sel = fs_sel;
1103         }
1104         if (unlikely(gs_sel != host_state->gs_sel)) {
1105                 if (!(gs_sel & 7))
1106                         vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1107                 else
1108                         vmcs_write16(HOST_GS_SELECTOR, 0);
1109                 host_state->gs_sel = gs_sel;
1110         }
1111         if (unlikely(fs_base != host_state->fs_base)) {
1112                 vmcs_writel(HOST_FS_BASE, fs_base);
1113                 host_state->fs_base = fs_base;
1114         }
1115         if (unlikely(gs_base != host_state->gs_base)) {
1116                 vmcs_writel(HOST_GS_BASE, gs_base);
1117                 host_state->gs_base = gs_base;
1118         }
1119 }
1120
1121 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1122 {
1123         struct vmcs_host_state *host_state;
1124
1125         if (!vmx->loaded_cpu_state)
1126                 return;
1127
1128         WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
1129         host_state = &vmx->loaded_cpu_state->host_state;
1130
1131         ++vmx->vcpu.stat.host_state_reload;
1132         vmx->loaded_cpu_state = NULL;
1133
1134 #ifdef CONFIG_X86_64
1135         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1136 #endif
1137         if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1138                 kvm_load_ldt(host_state->ldt_sel);
1139 #ifdef CONFIG_X86_64
1140                 load_gs_index(host_state->gs_sel);
1141 #else
1142                 loadsegment(gs, host_state->gs_sel);
1143 #endif
1144         }
1145         if (host_state->fs_sel & 7)
1146                 loadsegment(fs, host_state->fs_sel);
1147 #ifdef CONFIG_X86_64
1148         if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1149                 loadsegment(ds, host_state->ds_sel);
1150                 loadsegment(es, host_state->es_sel);
1151         }
1152 #endif
1153         invalidate_tss_limit();
1154 #ifdef CONFIG_X86_64
1155         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1156 #endif
1157         load_fixmap_gdt(raw_smp_processor_id());
1158 }
1159
1160 #ifdef CONFIG_X86_64
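/*
 * Informational (added comment): while guest state is loaded, the guest's
 * MSR_KERNEL_GS_BASE value lives in the hardware MSR rather than in the
 * cached field, so the accessors below disable preemption to keep
 * vmx_prepare_switch_to_host() from running between the check and the
 * rdmsrl()/wrmsrl().
 */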
1161 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1162 {
1163         preempt_disable();
1164         if (vmx->loaded_cpu_state)
1165                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1166         preempt_enable();
1167         return vmx->msr_guest_kernel_gs_base;
1168 }
1169
1170 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1171 {
1172         preempt_disable();
1173         if (vmx->loaded_cpu_state)
1174                 wrmsrl(MSR_KERNEL_GS_BASE, data);
1175         preempt_enable();
1176         vmx->msr_guest_kernel_gs_base = data;
1177 }
1178 #endif
1179
1180 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
1181 {
1182         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
1183         struct pi_desc old, new;
1184         unsigned int dest;
1185
1186         /*
1187          * In case of hot-plug or hot-unplug, we may have to undo
1188          * vmx_vcpu_pi_put even if there is no assigned device.  And we
1189          * always keep PI.NDST up to date for simplicity: it makes the
1190          * code easier, and CPU migration is not a fast path.
1191          */
1192         if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
1193                 return;
1194
1195         /*
1196          * First handle the simple case where no cmpxchg is necessary; just
1197          * allow posting non-urgent interrupts.
1198          *
1199          * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
1200          * PI.NDST: pi_post_block will do it for us and the wakeup_handler
1201          * expects the VCPU to be on the blocked_vcpu_list that matches
1202          * PI.NDST.
1203          */
1204         if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
1205             vcpu->cpu == cpu) {
1206                 pi_clear_sn(pi_desc);
1207                 return;
1208         }
1209
1210         /* The full case.  */
1211         do {
1212                 old.control = new.control = pi_desc->control;
1213
1214                 dest = cpu_physical_id(cpu);
1215
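                /*
                 * Informational (added comment): in xAPIC mode the posted-
                 * interrupt destination (PI.NDST) holds the APIC ID in bits
                 * 15:8, while x2APIC mode uses the full 32-bit APIC ID,
                 * matching the two assignments below.
                 */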
1216                 if (x2apic_enabled())
1217                         new.ndst = dest;
1218                 else
1219                         new.ndst = (dest << 8) & 0xFF00;
1220
1221                 new.sn = 0;
1222         } while (cmpxchg64(&pi_desc->control, old.control,
1223                            new.control) != old.control);
1224 }
1225
1226 /*
1227  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1228  * vcpu mutex is already taken.
1229  */
1230 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1231 {
1232         struct vcpu_vmx *vmx = to_vmx(vcpu);
1233         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1234
1235         if (!already_loaded) {
1236                 loaded_vmcs_clear(vmx->loaded_vmcs);
1237                 local_irq_disable();
1238                 crash_disable_local_vmclear(cpu);
1239
1240                 /*
1241                  * Read loaded_vmcs->cpu should be before fetching
1242                  * loaded_vmcs->loaded_vmcss_on_cpu_link.
1243                  * See the comments in __loaded_vmcs_clear().
1244                  */
1245                 smp_rmb();
1246
1247                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1248                          &per_cpu(loaded_vmcss_on_cpu, cpu));
1249                 crash_enable_local_vmclear(cpu);
1250                 local_irq_enable();
1251         }
1252
1253         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
1254                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1255                 vmcs_load(vmx->loaded_vmcs->vmcs);
1256                 indirect_branch_prediction_barrier();
1257         }
1258
1259         if (!already_loaded) {
1260                 void *gdt = get_current_gdt_ro();
1261                 unsigned long sysenter_esp;
1262
1263                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1264
1265                 /*
1266                  * Linux uses per-cpu TSS and GDT, so set these when switching
1267                  * processors.  See 22.2.4.
1268                  */
1269                 vmcs_writel(HOST_TR_BASE,
1270                             (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1271                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
1272
1273                 /*
1274                  * VM exits change the host TR limit to 0x67 after a VM
1275                  * exit.  This is okay, since 0x67 covers everything except
1276                  * the IO bitmap and we have code to handle the IO bitmap
1277                  * being lost after a VM exit.
1278                  */
1279                 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
1280
1281                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1282                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1283
1284                 vmx->loaded_vmcs->cpu = cpu;
1285         }
1286
1287         /* Setup TSC multiplier */
1288         if (kvm_has_tsc_control &&
1289             vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
1290                 decache_tsc_multiplier(vmx);
1291
1292         vmx_vcpu_pi_load(vcpu, cpu);
1293         vmx->host_pkru = read_pkru();
1294         vmx->host_debugctlmsr = get_debugctlmsr();
1295 }
1296
1297 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
1298 {
1299         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
1300
1301         if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
1302                 !irq_remapping_cap(IRQ_POSTING_CAP)  ||
1303                 !kvm_vcpu_apicv_active(vcpu))
1304                 return;
1305
1306         /* Set SN when the vCPU is preempted */
1307         if (vcpu->preempted)
1308                 pi_set_sn(pi_desc);
1309 }
1310
1311 void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1312 {
1313         vmx_vcpu_pi_put(vcpu);
1314
1315         vmx_prepare_switch_to_host(to_vmx(vcpu));
1316 }
1317
1318 static bool emulation_required(struct kvm_vcpu *vcpu)
1319 {
1320         return emulate_invalid_guest_state && !guest_state_valid(vcpu);
1321 }
1322
1323 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1324
1325 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1326 {
1327         unsigned long rflags, save_rflags;
1328
1329         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
1330                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1331                 rflags = vmcs_readl(GUEST_RFLAGS);
1332                 if (to_vmx(vcpu)->rmode.vm86_active) {
1333                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1334                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1335                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1336                 }
1337                 to_vmx(vcpu)->rflags = rflags;
1338         }
1339         return to_vmx(vcpu)->rflags;
1340 }
1341
1342 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1343 {
1344         unsigned long old_rflags = vmx_get_rflags(vcpu);
1345
1346         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1347         to_vmx(vcpu)->rflags = rflags;
1348         if (to_vmx(vcpu)->rmode.vm86_active) {
1349                 to_vmx(vcpu)->rmode.save_rflags = rflags;
1350                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1351         }
1352         vmcs_writel(GUEST_RFLAGS, rflags);
1353
1354         if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
1355                 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
1356 }
1357
1358 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1359 {
1360         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1361         int ret = 0;
1362
1363         if (interruptibility & GUEST_INTR_STATE_STI)
1364                 ret |= KVM_X86_SHADOW_INT_STI;
1365         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1366                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1367
1368         return ret;
1369 }
1370
1371 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1372 {
1373         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1374         u32 interruptibility = interruptibility_old;
1375
1376         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1377
1378         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1379                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1380         else if (mask & KVM_X86_SHADOW_INT_STI)
1381                 interruptibility |= GUEST_INTR_STATE_STI;
1382
1383         if (interruptibility != interruptibility_old)
1384                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1385 }
1386
1387 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1388 {
1389         struct vcpu_vmx *vmx = to_vmx(vcpu);
1390         unsigned long value;
1391
1392         /*
1393          * Any MSR write that attempts to change bits marked reserved will
1394          * cause a #GP fault.
1395          */
1396         if (data & vmx->pt_desc.ctl_bitmask)
1397                 return 1;
1398
1399         /*
1400          * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1401          * result in a #GP unless the same write also clears TraceEn.
1402          */
1403         if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1404                 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1405                 return 1;
1406
1407         /*
1408          * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA
1409          * and FabricEn would cause #GP, if
1410          * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
1411          */
1412         if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1413                 !(data & RTIT_CTL_FABRIC_EN) &&
1414                 !intel_pt_validate_cap(vmx->pt_desc.caps,
1415                                         PT_CAP_single_range_output))
1416                 return 1;
1417
1418         /*
1419          * Check the MTCFreq, CycThresh and PSBFreq encodings: any MSR write
1420          * that uses an encoding marked reserved will cause a #GP fault.
1421          */
1422         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1423         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1424                         !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1425                         RTIT_CTL_MTC_RANGE_OFFSET, &value))
1426                 return 1;
1427         value = intel_pt_validate_cap(vmx->pt_desc.caps,
1428                                                 PT_CAP_cycle_thresholds);
1429         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1430                         !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1431                         RTIT_CTL_CYC_THRESH_OFFSET, &value))
1432                 return 1;
1433         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1434         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1435                         !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1436                         RTIT_CTL_PSB_FREQ_OFFSET, &value))
1437                 return 1;
1438
1439         /*
1440          * An ADDRx_CFG encoding that is reserved or greater than 2 will
1441          * cause a #GP fault.
1442          */
1443         value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1444         if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
1445                 return 1;
1446         value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1447         if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
1448                 return 1;
1449         value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1450         if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
1451                 return 1;
1452         value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1453         if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
1454                 return 1;
1455
1456         return 0;
1457 }
1458
1459
1460 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1461 {
1462         unsigned long rip;
1463
1464         rip = kvm_rip_read(vcpu);
1465         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1466         kvm_rip_write(vcpu, rip);
1467
1468         /* skipping an emulated instruction also counts */
1469         vmx_set_interrupt_shadow(vcpu, 0);
1470 }
1471
1472 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1473 {
1474         /*
1475          * Ensure that we clear the HLT state in the VMCS.  We don't need to
1476          * explicitly skip the instruction because if the HLT state is set,
1477          * then the instruction is already executing and RIP has already been
1478          * advanced.
1479          */
1480         if (kvm_hlt_in_guest(vcpu->kvm) &&
1481                         vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1482                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1483 }
1484
1485 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
1486 {
1487         struct vcpu_vmx *vmx = to_vmx(vcpu);
1488         unsigned nr = vcpu->arch.exception.nr;
1489         bool has_error_code = vcpu->arch.exception.has_error_code;
1490         u32 error_code = vcpu->arch.exception.error_code;
1491         u32 intr_info = nr | INTR_INFO_VALID_MASK;
1492
1493         kvm_deliver_exception_payload(vcpu);
1494
1495         if (has_error_code) {
1496                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1497                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1498         }
1499
1500         if (vmx->rmode.vm86_active) {
1501                 int inc_eip = 0;
1502                 if (kvm_exception_is_soft(nr))
1503                         inc_eip = vcpu->arch.event_exit_inst_len;
1504                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
1505                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1506                 return;
1507         }
1508
1509         WARN_ON_ONCE(vmx->emulation_required);
1510
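        /*
         * Soft exceptions (e.g. #BP from INT3, #OF from INTO) need the
         * instruction length so the return RIP pushed on the guest stack
         * points past the injecting instruction; hardware exceptions do not.
         */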
1511         if (kvm_exception_is_soft(nr)) {
1512                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1513                              vmx->vcpu.arch.event_exit_inst_len);
1514                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1515         } else
1516                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1517
1518         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1519
1520         vmx_clear_hlt(vcpu);
1521 }
1522
1523 static bool vmx_rdtscp_supported(void)
1524 {
1525         return cpu_has_vmx_rdtscp();
1526 }
1527
1528 static bool vmx_invpcid_supported(void)
1529 {
1530         return cpu_has_vmx_invpcid();
1531 }
1532
1533 /*
1534  * Swap MSR entry in host/guest MSR entry array.
1535  */
1536 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1537 {
1538         struct shared_msr_entry tmp;
1539
1540         tmp = vmx->guest_msrs[to];
1541         vmx->guest_msrs[to] = vmx->guest_msrs[from];
1542         vmx->guest_msrs[from] = tmp;
1543 }
1544
1545 /*
1546  * Set up the vmcs to automatically save and restore system
1547  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
1548  * mode, as fiddling with msrs is very expensive.
1549  */
1550 static void setup_msrs(struct vcpu_vmx *vmx)
1551 {
1552         int save_nmsrs, index;
1553
1554         save_nmsrs = 0;
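        /*
         * MSRs that actually need switching are compacted to the front of
         * guest_msrs[]; only the first save_nmsrs entries are handed to the
         * shared-MSR machinery.
         */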
1555 #ifdef CONFIG_X86_64
1556         /*
1557          * The SYSCALL MSRs are only needed on long mode guests, and only
1558          * when EFER.SCE is set.
1559          */
1560         if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
1561                 index = __find_msr_index(vmx, MSR_STAR);
1562                 if (index >= 0)
1563                         move_msr_up(vmx, index, save_nmsrs++);
1564                 index = __find_msr_index(vmx, MSR_LSTAR);
1565                 if (index >= 0)
1566                         move_msr_up(vmx, index, save_nmsrs++);
1567                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
1568                 if (index >= 0)
1569                         move_msr_up(vmx, index, save_nmsrs++);
1570         }
1571 #endif
1572         index = __find_msr_index(vmx, MSR_EFER);
1573         if (index >= 0 && update_transition_efer(vmx, index))
1574                 move_msr_up(vmx, index, save_nmsrs++);
1575         index = __find_msr_index(vmx, MSR_TSC_AUX);
1576         if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
1577                 move_msr_up(vmx, index, save_nmsrs++);
1578
1579         vmx->save_nmsrs = save_nmsrs;
1580         vmx->guest_msrs_dirty = true;
1581
1582         if (cpu_has_vmx_msr_bitmap())
1583                 vmx_update_msr_bitmap(&vmx->vcpu);
1584 }
1585
1586 static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
1587 {
1588         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1589
1590         if (is_guest_mode(vcpu) &&
1591             (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
1592                 return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
1593
1594         return vcpu->arch.tsc_offset;
1595 }
1596
1597 static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1598 {
1599         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1600         u64 g_tsc_offset = 0;
1601
1602         /*
1603          * We're here if L1 chose not to trap WRMSR to TSC. According
1604          * to the spec, this should set L1's TSC; The offset that L1
1605          * to the spec, this should set L1's TSC; the offset that L1
1606          * to the newly set TSC to get L2's TSC.
1607          */
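        /*
         * For example (hypothetical numbers): if L1 gave L2 a
         * vmcs12->tsc_offset of 0x1000 and this write sets L1's offset to
         * 0x5000, the TSC_OFFSET used while L2 runs becomes 0x6000.
         */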
1608         if (is_guest_mode(vcpu) &&
1609             (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
1610                 g_tsc_offset = vmcs12->tsc_offset;
1611
1612         trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1613                                    vcpu->arch.tsc_offset - g_tsc_offset,
1614                                    offset);
1615         vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
1616         return offset + g_tsc_offset;
1617 }
1618
1619 /*
1620  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1621  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1622  * all guests if the "nested" module option is off, and can also be disabled
1623  * for a single guest by disabling its VMX cpuid bit.
1624  */
1625 bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1626 {
1627         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
1628 }
1629
1630 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
1631                                                  uint64_t val)
1632 {
1633         uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
1634
1635         return !(val & ~valid_bits);
1636 }
1637
1638 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1639 {
1640         switch (msr->index) {
1641         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1642                 if (!nested)
1643                         return 1;
1644                 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1645         default:
1646                 return 1;
1647         }
1648
1649         return 0;
1650 }
1651
1652 /*
1653  * Reads an MSR value (of msr_info->index) into msr_info->data.
1654  * Returns 0 on success, non-0 otherwise.
1655  * Assumes vcpu_load() was already called.
1656  */
1657 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1658 {
1659         struct vcpu_vmx *vmx = to_vmx(vcpu);
1660         struct shared_msr_entry *msr;
1661         u32 index;
1662
1663         switch (msr_info->index) {
1664 #ifdef CONFIG_X86_64
1665         case MSR_FS_BASE:
1666                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
1667                 break;
1668         case MSR_GS_BASE:
1669                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
1670                 break;
1671         case MSR_KERNEL_GS_BASE:
1672                 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1673                 break;
1674 #endif
1675         case MSR_EFER:
1676                 return kvm_get_msr_common(vcpu, msr_info);
1677         case MSR_IA32_SPEC_CTRL:
1678                 if (!msr_info->host_initiated &&
1679                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
1680                         return 1;
1681
1682                 msr_info->data = to_vmx(vcpu)->spec_ctrl;
1683                 break;
1684         case MSR_IA32_ARCH_CAPABILITIES:
1685                 if (!msr_info->host_initiated &&
1686                     !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
1687                         return 1;
1688                 msr_info->data = to_vmx(vcpu)->arch_capabilities;
1689                 break;
1690         case MSR_IA32_SYSENTER_CS:
1691                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
1692                 break;
1693         case MSR_IA32_SYSENTER_EIP:
1694                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
1695                 break;
1696         case MSR_IA32_SYSENTER_ESP:
1697                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
1698                 break;
1699         case MSR_IA32_BNDCFGS:
1700                 if (!kvm_mpx_supported() ||
1701                     (!msr_info->host_initiated &&
1702                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
1703                         return 1;
1704                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
1705                 break;
1706         case MSR_IA32_MCG_EXT_CTL:
1707                 if (!msr_info->host_initiated &&
1708                     !(vmx->msr_ia32_feature_control &
1709                       FEATURE_CONTROL_LMCE))
1710                         return 1;
1711                 msr_info->data = vcpu->arch.mcg_ext_ctl;
1712                 break;
1713         case MSR_IA32_FEATURE_CONTROL:
1714                 msr_info->data = vmx->msr_ia32_feature_control;
1715                 break;
1716         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1717                 if (!nested_vmx_allowed(vcpu))
1718                         return 1;
1719                 return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
1720                                        &msr_info->data);
1721         case MSR_IA32_XSS:
1722                 if (!vmx_xsaves_supported())
1723                         return 1;
1724                 msr_info->data = vcpu->arch.ia32_xss;
1725                 break;
1726         case MSR_IA32_RTIT_CTL:
1727                 if (pt_mode != PT_MODE_HOST_GUEST)
1728                         return 1;
1729                 msr_info->data = vmx->pt_desc.guest.ctl;
1730                 break;
1731         case MSR_IA32_RTIT_STATUS:
1732                 if (pt_mode != PT_MODE_HOST_GUEST)
1733                         return 1;
1734                 msr_info->data = vmx->pt_desc.guest.status;
1735                 break;
1736         case MSR_IA32_RTIT_CR3_MATCH:
1737                 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1738                         !intel_pt_validate_cap(vmx->pt_desc.caps,
1739                                                 PT_CAP_cr3_filtering))
1740                         return 1;
1741                 msr_info->data = vmx->pt_desc.guest.cr3_match;
1742                 break;
1743         case MSR_IA32_RTIT_OUTPUT_BASE:
1744                 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1745                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
1746                                         PT_CAP_topa_output) &&
1747                          !intel_pt_validate_cap(vmx->pt_desc.caps,
1748                                         PT_CAP_single_range_output)))
1749                         return 1;
1750                 msr_info->data = vmx->pt_desc.guest.output_base;
1751                 break;
1752         case MSR_IA32_RTIT_OUTPUT_MASK:
1753                 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1754                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
1755                                         PT_CAP_topa_output) &&
1756                          !intel_pt_validate_cap(vmx->pt_desc.caps,
1757                                         PT_CAP_single_range_output)))
1758                         return 1;
1759                 msr_info->data = vmx->pt_desc.guest.output_mask;
1760                 break;
1761         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
1762                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
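                /*
                 * The ADDRn MSRs come in A/B pairs, so e.g. index 3
                 * (MSR_IA32_RTIT_ADDR1_B) maps to addr_b[1] below.
                 */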
1763                 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1764                         (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
1765                                         PT_CAP_num_address_ranges)))
1766                         return 1;
1767                 if (index % 2)
1768                         msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
1769                 else
1770                         msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
1771                 break;
1772         case MSR_TSC_AUX:
1773                 if (!msr_info->host_initiated &&
1774                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1775                         return 1;
1776                 /* Otherwise falls through */
1777         default:
1778                 msr = find_msr_entry(vmx, msr_info->index);
1779                 if (msr) {
1780                         msr_info->data = msr->data;
1781                         break;
1782                 }
1783                 return kvm_get_msr_common(vcpu, msr_info);
1784         }
1785
1786         return 0;
1787 }
1788
1789 /*
1790  * Writes an MSR value into the appropriate "register".
1791  * Returns 0 on success, non-0 otherwise.
1792  * Assumes vcpu_load() was already called.
1793  */
1794 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1795 {
1796         struct vcpu_vmx *vmx = to_vmx(vcpu);
1797         struct shared_msr_entry *msr;
1798         int ret = 0;
1799         u32 msr_index = msr_info->index;
1800         u64 data = msr_info->data;
1801         u32 index;
1802
1803         switch (msr_index) {
1804         case MSR_EFER:
1805                 ret = kvm_set_msr_common(vcpu, msr_info);
1806                 break;
1807 #ifdef CONFIG_X86_64
1808         case MSR_FS_BASE:
1809                 vmx_segment_cache_clear(vmx);
1810                 vmcs_writel(GUEST_FS_BASE, data);
1811                 break;
1812         case MSR_GS_BASE:
1813                 vmx_segment_cache_clear(vmx);
1814                 vmcs_writel(GUEST_GS_BASE, data);
1815                 break;
1816         case MSR_KERNEL_GS_BASE:
1817                 vmx_write_guest_kernel_gs_base(vmx, data);
1818                 break;
1819 #endif
1820         case MSR_IA32_SYSENTER_CS:
1821                 vmcs_write32(GUEST_SYSENTER_CS, data);
1822                 break;
1823         case MSR_IA32_SYSENTER_EIP:
1824                 vmcs_writel(GUEST_SYSENTER_EIP, data);
1825                 break;
1826         case MSR_IA32_SYSENTER_ESP:
1827                 vmcs_writel(GUEST_SYSENTER_ESP, data);
1828                 break;
1829         case MSR_IA32_BNDCFGS:
1830                 if (!kvm_mpx_supported() ||
1831                     (!msr_info->host_initiated &&
1832                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
1833                         return 1;
1834                 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
1835                     (data & MSR_IA32_BNDCFGS_RSVD))
1836                         return 1;
1837                 vmcs_write64(GUEST_BNDCFGS, data);
1838                 break;
1839         case MSR_IA32_SPEC_CTRL:
1840                 if (!msr_info->host_initiated &&
1841                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
1842                         return 1;
1843
1844                 /* The STIBP bit doesn't fault even if it's not advertised */
1845                 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
1846                         return 1;
1847
1848                 vmx->spec_ctrl = data;
1849
1850                 if (!data)
1851                         break;
1852
1853                 /*
1854                  * For non-nested:
1855                  * When it's written (to non-zero) for the first time, pass
1856                  * it through.
1857                  *
1858                  * For nested:
1859                  * The handling of the MSR bitmap for L2 guests is done in
1860                  * nested_vmx_merge_msr_bitmap. We should not touch the
1861                  * vmcs02.msr_bitmap here since it gets completely overwritten
1862                  * in the merging. We update the vmcs01 here for L1 as well
1863                  * since it will end up touching the MSR anyway now.
1864                  */
1865                 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
1866                                               MSR_IA32_SPEC_CTRL,
1867                                               MSR_TYPE_RW);
1868                 break;
1869         case MSR_IA32_PRED_CMD:
1870                 if (!msr_info->host_initiated &&
1871                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
1872                         return 1;
1873
1874                 if (data & ~PRED_CMD_IBPB)
1875                         return 1;
1876
1877                 if (!data)
1878                         break;
1879
1880                 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
1881
1882                 /*
1883                  * For non-nested:
1884                  * When it's written (to non-zero) for the first time, pass
1885                  * it through.
1886                  *
1887                  * For nested:
1888                  * The handling of the MSR bitmap for L2 guests is done in
1889                  * nested_vmx_merge_msr_bitmap. We should not touch the
1890                  * vmcs02.msr_bitmap here since it gets completely overwritten
1891                  * in the merging.
1892                  */
1893                 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
1894                                               MSR_TYPE_W);
1895                 break;
1896         case MSR_IA32_ARCH_CAPABILITIES:
1897                 if (!msr_info->host_initiated)
1898                         return 1;
1899                 vmx->arch_capabilities = data;
1900                 break;
1901         case MSR_IA32_CR_PAT:
1902                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1903                         if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
1904                                 return 1;
1905                         vmcs_write64(GUEST_IA32_PAT, data);
1906                         vcpu->arch.pat = data;
1907                         break;
1908                 }
1909                 ret = kvm_set_msr_common(vcpu, msr_info);
1910                 break;
1911         case MSR_IA32_TSC_ADJUST:
1912                 ret = kvm_set_msr_common(vcpu, msr_info);
1913                 break;
1914         case MSR_IA32_MCG_EXT_CTL:
1915                 if ((!msr_info->host_initiated &&
1916                      !(to_vmx(vcpu)->msr_ia32_feature_control &
1917                        FEATURE_CONTROL_LMCE)) ||
1918                     (data & ~MCG_EXT_CTL_LMCE_EN))
1919                         return 1;
1920                 vcpu->arch.mcg_ext_ctl = data;
1921                 break;
1922         case MSR_IA32_FEATURE_CONTROL:
1923                 if (!vmx_feature_control_msr_valid(vcpu, data) ||
1924                     (to_vmx(vcpu)->msr_ia32_feature_control &
1925                      FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
1926                         return 1;
1927                 vmx->msr_ia32_feature_control = data;
1928                 if (msr_info->host_initiated && data == 0)
1929                         vmx_leave_nested(vcpu);
1930                 break;
1931         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1932                 if (!msr_info->host_initiated)
1933                         return 1; /* they are read-only */
1934                 if (!nested_vmx_allowed(vcpu))
1935                         return 1;
1936                 return vmx_set_vmx_msr(vcpu, msr_index, data);
1937         case MSR_IA32_XSS:
1938                 if (!vmx_xsaves_supported())
1939                         return 1;
1940                 /*
1941                  * The only supported bit as of Skylake is bit 8 (PT state),
1942                  * but it is not yet supported by KVM.
1943                  */
1944                 if (data != 0)
1945                         return 1;
1946                 vcpu->arch.ia32_xss = data;
1947                 if (vcpu->arch.ia32_xss != host_xss)
1948                         add_atomic_switch_msr(vmx, MSR_IA32_XSS,
1949                                 vcpu->arch.ia32_xss, host_xss, false);
1950                 else
1951                         clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
1952                 break;
1953         case MSR_IA32_RTIT_CTL:
1954                 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1955                         vmx_rtit_ctl_check(vcpu, data) ||
1956                         vmx->nested.vmxon)
1957                         return 1;
1958                 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
1959                 vmx->pt_desc.guest.ctl = data;
1960                 pt_update_intercept_for_msr(vmx);
1961                 break;
1962         case MSR_IA32_RTIT_STATUS:
1963                 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1964                         (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
1965                         (data & MSR_IA32_RTIT_STATUS_MASK))
1966                         return 1;
1967                 vmx->pt_desc.guest.status = data;
1968                 break;
1969         case MSR_IA32_RTIT_CR3_MATCH:
1970                 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1971                         (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
1972                         !intel_pt_validate_cap(vmx->pt_desc.caps,
1973                                                 PT_CAP_cr3_filtering))
1974                         return 1;
1975                 vmx->pt_desc.guest.cr3_match = data;
1976                 break;
1977         case MSR_IA32_RTIT_OUTPUT_BASE:
1978                 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1979                         (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
1980                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
1981                                         PT_CAP_topa_output) &&
1982                          !intel_pt_validate_cap(vmx->pt_desc.caps,
1983                                         PT_CAP_single_range_output)) ||
1984                         (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
1985                         return 1;
1986                 vmx->pt_desc.guest.output_base = data;
1987                 break;
1988         case MSR_IA32_RTIT_OUTPUT_MASK:
1989                 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1990                         (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
1991                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
1992                                         PT_CAP_topa_output) &&
1993                          !intel_pt_validate_cap(vmx->pt_desc.caps,
1994                                         PT_CAP_single_range_output)))
1995                         return 1;
1996                 vmx->pt_desc.guest.output_mask = data;
1997                 break;
1998         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
1999                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2000                 if ((pt_mode != PT_MODE_HOST_GUEST) ||
2001                         (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2002                         (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2003                                         PT_CAP_num_address_ranges)))
2004                         return 1;
2005                 if (index % 2)
2006                         vmx->pt_desc.guest.addr_b[index / 2] = data;
2007                 else
2008                         vmx->pt_desc.guest.addr_a[index / 2] = data;
2009                 break;
2010         case MSR_TSC_AUX:
2011                 if (!msr_info->host_initiated &&
2012                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2013                         return 1;
2014                 /* Check reserved bit, higher 32 bits should be zero */
2015                 if ((data >> 32) != 0)
2016                         return 1;
2017                 /* Otherwise falls through */
2018         default:
2019                 msr = find_msr_entry(vmx, msr_index);
2020                 if (msr) {
2021                         u64 old_msr_data = msr->data;
2022                         msr->data = data;
2023                         if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
2024                                 preempt_disable();
2025                                 ret = kvm_set_shared_msr(msr->index, msr->data,
2026                                                          msr->mask);
2027                                 preempt_enable();
2028                                 if (ret)
2029                                         msr->data = old_msr_data;
2030                         }
2031                         break;
2032                 }
2033                 ret = kvm_set_msr_common(vcpu, msr_info);
2034         }
2035
2036         return ret;
2037 }
2038
2039 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2040 {
2041         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
2042         switch (reg) {
2043         case VCPU_REGS_RSP:
2044                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2045                 break;
2046         case VCPU_REGS_RIP:
2047                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2048                 break;
2049         case VCPU_EXREG_PDPTR:
2050                 if (enable_ept)
2051                         ept_save_pdptrs(vcpu);
2052                 break;
2053         default:
2054                 break;
2055         }
2056 }
2057
2058 static __init int cpu_has_kvm_support(void)
2059 {
2060         return cpu_has_vmx();
2061 }
2062
2063 static __init int vmx_disabled_by_bios(void)
2064 {
2065         u64 msr;
2066
2067         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
2068         if (msr & FEATURE_CONTROL_LOCKED) {
2069                 /* launched w/ TXT and VMX disabled */
2070                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2071                         && tboot_enabled())
2072                         return 1;
2073                 /* launched w/o TXT and VMX only enabled w/ TXT */
2074                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2075                         && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2076                         && !tboot_enabled()) {
2077                         printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
2078                                 "activate TXT before enabling KVM\n");
2079                         return 1;
2080                 }
2081                 /* launched w/o TXT and VMX disabled */
2082                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2083                         && !tboot_enabled())
2084                         return 1;
2085         }
2086
2087         return 0;
2088 }
2089
2090 static void kvm_cpu_vmxon(u64 addr)
2091 {
2092         cr4_set_bits(X86_CR4_VMXE);
2093         intel_pt_handle_vmx(1);
2094
2095         asm volatile ("vmxon %0" : : "m"(addr));
2096 }
2097
2098 static int hardware_enable(void)
2099 {
2100         int cpu = raw_smp_processor_id();
2101         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2102         u64 old, test_bits;
2103
2104         if (cr4_read_shadow() & X86_CR4_VMXE)
2105                 return -EBUSY;
2106
2107         /*
2108          * This can happen if we hot-added a CPU but failed to allocate
2109          * VP assist page for it.
2110          */
2111         if (static_branch_unlikely(&enable_evmcs) &&
2112             !hv_get_vp_assist_page(cpu))
2113                 return -EFAULT;
2114
2115         INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2116         INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
2117         spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
2118
2119         /*
2120          * Now we can enable the vmclear operation in kdump
2121          * since the loaded_vmcss_on_cpu list on this cpu
2122          * has been initialized.
2123          *
2124          * Though the cpu is not in VMX operation yet, it is safe to
2125          * enable the vmclear operation because the loaded_vmcss_on_cpu
2126          * list is still empty.
2127          */
2128         crash_enable_local_vmclear(cpu);
2129
2130         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2131
2132         test_bits = FEATURE_CONTROL_LOCKED;
2133         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
2134         if (tboot_enabled())
2135                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
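        /*
         * In other words: the lock bit and the outside-SMX enable bit must
         * be set before VMXON, plus the inside-SMX enable bit when the
         * kernel was launched through TXT (tboot).
         */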
2136
2137         if ((old & test_bits) != test_bits) {
2138                 /* enable and lock */
2139                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
2140         }
2141         kvm_cpu_vmxon(phys_addr);
2142         if (enable_ept)
2143                 ept_sync_global();
2144
2145         return 0;
2146 }
2147
2148 static void vmclear_local_loaded_vmcss(void)
2149 {
2150         int cpu = raw_smp_processor_id();
2151         struct loaded_vmcs *v, *n;
2152
2153         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2154                                  loaded_vmcss_on_cpu_link)
2155                 __loaded_vmcs_clear(v);
2156 }
2157
2158
2159 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
2160  * tricks.
2161  */
2162 static void kvm_cpu_vmxoff(void)
2163 {
2164         asm volatile (__ex("vmxoff"));
2165
2166         intel_pt_handle_vmx(0);
2167         cr4_clear_bits(X86_CR4_VMXE);
2168 }
2169
2170 static void hardware_disable(void)
2171 {
2172         vmclear_local_loaded_vmcss();
2173         kvm_cpu_vmxoff();
2174 }
2175
2176 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2177                                       u32 msr, u32 *result)
2178 {
2179         u32 vmx_msr_low, vmx_msr_high;
2180         u32 ctl = ctl_min | ctl_opt;
2181
2182         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2183
2184         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2185         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
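        /*
         * Worked example with made-up values: ctl_min|ctl_opt = 0x86,
         * allowed-1 (high word) = 0x16 and allowed-0 (low word) = 0x02
         * give ctl = (0x86 & 0x16) | 0x02 = 0x06; any ctl_min bit lost in
         * the masking trips the -EIO check below.
         */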
2186
2187         /* Ensure minimum (required) set of control bits are supported. */
2188         if (ctl_min & ~ctl)
2189                 return -EIO;
2190
2191         *result = ctl;
2192         return 0;
2193 }
2194
2195 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2196                                     struct vmx_capability *vmx_cap)
2197 {
2198         u32 vmx_msr_low, vmx_msr_high;
2199         u32 min, opt, min2, opt2;
2200         u32 _pin_based_exec_control = 0;
2201         u32 _cpu_based_exec_control = 0;
2202         u32 _cpu_based_2nd_exec_control = 0;
2203         u32 _vmexit_control = 0;
2204         u32 _vmentry_control = 0;
2205
2206         memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2207         min = CPU_BASED_HLT_EXITING |
2208 #ifdef CONFIG_X86_64
2209               CPU_BASED_CR8_LOAD_EXITING |
2210               CPU_BASED_CR8_STORE_EXITING |
2211 #endif
2212               CPU_BASED_CR3_LOAD_EXITING |
2213               CPU_BASED_CR3_STORE_EXITING |
2214               CPU_BASED_UNCOND_IO_EXITING |
2215               CPU_BASED_MOV_DR_EXITING |
2216               CPU_BASED_USE_TSC_OFFSETING |
2217               CPU_BASED_MWAIT_EXITING |
2218               CPU_BASED_MONITOR_EXITING |
2219               CPU_BASED_INVLPG_EXITING |
2220               CPU_BASED_RDPMC_EXITING;
2221
2222         opt = CPU_BASED_TPR_SHADOW |
2223               CPU_BASED_USE_MSR_BITMAPS |
2224               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2225         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2226                                 &_cpu_based_exec_control) < 0)
2227                 return -EIO;
2228 #ifdef CONFIG_X86_64
2229         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2230                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2231                                            ~CPU_BASED_CR8_STORE_EXITING;
2232 #endif
2233         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2234                 min2 = 0;
2235                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2236                         SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2237                         SECONDARY_EXEC_WBINVD_EXITING |
2238                         SECONDARY_EXEC_ENABLE_VPID |
2239                         SECONDARY_EXEC_ENABLE_EPT |
2240                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
2241                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2242                         SECONDARY_EXEC_DESC |
2243                         SECONDARY_EXEC_RDTSCP |
2244                         SECONDARY_EXEC_ENABLE_INVPCID |
2245                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
2246                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2247                         SECONDARY_EXEC_SHADOW_VMCS |
2248                         SECONDARY_EXEC_XSAVES |
2249                         SECONDARY_EXEC_RDSEED_EXITING |
2250                         SECONDARY_EXEC_RDRAND_EXITING |
2251                         SECONDARY_EXEC_ENABLE_PML |
2252                         SECONDARY_EXEC_TSC_SCALING |
2253                         SECONDARY_EXEC_PT_USE_GPA |
2254                         SECONDARY_EXEC_PT_CONCEAL_VMX |
2255                         SECONDARY_EXEC_ENABLE_VMFUNC |
2256                         SECONDARY_EXEC_ENCLS_EXITING;
2257                 if (adjust_vmx_controls(min2, opt2,
2258                                         MSR_IA32_VMX_PROCBASED_CTLS2,
2259                                         &_cpu_based_2nd_exec_control) < 0)
2260                         return -EIO;
2261         }
2262 #ifndef CONFIG_X86_64
2263         if (!(_cpu_based_2nd_exec_control &
2264                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2265                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2266 #endif
2267
2268         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2269                 _cpu_based_2nd_exec_control &= ~(
2270                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2271                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2272                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2273
2274         rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2275                 &vmx_cap->ept, &vmx_cap->vpid);
2276
2277         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2278                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
2279                    enabled */
2280                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2281                                              CPU_BASED_CR3_STORE_EXITING |
2282                                              CPU_BASED_INVLPG_EXITING);
2283         } else if (vmx_cap->ept) {
2284                 vmx_cap->ept = 0;
2285                 pr_warn_once("EPT capability reported even though the "
2286                                 "'enable EPT' VM-execution control cannot be set\n");
2287         }
2288         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2289                 vmx_cap->vpid) {
2290                 vmx_cap->vpid = 0;
2291                 pr_warn_once("VPID capability reported even though the "
2292                                 "'enable VPID' VM-execution control cannot be set\n");
2293         }
2294
2295         min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
2296 #ifdef CONFIG_X86_64
2297         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2298 #endif
2299         opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
2300               VM_EXIT_SAVE_IA32_PAT |
2301               VM_EXIT_LOAD_IA32_PAT |
2302               VM_EXIT_LOAD_IA32_EFER |
2303               VM_EXIT_CLEAR_BNDCFGS |
2304               VM_EXIT_PT_CONCEAL_PIP |
2305               VM_EXIT_CLEAR_IA32_RTIT_CTL;
2306         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2307                                 &_vmexit_control) < 0)
2308                 return -EIO;
2309
2310         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2311         opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
2312                  PIN_BASED_VMX_PREEMPTION_TIMER;
2313         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2314                                 &_pin_based_exec_control) < 0)
2315                 return -EIO;
2316
2317         if (cpu_has_broken_vmx_preemption_timer())
2318                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2319         if (!(_cpu_based_2nd_exec_control &
2320                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2321                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2322
2323         min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
2324         opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
2325               VM_ENTRY_LOAD_IA32_PAT |
2326               VM_ENTRY_LOAD_IA32_EFER |
2327               VM_ENTRY_LOAD_BNDCFGS |
2328               VM_ENTRY_PT_CONCEAL_PIP |
2329               VM_ENTRY_LOAD_IA32_RTIT_CTL;
2330         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2331                                 &_vmentry_control) < 0)
2332                 return -EIO;
2333
2334         /*
2335          * Some CPUs support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2336          * can't be used due to an erratum where VM Exit may incorrectly clear
2337          * IA32_PERF_GLOBAL_CTRL[34:32].  Work around the erratum by using the
2338          * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2339          */
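        /*
         * The fallback is the VM-entry/VM-exit MSR-load areas (see
         * add_atomic_switch_msr()), which switch IA32_PERF_GLOBAL_CTRL
         * without relying on these controls.
         */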
2340         if (boot_cpu_data.x86 == 0x6) {
2341                 switch (boot_cpu_data.x86_model) {
2342                 case 26: /* AAK155 */
2343                 case 30: /* AAP115 */
2344                 case 37: /* AAT100 */
2345                 case 44: /* BC86,AAY89,BD102 */
2346                 case 46: /* BA97 */
2347                         _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
2348                         _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2349                         pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2350                                         "does not work properly. Using workaround\n");
2351                         break;
2352                 default:
2353                         break;
2354                 }
2355         }
2356
2357
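        /*
         * MSR_IA32_VMX_BASIC layout (SDM Vol 3, appendix A.1): the low 32
         * bits hold the VMCS revision identifier; in the high word, bits
         * 12:0 give the VMCS region size, bit 16 is set when physical
         * addresses are limited to 32 bits, and bits 21:18 give the
         * required VMCS memory type (6 == write-back).
         */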
2358         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2359
2360         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2361         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2362                 return -EIO;
2363
2364 #ifdef CONFIG_X86_64
2365         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2366         if (vmx_msr_high & (1u<<16))
2367                 return -EIO;
2368 #endif
2369
2370         /* Require Write-Back (WB) memory type for VMCS accesses. */
2371         if (((vmx_msr_high >> 18) & 15) != 6)
2372                 return -EIO;
2373
2374         vmcs_conf->size = vmx_msr_high & 0x1fff;
2375         vmcs_conf->order = get_order(vmcs_conf->size);
2376         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
2377
2378         vmcs_conf->revision_id = vmx_msr_low;
2379
2380         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2381         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2382         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2383         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2384         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2385
2386         if (static_branch_unlikely(&enable_evmcs))
2387                 evmcs_sanitize_exec_ctrls(vmcs_conf);
2388
2389         return 0;
2390 }
2391
2392 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
2393 {
2394         int node = cpu_to_node(cpu);
2395         struct page *pages;
2396         struct vmcs *vmcs;
2397
2398         pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
2399         if (!pages)
2400                 return NULL;
2401         vmcs = page_address(pages);
2402         memset(vmcs, 0, vmcs_config.size);
2403
2404         /* KVM supports Enlightened VMCS v1 only */
2405         if (static_branch_unlikely(&enable_evmcs))
2406                 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2407         else
2408                 vmcs->hdr.revision_id = vmcs_config.revision_id;
2409
2410         if (shadow)
2411                 vmcs->hdr.shadow_vmcs = 1;
2412         return vmcs;
2413 }
2414
2415 void free_vmcs(struct vmcs *vmcs)
2416 {
2417         free_pages((unsigned long)vmcs, vmcs_config.order);
2418 }
2419
2420 /*
2421  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2422  */
2423 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2424 {
2425         if (!loaded_vmcs->vmcs)
2426                 return;
2427         loaded_vmcs_clear(loaded_vmcs);
2428         free_vmcs(loaded_vmcs->vmcs);
2429         loaded_vmcs->vmcs = NULL;
2430         if (loaded_vmcs->msr_bitmap)
2431                 free_page((unsigned long)loaded_vmcs->msr_bitmap);
2432         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2433 }
2434
2435 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2436 {
2437         loaded_vmcs->vmcs = alloc_vmcs(false);
2438         if (!loaded_vmcs->vmcs)
2439                 return -ENOMEM;
2440
2441         loaded_vmcs->shadow_vmcs = NULL;
2442         loaded_vmcs_init(loaded_vmcs);
2443
2444         if (cpu_has_vmx_msr_bitmap()) {
2445                 loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
2446                 if (!loaded_vmcs->msr_bitmap)
2447                         goto out_vmcs;
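                /*
                 * A set bit in the MSR bitmap means "cause a VM exit", so
                 * start with every MSR access intercepted; pass-through
                 * bits are cleared later per vCPU.
                 */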
2448                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2449
2450                 if (IS_ENABLED(CONFIG_HYPERV) &&
2451                     static_branch_unlikely(&enable_evmcs) &&
2452                     (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
2453                         struct hv_enlightened_vmcs *evmcs =
2454                                 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
2455
2456                         evmcs->hv_enlightenments_control.msr_bitmap = 1;
2457                 }
2458         }
2459
2460         memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2461
2462         return 0;
2463
2464 out_vmcs:
2465         free_loaded_vmcs(loaded_vmcs);
2466         return -ENOMEM;
2467 }
2468
2469 static void free_kvm_area(void)
2470 {
2471         int cpu;
2472
2473         for_each_possible_cpu(cpu) {
2474                 free_vmcs(per_cpu(vmxarea, cpu));
2475                 per_cpu(vmxarea, cpu) = NULL;
2476         }
2477 }
2478
2479 static __init int alloc_kvm_area(void)
2480 {
2481         int cpu;
2482
2483         for_each_possible_cpu(cpu) {
2484                 struct vmcs *vmcs;
2485
2486                 vmcs = alloc_vmcs_cpu(false, cpu);
2487                 if (!vmcs) {
2488                         free_kvm_area();
2489                         return -ENOMEM;
2490                 }
2491
2492                 /*
2493                  * When eVMCS is enabled, alloc_vmcs_cpu() sets
2494                  * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2495                  * revision_id reported by MSR_IA32_VMX_BASIC.
2496                  *
2497                  * However, even though not explicitly documented by
2498                  * TLFS, VMXArea passed as VMXON argument should
2499                  * still be marked with revision_id reported by
2500                  * physical CPU.
2501                  */
2502                 if (static_branch_unlikely(&enable_evmcs))
2503                         vmcs->hdr.revision_id = vmcs_config.revision_id;
2504
2505                 per_cpu(vmxarea, cpu) = vmcs;
2506         }
2507         return 0;
2508 }
2509
2510 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2511                 struct kvm_segment *save)
2512 {
2513         if (!emulate_invalid_guest_state) {
2514                 /*
2515                  * CS and SS RPL should be equal during guest entry according
2516                  * to VMX spec, but in reality it is not always so. Since vcpu
2517                  * is in the middle of the transition from real mode to
2518                  * protected mode it is safe to assume that RPL 0 is a good
2519                  * default value.
2520                  */
2521                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2522                         save->selector &= ~SEGMENT_RPL_MASK;
2523                 save->dpl = save->selector & SEGMENT_RPL_MASK;
2524                 save->s = 1;
2525         }
2526         vmx_set_segment(vcpu, save, seg);
2527 }
2528
2529 static void enter_pmode(struct kvm_vcpu *vcpu)
2530 {
2531         unsigned long flags;
2532         struct vcpu_vmx *vmx = to_vmx(vcpu);
2533
2534         /*
2535          * Update the real mode segment cache. It may not be up-to-date if a
2536          * segment register was written while the vcpu was in guest mode.
2537          */
2538         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2539         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2540         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2541         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2542         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2543         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2544
2545         vmx->rmode.vm86_active = 0;
2546
2547         vmx_segment_cache_clear(vmx);
2548
2549         vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2550
2551         flags = vmcs_readl(GUEST_RFLAGS);
2552         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2553         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2554         vmcs_writel(GUEST_RFLAGS, flags);
2555
2556         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
2557                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
2558
2559         update_exception_bitmap(vcpu);
2560
2561         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2562         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2563         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2564         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2565         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2566         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2567 }
2568
2569 static void fix_rmode_seg(int seg, struct kvm_segment *save)
2570 {
2571         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2572         struct kvm_segment var = *save;
2573
2574         var.dpl = 0x3;
2575         if (seg == VCPU_SREG_CS)
2576                 var.type = 0x3;
2577
2578         if (!emulate_invalid_guest_state) {
2579                 var.selector = var.base >> 4;
2580                 var.base = var.base & 0xffff0;
2581                 var.limit = 0xffff;
2582                 var.g = 0;
2583                 var.db = 0;
2584                 var.present = 1;
2585                 var.s = 1;
2586                 var.l = 0;
2587                 var.unusable = 0;
2588                 var.type = 0x3;
2589                 var.avl = 0;
2590                 if (save->base & 0xf)
2591                         printk_once(KERN_WARNING "kvm: segment base is not "
2592                                         "paragraph aligned when entering "
2593                                         "protected mode (seg=%d)", seg);
2594         }
2595
2596         vmcs_write16(sf->selector, var.selector);
2597         vmcs_writel(sf->base, var.base);
2598         vmcs_write32(sf->limit, var.limit);
2599         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
2600 }
2601
2602 static void enter_rmode(struct kvm_vcpu *vcpu)
2603 {
2604         unsigned long flags;
2605         struct vcpu_vmx *vmx = to_vmx(vcpu);
2606         struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
2607
2608         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2609         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2610         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2611         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2612         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2613         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2614         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2615
2616         vmx->rmode.vm86_active = 1;
2617
2618         /*
2619          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2620          * vcpu. Warn the user that an update is overdue.
2621          */
2622         if (!kvm_vmx->tss_addr)
2623                 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
2624                              "called before entering vcpu\n");
2625
2626         vmx_segment_cache_clear(vmx);
2627
2628         vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
2629         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2630         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2631
2632         flags = vmcs_readl(GUEST_RFLAGS);
2633         vmx->rmode.save_rflags = flags;
2634
2635         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2636
2637         vmcs_writel(GUEST_RFLAGS, flags);
2638         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
2639         update_exception_bitmap(vcpu);
2640
2641         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2642         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2643         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2644         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2645         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2646         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2647
2648         kvm_mmu_reset_context(vcpu);
2649 }
2650
2651 void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
2652 {
2653         struct vcpu_vmx *vmx = to_vmx(vcpu);
2654         struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
2655
2656         if (!msr)
2657                 return;
2658
2659         vcpu->arch.efer = efer;
2660         if (efer & EFER_LMA) {
2661                 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2662                 msr->data = efer;
2663         } else {
2664                 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2665
2666                 msr->data = efer & ~EFER_LME;
2667         }
2668         setup_msrs(vmx);
2669 }
2670
2671 #ifdef CONFIG_X86_64
2672
2673 static void enter_lmode(struct kvm_vcpu *vcpu)
2674 {
2675         u32 guest_tr_ar;
2676
2677         vmx_segment_cache_clear(to_vmx(vcpu));
2678
2679         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
2680         if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
2681                 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
2682                                      __func__);
2683                 vmcs_write32(GUEST_TR_AR_BYTES,
2684                              (guest_tr_ar & ~VMX_AR_TYPE_MASK)
2685                              | VMX_AR_TYPE_BUSY_64_TSS);
2686         }
2687         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
2688 }
2689
2690 static void exit_lmode(struct kvm_vcpu *vcpu)
2691 {
2692         vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2693         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
2694 }
2695
2696 #endif
2697
2698 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
2699 {
2700         int vpid = to_vmx(vcpu)->vpid;
2701
2702         if (!vpid_sync_vcpu_addr(vpid, addr))
2703                 vpid_sync_context(vpid);
2704
2705         /*
2706          * If VPIDs are not supported or enabled, then the above is a no-op.
2707          * But we don't really need a TLB flush in that case anyway, because
2708          * each VM entry/exit includes an implicit flush when VPID is 0.
2709          */
2710 }
2711
2712 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
2713 {
2714         ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2715
2716         vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
2717         vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
2718 }
2719
2720 static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
2721 {
2722         if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
2723                 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2724         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
2725 }
2726
2727 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
2728 {
2729         ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2730
2731         vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
2732         vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
2733 }
2734
2735 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
2736 {
2737         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
2738
2739         if (!test_bit(VCPU_EXREG_PDPTR,
2740                       (unsigned long *)&vcpu->arch.regs_dirty))
2741                 return;
2742
2743         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
2744                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
2745                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
2746                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
2747                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
2748         }
2749 }
2750
2751 void ept_save_pdptrs(struct kvm_vcpu *vcpu)
2752 {
2753         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
2754
2755         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
2756                 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
2757                 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
2758                 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
2759                 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
2760         }
2761
2762         __set_bit(VCPU_EXREG_PDPTR,
2763                   (unsigned long *)&vcpu->arch.regs_avail);
2764         __set_bit(VCPU_EXREG_PDPTR,
2765                   (unsigned long *)&vcpu->arch.regs_dirty);
2766 }
2767
2768 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
2769                                         unsigned long cr0,
2770                                         struct kvm_vcpu *vcpu)
2771 {
2772         if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
2773                 vmx_decache_cr3(vcpu);
2774         if (!(cr0 & X86_CR0_PG)) {
2775                 /* From paging/starting to nonpaging */
2776                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2777                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
2778                              (CPU_BASED_CR3_LOAD_EXITING |
2779                               CPU_BASED_CR3_STORE_EXITING));
2780                 vcpu->arch.cr0 = cr0;
2781                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
2782         } else if (!is_paging(vcpu)) {
2783                 /* From nonpaging to paging */
2784                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2785                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
2786                              ~(CPU_BASED_CR3_LOAD_EXITING |
2787                                CPU_BASED_CR3_STORE_EXITING));
2788                 vcpu->arch.cr0 = cr0;
2789                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
2790         }
2791
2792         if (!(cr0 & X86_CR0_WP))
2793                 *hw_cr0 &= ~X86_CR0_WP;
2794 }
2795
2796 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
2797 {
2798         struct vcpu_vmx *vmx = to_vmx(vcpu);
2799         unsigned long hw_cr0;
2800
2801         hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
2802         if (enable_unrestricted_guest)
2803                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
2804         else {
2805                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
2806
2807                 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
2808                         enter_pmode(vcpu);
2809
2810                 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
2811                         enter_rmode(vcpu);
2812         }
2813
2814 #ifdef CONFIG_X86_64
2815         if (vcpu->arch.efer & EFER_LME) {
2816                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
2817                         enter_lmode(vcpu);
2818                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
2819                         exit_lmode(vcpu);
2820         }
2821 #endif
2822
2823         if (enable_ept && !enable_unrestricted_guest)
2824                 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
2825
2826         vmcs_writel(CR0_READ_SHADOW, cr0);
2827         vmcs_writel(GUEST_CR0, hw_cr0);
2828         vcpu->arch.cr0 = cr0;
2829
2830         /* depends on vcpu->arch.cr0 being set to the new value */
2831         vmx->emulation_required = emulation_required(vcpu);
2832 }
2833
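/*
 * Use a 5-level EPT walk only when the CPU supports it and the
 * guest's physical address width exceeds 48 bits; otherwise 4 levels
 * are sufficient.
 */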
2834 static int get_ept_level(struct kvm_vcpu *vcpu)
2835 {
2836         if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
2837                 return 5;
2838         return 4;
2839 }
2840
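/*
 * Build the EPT pointer for the given root: write-back memory type,
 * a page-walk length matching the EPT level, accessed/dirty bits when
 * enabled (and permitted by the nested EPT configuration), plus the
 * page-aligned root HPA.
 */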
2841 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
2842 {
2843         u64 eptp = VMX_EPTP_MT_WB;
2844
2845         eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
2846
2847         if (enable_ept_ad_bits &&
2848             (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
2849                 eptp |= VMX_EPTP_AD_ENABLE_BIT;
2850         eptp |= (root_hpa & PAGE_MASK);
2851
2852         return eptp;
2853 }
2854
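/*
 * Load a new guest CR3.  With EPT this also installs the EPT pointer
 * (and records it for the Hyper-V remote TLB flush logic).  A
 * non-paging guest without unrestricted guest support runs with
 * GUEST_CR3 pointing at the EPT identity map instead.
 */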
2855 void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
2856 {
2857         struct kvm *kvm = vcpu->kvm;
2858         unsigned long guest_cr3;
2859         u64 eptp;
2860
2861         guest_cr3 = cr3;
2862         if (enable_ept) {
2863                 eptp = construct_eptp(vcpu, cr3);
2864                 vmcs_write64(EPT_POINTER, eptp);
2865
2866                 if (kvm_x86_ops->tlb_remote_flush) {
2867                         spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
2868                         to_vmx(vcpu)->ept_pointer = eptp;
2869                         to_kvm_vmx(kvm)->ept_pointers_match
2870                                 = EPT_POINTERS_CHECK;
2871                         spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
2872                 }
2873
2874                 if (enable_unrestricted_guest || is_paging(vcpu) ||
2875                     is_guest_mode(vcpu))
2876                         guest_cr3 = kvm_read_cr3(vcpu);
2877                 else
2878                         guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
2879                 ept_load_pdptrs(vcpu);
2880         }
2881
2882         vmcs_writel(GUEST_CR3, guest_cr3);
2883 }
2884
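/*
 * Propagate a guest CR4 write.  Returns 1 (so the caller injects #GP)
 * if the guest sets CR4.VMXE while nested VMX is not allowed or while
 * in SMM, or if the new value is invalid while vmxon is active;
 * returns 0 on success.
 */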
2885 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2886 {
2887         /*
2888          * Pass through host's Machine Check Enable value to hw_cr4, which
2889          * is in force while we are in guest mode.  Do not let guests control
2890          * this bit, even if host CR4.MCE == 0.
2891          */
2892         unsigned long hw_cr4;
2893
2894         hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
2895         if (enable_unrestricted_guest)
2896                 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
2897         else if (to_vmx(vcpu)->rmode.vm86_active)
2898                 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
2899         else
2900                 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
2901
2902         if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
2903                 if (cr4 & X86_CR4_UMIP) {
2904                         vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
2905                                 SECONDARY_EXEC_DESC);
2906                         hw_cr4 &= ~X86_CR4_UMIP;
2907                 } else if (!is_guest_mode(vcpu) ||
2908                         !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
2909                         vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
2910                                         SECONDARY_EXEC_DESC);
2911         }
2912
2913         if (cr4 & X86_CR4_VMXE) {
2914                 /*
2915                  * To use VMXON (and later other VMX instructions), a guest
2916                  * So the check on whether to allow nested VMX is done here.
2917                  * We operate under the default treatment of SMM, so VMX
2918                  * cannot be enabled under SMM.
2919                  * so VMX cannot be enabled under SMM.
2920                  */
2921                 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
2922                         return 1;
2923         }
2924
2925         if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
2926                 return 1;
2927
2928         vcpu->arch.cr4 = cr4;
2929
2930         if (!enable_unrestricted_guest) {
2931                 if (enable_ept) {
2932                         if (!is_paging(vcpu)) {
2933                                 hw_cr4 &= ~X86_CR4_PAE;
2934                                 hw_cr4 |= X86_CR4_PSE;
2935                         } else if (!(cr4 & X86_CR4_PAE)) {
2936                                 hw_cr4 &= ~X86_CR4_PAE;
2937                         }
2938                 }
2939
2940                 /*
2941                  * SMEP/SMAP/PKU are disabled by hardware when the CPU is in
2942                  * non-paging mode.  To emulate this behavior, SMEP/SMAP/PKU
2943                  * need to be manually disabled when the guest switches to
2944                  * non-paging mode.
2945                  *
2946                  * If !enable_unrestricted_guest, the CPU is always running
2947                  * with CR0.PG=1 and CR4 needs to be modified.
2948                  * If enable_unrestricted_guest, the CPU automatically
2949                  * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
2950                  */
2951                 if (!is_paging(vcpu))
2952                         hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
2953         }
2954
2955         vmcs_writel(CR4_READ_SHADOW, cr4);
2956         vmcs_writel(GUEST_CR4, hw_cr4);
2957         return 0;
2958 }
2959
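/*
 * Read a guest segment register.  In emulated real mode the cached
 * soft segment state is returned for everything except LDTR;
 * otherwise the fields are decoded from the VMCS via the segment
 * cache.
 */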
2960 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
2961 {
2962         struct vcpu_vmx *vmx = to_vmx(vcpu);
2963         u32 ar;
2964
2965         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
2966                 *var = vmx->rmode.segs[seg];
2967                 if (seg == VCPU_SREG_TR
2968                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
2969                         return;
2970                 var->base = vmx_read_guest_seg_base(vmx, seg);
2971                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
2972                 return;
2973         }
2974         var->base = vmx_read_guest_seg_base(vmx, seg);
2975         var->limit = vmx_read_guest_seg_limit(vmx, seg);
2976         var->selector = vmx_read_guest_seg_selector(vmx, seg);
2977         ar = vmx_read_guest_seg_ar(vmx, seg);
2978         var->unusable = (ar >> 16) & 1;
2979         var->type = ar & 15;
2980         var->s = (ar >> 4) & 1;
2981         var->dpl = (ar >> 5) & 3;
2982         /*
2983          * Some userspaces do not preserve the unusable property.  Since a
2984          * usable segment has to be present according to the VMX spec, we
2985          * can use the present property to work around that userspace bug
2986          * by making an unusable segment always nonpresent; note that
2987          * vmx_segment_access_rights() marks a nonpresent segment as unusable.
2988          */
2989         var->present = !var->unusable;
2990         var->avl = (ar >> 12) & 1;
2991         var->l = (ar >> 13) & 1;
2992         var->db = (ar >> 14) & 1;
2993         var->g = (ar >> 15) & 1;
2994 }
2995
2996 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2997 {
2998         struct kvm_segment s;
2999
3000         if (to_vmx(vcpu)->rmode.vm86_active) {
3001                 vmx_get_segment(vcpu, &s, seg);
3002                 return s.base;
3003         }
3004         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3005 }
3006
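/*
 * The guest runs at CPL 0 while in emulated real mode; otherwise the
 * CPL is the DPL of the SS access rights, as architecturally defined.
 */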
3007 int vmx_get_cpl(struct kvm_vcpu *vcpu)
3008 {
3009         struct vcpu_vmx *vmx = to_vmx(vcpu);
3010
3011         if (unlikely(vmx->rmode.vm86_active))
3012                 return 0;
3013         else {
3014                 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3015                 return VMX_AR_DPL(ar);
3016         }
3017 }
3018
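/*
 * Pack a kvm_segment into the VMCS access-rights format.  An unusable
 * or non-present segment is encoded with only the "unusable" bit set.
 */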
3019 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3020 {
3021         u32 ar;
3022
3023         if (var->unusable || !var->present)
3024                 ar = 1 << 16;
3025         else {
3026                 ar = var->type & 15;
3027                 ar |= (var->s & 1) << 4;
3028                 ar |= (var->dpl & 3) << 5;
3029                 ar |= (var->present & 1) << 7;
3030                 ar |= (var->avl & 1) << 12;
3031                 ar |= (var->l & 1) << 13;
3032                 ar |= (var->db & 1) << 14;
3033                 ar |= (var->g & 1) << 15;
3034         }
3035
3036         return ar;
3037 }
3038
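/*
 * Write a guest segment register.  In emulated real mode the segment
 * is kept in software (and fixed up for vm86 semantics) for
 * everything except LDTR; otherwise base, limit, selector and access
 * rights are written to the VMCS.
 */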
3039 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3040 {
3041         struct vcpu_vmx *vmx = to_vmx(vcpu);
3042         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3043
3044         vmx_segment_cache_clear(vmx);
3045
3046         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3047                 vmx->rmode.segs[seg] = *var;
3048                 if (seg == VCPU_SREG_TR)
3049                         vmcs_write16(sf->selector, var->selector);
3050                 else if (var->s)
3051                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3052                 goto out;
3053         }
3054
3055         vmcs_writel(sf->base, var->base);
3056         vmcs_write32(sf->limit, var->limit);
3057         vmcs_write16(sf->selector, var->selector);
3058
3059         /*
3060          * Fix the "Accessed" bit in the AR field of segment registers for
3061          * older qemu binaries.
3062          * The IA-32 architecture specifies that at the time of processor
3063          * reset the "Accessed" bit in the AR field of segment registers is
3064          * 1, but qemu sets it to 0 in its userland code.  This causes an
3065          * invalid guest state vmexit when "unrestricted guest" mode is
3066          * turned on.
3067          * A fix for this cpu_reset issue has been pushed to the qemu tree;
3068          * newer qemu binaries with that fix do not need this kvm hack.
3069          */
3070         if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
3071                 var->type |= 0x1; /* Accessed */
3072
3073         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3074
3075 out:
3076         vmx->emulation_required = emulation_required(vcpu);
3077 }
3078
3079 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3080 {
3081         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3082
3083         *db = (ar >> 14) & 1;
3084         *l = (ar >> 13) & 1;
3085 }
3086
3087 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3088 {
3089         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3090         dt->address = vmcs_readl(GUEST_IDTR_BASE);
3091 }
3092
3093 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3094 {
3095         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3096         vmcs_writel(GUEST_IDTR_BASE, dt->address);
3097 }
3098
3099 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3100 {
3101         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3102         dt->address = vmcs_readl(GUEST_GDTR_BASE);
3103 }
3104
3105 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3106 {
3107         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3108         vmcs_writel(GUEST_GDTR_BASE, dt->address);
3109 }
3110
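/*
 * A segment is usable in vm86 mode only if it looks like a real-mode
 * segment: base == selector << 4, a 64KiB limit, and access rights
 * that reduce to 0xf3 (present, DPL 3, S=1, type 3).
 */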
3111 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3112 {
3113         struct kvm_segment var;
3114         u32 ar;
3115
3116         vmx_get_segment(vcpu, &var, seg);
3117         var.dpl = 0x3;
3118         if (seg == VCPU_SREG_CS)
3119                 var.type = 0x3;
3120         ar = vmx_segment_access_rights(&var);
3121
3122         if (var.base != (var.selector << 4))
3123                 return false;
3124         if (var.limit != 0xffff)
3125                 return false;
3126         if (ar != 0xf3)
3127                 return false;
3128
3129         return true;
3130 }
3131
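/*
 * The checks below mirror the guest segment register checks performed
 * by the CPU at VM entry; guest_state_valid() uses them to decide
 * whether the guest can run natively or must be emulated instead
 * (see emulation_required()).
 */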
3132 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3133 {
3134         struct kvm_segment cs;
3135         unsigned int cs_rpl;
3136
3137         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3138         cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3139
3140         if (cs.unusable)
3141                 return false;
3142         if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3143                 return false;
3144         if (!cs.s)
3145                 return false;
3146         if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3147                 if (cs.dpl > cs_rpl)
3148                         return false;
3149         } else {
3150                 if (cs.dpl != cs_rpl)
3151                         return false;
3152         }
3153         if (!cs.present)
3154                 return false;
3155
3156         /* TODO: Add Reserved field check; this will require a new member in the kvm_segment_field structure */
3157         return true;
3158 }
3159
3160 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3161 {
3162         struct kvm_segment ss;
3163         unsigned int ss_rpl;
3164
3165         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3166         ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3167
3168         if (ss.unusable)
3169                 return true;
3170         if (ss.type != 3 && ss.type != 7)
3171                 return false;
3172         if (!ss.s)
3173                 return false;
3174         if (ss.dpl != ss_rpl) /* DPL != RPL */
3175                 return false;
3176         if (!ss.present)
3177                 return false;
3178
3179         return true;
3180 }
3181
3182 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3183 {
3184         struct kvm_segment var;
3185         unsigned int rpl;
3186
3187         vmx_get_segment(vcpu, &var, seg);
3188         rpl = var.selector & SEGMENT_RPL_MASK;
3189
3190         if (var.unusable)
3191                 return true;
3192         if (!var.s)
3193                 return false;
3194         if (!var.present)
3195                 return false;
3196         if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3197                 if (var.dpl < rpl) /* DPL < RPL */
3198                         return false;
3199         }
3200
3201         /* TODO: Add other members to kvm_segment_field to allow checking
3202          * for other access rights flags.
3203          */
3204         return true;
3205 }
3206
3207 static bool tr_valid(struct kvm_vcpu *vcpu)
3208 {
3209         struct kvm_segment tr;
3210
3211         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3212
3213         if (tr.unusable)
3214                 return false;
3215         if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
3216                 return false;
3217         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3218                 return false;
3219         if (!tr.present)
3220                 return false;
3221
3222         return true;
3223 }
3224
3225 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3226 {
3227         struct kvm_segment ldtr;
3228
3229         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3230
3231         if (ldtr.unusable)
3232                 return true;
3233         if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
3234                 return false;
3235         if (ldtr.type != 2)
3236                 return false;
3237         if (!ldtr.present)
3238                 return false;
3239
3240         return true;
3241 }
3242
3243 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3244 {
3245         struct kvm_segment cs, ss;
3246
3247         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3248         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3249
3250         return ((cs.selector & SEGMENT_RPL_MASK) ==
3251                  (ss.selector & SEGMENT_RPL_MASK));
3252 }
3253
3254 /*
3255  * Check if guest state is valid: returns true if valid, false if not.
3256  *
3257  * We assume that registers are always usable.
3258  */
3259 static bool guest_state_valid(struct kvm_vcpu *vcpu)
3260 {
3261         if (enable_unrestricted_guest)
3262