arch/x86/kvm/vmx.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *
14  * This work is licensed under the terms of the GNU GPL, version 2.  See
15  * the COPYING file in the top-level directory.
16  *
17  */
18
19 #include "irq.h"
20 #include "mmu.h"
21 #include "cpuid.h"
22 #include "lapic.h"
23
24 #include <linux/kvm_host.h>
25 #include <linux/module.h>
26 #include <linux/kernel.h>
27 #include <linux/mm.h>
28 #include <linux/highmem.h>
29 #include <linux/sched.h>
30 #include <linux/moduleparam.h>
31 #include <linux/mod_devicetable.h>
32 #include <linux/trace_events.h>
33 #include <linux/slab.h>
34 #include <linux/tboot.h>
35 #include <linux/hrtimer.h>
36 #include <linux/frame.h>
37 #include "kvm_cache_regs.h"
38 #include "x86.h"
39
40 #include <asm/cpu.h>
41 #include <asm/io.h>
42 #include <asm/desc.h>
43 #include <asm/vmx.h>
44 #include <asm/virtext.h>
45 #include <asm/mce.h>
46 #include <asm/fpu/internal.h>
47 #include <asm/perf_event.h>
48 #include <asm/debugreg.h>
49 #include <asm/kexec.h>
50 #include <asm/apic.h>
51 #include <asm/irq_remapping.h>
52 #include <asm/mmu_context.h>
53 #include <asm/nospec-branch.h>
54
55 #include "trace.h"
56 #include "pmu.h"
57
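/*
 * A note on the wrappers below (based on KVM's generic fault-on-reboot
 * helpers, which are defined outside this file): each one wraps a single
 * VMX instruction with an exception fixup so that a fault taken after VMX
 * has already been disabled (e.g. during reboot or kexec) is swallowed
 * instead of crashing the host.  __ex_clear() additionally zeroes the named
 * register in the fixup path, so a faulting VMREAD, for example, reads back
 * as 0.
 */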
58 #define __ex(x) __kvm_handle_fault_on_reboot(x)
59 #define __ex_clear(x, reg) \
60         ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
61
62 MODULE_AUTHOR("Qumranet");
63 MODULE_LICENSE("GPL");
64
65 static const struct x86_cpu_id vmx_cpu_id[] = {
66         X86_FEATURE_MATCH(X86_FEATURE_VMX),
67         {}
68 };
69 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
70
71 static bool __read_mostly enable_vpid = 1;
72 module_param_named(vpid, enable_vpid, bool, 0444);
73
74 static bool __read_mostly enable_vnmi = 1;
75 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
76
77 static bool __read_mostly flexpriority_enabled = 1;
78 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
79
80 static bool __read_mostly enable_ept = 1;
81 module_param_named(ept, enable_ept, bool, S_IRUGO);
82
83 static bool __read_mostly enable_unrestricted_guest = 1;
84 module_param_named(unrestricted_guest,
85                         enable_unrestricted_guest, bool, S_IRUGO);
86
87 static bool __read_mostly enable_ept_ad_bits = 1;
88 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
89
90 static bool __read_mostly emulate_invalid_guest_state = true;
91 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
92
93 static bool __read_mostly fasteoi = 1;
94 module_param(fasteoi, bool, S_IRUGO);
95
96 static bool __read_mostly enable_apicv = 1;
97 module_param(enable_apicv, bool, S_IRUGO);
98
99 static bool __read_mostly enable_shadow_vmcs = 1;
100 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
101 /*
102  * If nested=1, nested virtualization is supported, i.e., guests may use
103  * VMX and act as hypervisors for their own guests. If nested=0, guests may
104  * not use VMX instructions.
105  */
106 static bool __read_mostly nested = 0;
107 module_param(nested, bool, S_IRUGO);
108
109 static u64 __read_mostly host_xss;
110
111 static bool __read_mostly enable_pml = 1;
112 module_param_named(pml, enable_pml, bool, S_IRUGO);
113
114 #define MSR_TYPE_R      1
115 #define MSR_TYPE_W      2
116 #define MSR_TYPE_RW     3
117
118 #define MSR_BITMAP_MODE_X2APIC          1
119 #define MSR_BITMAP_MODE_X2APIC_APICV    2
120 #define MSR_BITMAP_MODE_LM              4
121
122 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
123
124 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
125 static int __read_mostly cpu_preemption_timer_multi;
126 static bool __read_mostly enable_preemption_timer = 1;
127 #ifdef CONFIG_X86_64
128 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
129 #endif
130
131 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
132 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
133 #define KVM_VM_CR0_ALWAYS_ON                                            \
134         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
135 #define KVM_CR4_GUEST_OWNED_BITS                                      \
136         (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
137          | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
138
139 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
140 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
141
142 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
143
144 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
145
146 /*
147  * Hyper-V requires all of these, so mark them as supported even though
148  * they are just treated the same as all-context.
149  */
150 #define VMX_VPID_EXTENT_SUPPORTED_MASK          \
151         (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
152         VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
153         VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
154         VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
155
156 /*
157  * These two parameters configure the controls for Pause-Loop Exiting:
158  * ple_gap:    upper bound on the amount of time between two successive
159  *             executions of PAUSE in a loop. Also indicates whether PLE is
160  *             enabled. Tests show this time is usually smaller than 128 cycles.
161  * ple_window: upper bound on the amount of time a guest is allowed to execute
162  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
163  *             less than 2^12 cycles.
164  * Time is measured on a counter that runs at the same rate as the TSC;
165  * refer to SDM volume 3B, sections 21.6.13 and 22.1.3.
166  */
167 #define KVM_VMX_DEFAULT_PLE_GAP           128
168 #define KVM_VMX_DEFAULT_PLE_WINDOW        4096
169 #define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
170 #define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
171 #define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
172                 INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
173
174 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
175 module_param(ple_gap, int, S_IRUGO);
176
177 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
178 module_param(ple_window, int, S_IRUGO);
179
180 /* Default doubles per-vcpu window every exit. */
181 static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
182 module_param(ple_window_grow, int, S_IRUGO);
183
184 /* Default resets per-vcpu window every exit to ple_window. */
185 static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
186 module_param(ple_window_shrink, int, S_IRUGO);
187
188 /* Default is to compute the maximum so we can never overflow. */
189 static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
190 static int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
191 module_param(ple_window_max, int, S_IRUGO);
192
193 extern const ulong vmx_return;
194
195 #define NR_AUTOLOAD_MSRS 8
196
197 struct vmcs {
198         u32 revision_id;
199         u32 abort;
200         char data[0];
201 };
202
203 /*
204  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
205  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
206  * loaded on this CPU (so we can clear them if the CPU goes down).
207  */
208 struct loaded_vmcs {
209         struct vmcs *vmcs;
210         struct vmcs *shadow_vmcs;
211         int cpu;
212         bool launched;
213         bool nmi_known_unmasked;
214         unsigned long vmcs_host_cr3;    /* May not match real cr3 */
215         unsigned long vmcs_host_cr4;    /* May not match real cr4 */
216         /* Support for vnmi-less CPUs */
217         int soft_vnmi_blocked;
218         ktime_t entry_time;
219         s64 vnmi_blocked_time;
220         unsigned long *msr_bitmap;
221         struct list_head loaded_vmcss_on_cpu_link;
222 };
223
224 struct shared_msr_entry {
225         unsigned index;
226         u64 data;
227         u64 mask;
228 };
229
230 /*
231  * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
232  * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
233  * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
234  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
235  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
236  * More than one of these structures may exist, if L1 runs multiple L2 guests.
237  * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
238  * underlying hardware which will be used to run L2.
239  * This structure is packed to ensure that its layout is identical across
240  * machines (necessary for live migration).
241  * If there are changes in this struct, VMCS12_REVISION must be changed.
242  */
243 typedef u64 natural_width;
244 struct __packed vmcs12 {
245         /* According to the Intel spec, a VMCS region must start with the
246          * following two fields. Then follow implementation-specific data.
247          */
248         u32 revision_id;
249         u32 abort;
250
251         u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
252         u32 padding[7]; /* room for future expansion */
253
254         u64 io_bitmap_a;
255         u64 io_bitmap_b;
256         u64 msr_bitmap;
257         u64 vm_exit_msr_store_addr;
258         u64 vm_exit_msr_load_addr;
259         u64 vm_entry_msr_load_addr;
260         u64 tsc_offset;
261         u64 virtual_apic_page_addr;
262         u64 apic_access_addr;
263         u64 posted_intr_desc_addr;
264         u64 vm_function_control;
265         u64 ept_pointer;
266         u64 eoi_exit_bitmap0;
267         u64 eoi_exit_bitmap1;
268         u64 eoi_exit_bitmap2;
269         u64 eoi_exit_bitmap3;
270         u64 eptp_list_address;
271         u64 xss_exit_bitmap;
272         u64 guest_physical_address;
273         u64 vmcs_link_pointer;
274         u64 pml_address;
275         u64 guest_ia32_debugctl;
276         u64 guest_ia32_pat;
277         u64 guest_ia32_efer;
278         u64 guest_ia32_perf_global_ctrl;
279         u64 guest_pdptr0;
280         u64 guest_pdptr1;
281         u64 guest_pdptr2;
282         u64 guest_pdptr3;
283         u64 guest_bndcfgs;
284         u64 host_ia32_pat;
285         u64 host_ia32_efer;
286         u64 host_ia32_perf_global_ctrl;
287         u64 padding64[8]; /* room for future expansion */
288         /*
289          * To allow migration of L1 (complete with its L2 guests) between
290          * machines of different natural widths (32 or 64 bit), we cannot have
291          * unsigned long fields with no explicit size. We use u64 (aliased
292          * natural_width) instead. Luckily, x86 is little-endian.
293          */
294         natural_width cr0_guest_host_mask;
295         natural_width cr4_guest_host_mask;
296         natural_width cr0_read_shadow;
297         natural_width cr4_read_shadow;
298         natural_width cr3_target_value0;
299         natural_width cr3_target_value1;
300         natural_width cr3_target_value2;
301         natural_width cr3_target_value3;
302         natural_width exit_qualification;
303         natural_width guest_linear_address;
304         natural_width guest_cr0;
305         natural_width guest_cr3;
306         natural_width guest_cr4;
307         natural_width guest_es_base;
308         natural_width guest_cs_base;
309         natural_width guest_ss_base;
310         natural_width guest_ds_base;
311         natural_width guest_fs_base;
312         natural_width guest_gs_base;
313         natural_width guest_ldtr_base;
314         natural_width guest_tr_base;
315         natural_width guest_gdtr_base;
316         natural_width guest_idtr_base;
317         natural_width guest_dr7;
318         natural_width guest_rsp;
319         natural_width guest_rip;
320         natural_width guest_rflags;
321         natural_width guest_pending_dbg_exceptions;
322         natural_width guest_sysenter_esp;
323         natural_width guest_sysenter_eip;
324         natural_width host_cr0;
325         natural_width host_cr3;
326         natural_width host_cr4;
327         natural_width host_fs_base;
328         natural_width host_gs_base;
329         natural_width host_tr_base;
330         natural_width host_gdtr_base;
331         natural_width host_idtr_base;
332         natural_width host_ia32_sysenter_esp;
333         natural_width host_ia32_sysenter_eip;
334         natural_width host_rsp;
335         natural_width host_rip;
336         natural_width paddingl[8]; /* room for future expansion */
337         u32 pin_based_vm_exec_control;
338         u32 cpu_based_vm_exec_control;
339         u32 exception_bitmap;
340         u32 page_fault_error_code_mask;
341         u32 page_fault_error_code_match;
342         u32 cr3_target_count;
343         u32 vm_exit_controls;
344         u32 vm_exit_msr_store_count;
345         u32 vm_exit_msr_load_count;
346         u32 vm_entry_controls;
347         u32 vm_entry_msr_load_count;
348         u32 vm_entry_intr_info_field;
349         u32 vm_entry_exception_error_code;
350         u32 vm_entry_instruction_len;
351         u32 tpr_threshold;
352         u32 secondary_vm_exec_control;
353         u32 vm_instruction_error;
354         u32 vm_exit_reason;
355         u32 vm_exit_intr_info;
356         u32 vm_exit_intr_error_code;
357         u32 idt_vectoring_info_field;
358         u32 idt_vectoring_error_code;
359         u32 vm_exit_instruction_len;
360         u32 vmx_instruction_info;
361         u32 guest_es_limit;
362         u32 guest_cs_limit;
363         u32 guest_ss_limit;
364         u32 guest_ds_limit;
365         u32 guest_fs_limit;
366         u32 guest_gs_limit;
367         u32 guest_ldtr_limit;
368         u32 guest_tr_limit;
369         u32 guest_gdtr_limit;
370         u32 guest_idtr_limit;
371         u32 guest_es_ar_bytes;
372         u32 guest_cs_ar_bytes;
373         u32 guest_ss_ar_bytes;
374         u32 guest_ds_ar_bytes;
375         u32 guest_fs_ar_bytes;
376         u32 guest_gs_ar_bytes;
377         u32 guest_ldtr_ar_bytes;
378         u32 guest_tr_ar_bytes;
379         u32 guest_interruptibility_info;
380         u32 guest_activity_state;
381         u32 guest_sysenter_cs;
382         u32 host_ia32_sysenter_cs;
383         u32 vmx_preemption_timer_value;
384         u32 padding32[7]; /* room for future expansion */
385         u16 virtual_processor_id;
386         u16 posted_intr_nv;
387         u16 guest_es_selector;
388         u16 guest_cs_selector;
389         u16 guest_ss_selector;
390         u16 guest_ds_selector;
391         u16 guest_fs_selector;
392         u16 guest_gs_selector;
393         u16 guest_ldtr_selector;
394         u16 guest_tr_selector;
395         u16 guest_intr_status;
396         u16 guest_pml_index;
397         u16 host_es_selector;
398         u16 host_cs_selector;
399         u16 host_ss_selector;
400         u16 host_ds_selector;
401         u16 host_fs_selector;
402         u16 host_gs_selector;
403         u16 host_tr_selector;
404 };
405
406 /*
407  * VMCS12_REVISION is an arbitrary id that should be changed if the content or
408  * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
409  * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
410  */
411 #define VMCS12_REVISION 0x11e57ed0
412
413 /*
414  * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
415  * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used
416  * by the current implementation, 4K is reserved to avoid future complications.
417  */
418 #define VMCS12_SIZE 0x1000
419
420 /*
421  * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
422  * supported VMCS12 field encoding.
423  */
424 #define VMCS12_MAX_FIELD_INDEX 0x17
425
426 /*
427  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
428  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
429  */
430 struct nested_vmx {
431         /* Has the L1 guest executed VMXON? */
432         bool vmxon;
433         gpa_t vmxon_ptr;
434         bool pml_full;
435
436         /* The guest-physical address of the current VMCS L1 keeps for L2 */
437         gpa_t current_vmptr;
438         /*
439          * Cache of the guest's VMCS, existing outside of guest memory.
440          * Loaded from guest memory during VMPTRLD. Flushed to guest
441          * memory during VMCLEAR and VMPTRLD.
442          */
443         struct vmcs12 *cached_vmcs12;
444         /*
445          * Indicates whether the shadow vmcs must be updated with the
446          * data held by vmcs12.
447          */
448         bool sync_shadow_vmcs;
449         bool dirty_vmcs12;
450
451         bool change_vmcs01_virtual_x2apic_mode;
452         /* L2 must run next, and mustn't decide to exit to L1. */
453         bool nested_run_pending;
454
455         struct loaded_vmcs vmcs02;
456
457         /*
458          * Guest pages referred to in the vmcs02 with host-physical
459          * pointers, so we must keep them pinned while L2 runs.
460          */
461         struct page *apic_access_page;
462         struct page *virtual_apic_page;
463         struct page *pi_desc_page;
464         struct pi_desc *pi_desc;
465         bool pi_pending;
466         u16 posted_intr_nv;
467
468         struct hrtimer preemption_timer;
469         bool preemption_timer_expired;
470
471         /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
472         u64 vmcs01_debugctl;
473
474         u16 vpid02;
475         u16 last_vpid;
476
477         /*
478          * We only store the "true" versions of the VMX capability MSRs. We
479          * generate the "non-true" versions by setting the must-be-1 bits
480          * according to the SDM.
481          */
482         u32 nested_vmx_procbased_ctls_low;
483         u32 nested_vmx_procbased_ctls_high;
484         u32 nested_vmx_secondary_ctls_low;
485         u32 nested_vmx_secondary_ctls_high;
486         u32 nested_vmx_pinbased_ctls_low;
487         u32 nested_vmx_pinbased_ctls_high;
488         u32 nested_vmx_exit_ctls_low;
489         u32 nested_vmx_exit_ctls_high;
490         u32 nested_vmx_entry_ctls_low;
491         u32 nested_vmx_entry_ctls_high;
492         u32 nested_vmx_misc_low;
493         u32 nested_vmx_misc_high;
494         u32 nested_vmx_ept_caps;
495         u32 nested_vmx_vpid_caps;
496         u64 nested_vmx_basic;
497         u64 nested_vmx_cr0_fixed0;
498         u64 nested_vmx_cr0_fixed1;
499         u64 nested_vmx_cr4_fixed0;
500         u64 nested_vmx_cr4_fixed1;
501         u64 nested_vmx_vmcs_enum;
502         u64 nested_vmx_vmfunc_controls;
503
504         /* SMM related state */
505         struct {
506                 /* in VMX operation on SMM entry? */
507                 bool vmxon;
508                 /* in guest mode on SMM entry? */
509                 bool guest_mode;
510         } smm;
511 };
512
513 #define POSTED_INTR_ON  0
514 #define POSTED_INTR_SN  1
515
516 /* Posted-Interrupt Descriptor */
517 struct pi_desc {
518         u32 pir[8];     /* Posted interrupt requested */
519         union {
520                 struct {
521                                 /* bit 256 - Outstanding Notification */
522                         u16     on      : 1,
523                                 /* bit 257 - Suppress Notification */
524                                 sn      : 1,
525                                 /* bit 271:258 - Reserved */
526                                 rsvd_1  : 14;
527                                 /* bit 279:272 - Notification Vector */
528                         u8      nv;
529                                 /* bit 287:280 - Reserved */
530                         u8      rsvd_2;
531                                 /* bit 319:288 - Notification Destination */
532                         u32     ndst;
533                 };
534                 u64 control;
535         };
536         u32 rsvd[6];
537 } __aligned(64);
538
539 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
540 {
541         return test_and_set_bit(POSTED_INTR_ON,
542                         (unsigned long *)&pi_desc->control);
543 }
544
545 static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
546 {
547         return test_and_clear_bit(POSTED_INTR_ON,
548                         (unsigned long *)&pi_desc->control);
549 }
550
551 static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
552 {
553         return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
554 }
555
556 static inline void pi_clear_sn(struct pi_desc *pi_desc)
557 {
558         return clear_bit(POSTED_INTR_SN,
559                         (unsigned long *)&pi_desc->control);
560 }
561
562 static inline void pi_set_sn(struct pi_desc *pi_desc)
563 {
564         return set_bit(POSTED_INTR_SN,
565                         (unsigned long *)&pi_desc->control);
566 }
567
568 static inline void pi_clear_on(struct pi_desc *pi_desc)
569 {
570         clear_bit(POSTED_INTR_ON,
571                   (unsigned long *)&pi_desc->control);
572 }
573
574 static inline int pi_test_on(struct pi_desc *pi_desc)
575 {
576         return test_bit(POSTED_INTR_ON,
577                         (unsigned long *)&pi_desc->control);
578 }
579
580 static inline int pi_test_sn(struct pi_desc *pi_desc)
581 {
582         return test_bit(POSTED_INTR_SN,
583                         (unsigned long *)&pi_desc->control);
584 }
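
/*
 * Illustrative sketch (an assumption, not code taken from this file): the
 * accessors above are typically combined as below when posting a vector to
 * a vCPU, mirroring the delivery path implemented later in this file.  The
 * vector bit must be set in the PIR before ON is set, so that whoever
 * observes ON also sees the pending vector.
 */
static inline bool __example_pi_post_vector(struct pi_desc *pi_desc, int vector)
{
        if (pi_test_and_set_pir(vector, pi_desc))
                return false;   /* vector was already pending */

        /* True means ON went 0->1 and a notification should be sent. */
        return !pi_test_and_set_on(pi_desc);
}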
585
586 struct vcpu_vmx {
587         struct kvm_vcpu       vcpu;
588         unsigned long         host_rsp;
589         u8                    fail;
590         u8                    msr_bitmap_mode;
591         u32                   exit_intr_info;
592         u32                   idt_vectoring_info;
593         ulong                 rflags;
594         struct shared_msr_entry *guest_msrs;
595         int                   nmsrs;
596         int                   save_nmsrs;
597         unsigned long         host_idt_base;
598 #ifdef CONFIG_X86_64
599         u64                   msr_host_kernel_gs_base;
600         u64                   msr_guest_kernel_gs_base;
601 #endif
602         u32 vm_entry_controls_shadow;
603         u32 vm_exit_controls_shadow;
604         u32 secondary_exec_control;
605
606         /*
607          * loaded_vmcs points to the VMCS currently used in this vcpu. For a
608          * non-nested (L1) guest, it always points to vmcs01. For a nested
609          * guest (L2), it points to a different VMCS.
610          */
611         struct loaded_vmcs    vmcs01;
612         struct loaded_vmcs   *loaded_vmcs;
613         bool                  __launched; /* temporary, used in vmx_vcpu_run */
614         struct msr_autoload {
615                 unsigned nr;
616                 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
617                 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
618         } msr_autoload;
619         struct {
620                 int           loaded;
621                 u16           fs_sel, gs_sel, ldt_sel;
622 #ifdef CONFIG_X86_64
623                 u16           ds_sel, es_sel;
624 #endif
625                 int           gs_ldt_reload_needed;
626                 int           fs_reload_needed;
627                 u64           msr_host_bndcfgs;
628         } host_state;
629         struct {
630                 int vm86_active;
631                 ulong save_rflags;
632                 struct kvm_segment segs[8];
633         } rmode;
634         struct {
635                 u32 bitmask; /* 4 bits per segment (1 bit per field) */
636                 struct kvm_save_segment {
637                         u16 selector;
638                         unsigned long base;
639                         u32 limit;
640                         u32 ar;
641                 } seg[8];
642         } segment_cache;
643         int vpid;
644         bool emulation_required;
645
646         u32 exit_reason;
647
648         /* Posted interrupt descriptor */
649         struct pi_desc pi_desc;
650
651         /* Support for a guest hypervisor (nested VMX) */
652         struct nested_vmx nested;
653
654         /* Dynamic PLE window. */
655         int ple_window;
656         bool ple_window_dirty;
657
658         /* Support for PML */
659 #define PML_ENTITY_NUM          512
660         struct page *pml_pg;
661
662         /* apic deadline value in host tsc */
663         u64 hv_deadline_tsc;
664
665         u64 current_tsc_ratio;
666
667         u32 host_pkru;
668
669         unsigned long host_debugctlmsr;
670
671         /*
672          * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
673          * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
674          * in msr_ia32_feature_control_valid_bits.
675          */
676         u64 msr_ia32_feature_control;
677         u64 msr_ia32_feature_control_valid_bits;
678 };
679
680 enum segment_cache_field {
681         SEG_FIELD_SEL = 0,
682         SEG_FIELD_BASE = 1,
683         SEG_FIELD_LIMIT = 2,
684         SEG_FIELD_AR = 3,
685
686         SEG_FIELD_NR = 4
687 };
688
689 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
690 {
691         return container_of(vcpu, struct vcpu_vmx, vcpu);
692 }
693
694 static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
695 {
696         return &(to_vmx(vcpu)->pi_desc);
697 }
698
699 #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
700 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
701 #define FIELD(number, name)     [ROL16(number, 6)] = VMCS12_OFFSET(name)
702 #define FIELD64(number, name)                                           \
703         FIELD(number, name),                                            \
704         [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
705
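/*
 * For example (derived directly from the macros above), FIELD64(TSC_OFFSET,
 * tsc_offset) produces two table entries: one for TSC_OFFSET pointing at the
 * start of vmcs12->tsc_offset, and one for TSC_OFFSET_HIGH pointing 4 bytes
 * further in, i.e. at the upper half of the u64.  This relies on x86 being
 * little-endian, as noted in the vmcs12 definition.
 */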
706
707 static u16 shadow_read_only_fields[] = {
708 #define SHADOW_FIELD_RO(x) x,
709 #include "vmx_shadow_fields.h"
710 };
711 static int max_shadow_read_only_fields =
712         ARRAY_SIZE(shadow_read_only_fields);
713
714 static u16 shadow_read_write_fields[] = {
715 #define SHADOW_FIELD_RW(x) x,
716 #include "vmx_shadow_fields.h"
717 };
718 static int max_shadow_read_write_fields =
719         ARRAY_SIZE(shadow_read_write_fields);
720
721 static const unsigned short vmcs_field_to_offset_table[] = {
722         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
723         FIELD(POSTED_INTR_NV, posted_intr_nv),
724         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
725         FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
726         FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
727         FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
728         FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
729         FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
730         FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
731         FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
732         FIELD(GUEST_INTR_STATUS, guest_intr_status),
733         FIELD(GUEST_PML_INDEX, guest_pml_index),
734         FIELD(HOST_ES_SELECTOR, host_es_selector),
735         FIELD(HOST_CS_SELECTOR, host_cs_selector),
736         FIELD(HOST_SS_SELECTOR, host_ss_selector),
737         FIELD(HOST_DS_SELECTOR, host_ds_selector),
738         FIELD(HOST_FS_SELECTOR, host_fs_selector),
739         FIELD(HOST_GS_SELECTOR, host_gs_selector),
740         FIELD(HOST_TR_SELECTOR, host_tr_selector),
741         FIELD64(IO_BITMAP_A, io_bitmap_a),
742         FIELD64(IO_BITMAP_B, io_bitmap_b),
743         FIELD64(MSR_BITMAP, msr_bitmap),
744         FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
745         FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
746         FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
747         FIELD64(TSC_OFFSET, tsc_offset),
748         FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
749         FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
750         FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
751         FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
752         FIELD64(EPT_POINTER, ept_pointer),
753         FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
754         FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
755         FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
756         FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
757         FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
758         FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
759         FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
760         FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
761         FIELD64(PML_ADDRESS, pml_address),
762         FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
763         FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
764         FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
765         FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
766         FIELD64(GUEST_PDPTR0, guest_pdptr0),
767         FIELD64(GUEST_PDPTR1, guest_pdptr1),
768         FIELD64(GUEST_PDPTR2, guest_pdptr2),
769         FIELD64(GUEST_PDPTR3, guest_pdptr3),
770         FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
771         FIELD64(HOST_IA32_PAT, host_ia32_pat),
772         FIELD64(HOST_IA32_EFER, host_ia32_efer),
773         FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
774         FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
775         FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
776         FIELD(EXCEPTION_BITMAP, exception_bitmap),
777         FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
778         FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
779         FIELD(CR3_TARGET_COUNT, cr3_target_count),
780         FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
781         FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
782         FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
783         FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
784         FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
785         FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
786         FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
787         FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
788         FIELD(TPR_THRESHOLD, tpr_threshold),
789         FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
790         FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
791         FIELD(VM_EXIT_REASON, vm_exit_reason),
792         FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
793         FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
794         FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
795         FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
796         FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
797         FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
798         FIELD(GUEST_ES_LIMIT, guest_es_limit),
799         FIELD(GUEST_CS_LIMIT, guest_cs_limit),
800         FIELD(GUEST_SS_LIMIT, guest_ss_limit),
801         FIELD(GUEST_DS_LIMIT, guest_ds_limit),
802         FIELD(GUEST_FS_LIMIT, guest_fs_limit),
803         FIELD(GUEST_GS_LIMIT, guest_gs_limit),
804         FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
805         FIELD(GUEST_TR_LIMIT, guest_tr_limit),
806         FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
807         FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
808         FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
809         FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
810         FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
811         FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
812         FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
813         FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
814         FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
815         FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
816         FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
817         FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
818         FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
819         FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
820         FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
821         FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
822         FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
823         FIELD(CR0_READ_SHADOW, cr0_read_shadow),
824         FIELD(CR4_READ_SHADOW, cr4_read_shadow),
825         FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
826         FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
827         FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
828         FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
829         FIELD(EXIT_QUALIFICATION, exit_qualification),
830         FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
831         FIELD(GUEST_CR0, guest_cr0),
832         FIELD(GUEST_CR3, guest_cr3),
833         FIELD(GUEST_CR4, guest_cr4),
834         FIELD(GUEST_ES_BASE, guest_es_base),
835         FIELD(GUEST_CS_BASE, guest_cs_base),
836         FIELD(GUEST_SS_BASE, guest_ss_base),
837         FIELD(GUEST_DS_BASE, guest_ds_base),
838         FIELD(GUEST_FS_BASE, guest_fs_base),
839         FIELD(GUEST_GS_BASE, guest_gs_base),
840         FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
841         FIELD(GUEST_TR_BASE, guest_tr_base),
842         FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
843         FIELD(GUEST_IDTR_BASE, guest_idtr_base),
844         FIELD(GUEST_DR7, guest_dr7),
845         FIELD(GUEST_RSP, guest_rsp),
846         FIELD(GUEST_RIP, guest_rip),
847         FIELD(GUEST_RFLAGS, guest_rflags),
848         FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
849         FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
850         FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
851         FIELD(HOST_CR0, host_cr0),
852         FIELD(HOST_CR3, host_cr3),
853         FIELD(HOST_CR4, host_cr4),
854         FIELD(HOST_FS_BASE, host_fs_base),
855         FIELD(HOST_GS_BASE, host_gs_base),
856         FIELD(HOST_TR_BASE, host_tr_base),
857         FIELD(HOST_GDTR_BASE, host_gdtr_base),
858         FIELD(HOST_IDTR_BASE, host_idtr_base),
859         FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
860         FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
861         FIELD(HOST_RSP, host_rsp),
862         FIELD(HOST_RIP, host_rip),
863 };
864
865 static inline short vmcs_field_to_offset(unsigned long field)
866 {
867         unsigned index;
868
869         if (field >> 15)
870                 return -ENOENT;
871
872         index = ROL16(field, 6);
873         if (index >= ARRAY_SIZE(vmcs_field_to_offset_table))
874                 return -ENOENT;
875
876         /*
877          * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
878          * generic mechanism.
879          */
880         asm("lfence");
881
882         if (vmcs_field_to_offset_table[index] == 0)
883                 return -ENOENT;
884
885         return vmcs_field_to_offset_table[index];
886 }
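
/*
 * Illustrative sketch (an assumption, not code taken from this file): a
 * caller turns a field encoding into a byte offset and reads the cached
 * vmcs12 at that offset with the field's natural size; the shadow-VMCS copy
 * routines later in this file consume the table above in this way.  Shown
 * here for a 16-bit field only.
 */
static inline u16 __example_vmcs12_read_u16(struct vmcs12 *vmcs12,
                                            unsigned long field)
{
        short offset = vmcs_field_to_offset(field);

        if (offset < 0)
                return 0;
        return *(u16 *)((char *)vmcs12 + offset);
}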
887
888 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
889 {
890         return to_vmx(vcpu)->nested.cached_vmcs12;
891 }
892
893 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
894 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
895 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
896 static bool vmx_xsaves_supported(void);
897 static void vmx_set_segment(struct kvm_vcpu *vcpu,
898                             struct kvm_segment *var, int seg);
899 static void vmx_get_segment(struct kvm_vcpu *vcpu,
900                             struct kvm_segment *var, int seg);
901 static bool guest_state_valid(struct kvm_vcpu *vcpu);
902 static u32 vmx_segment_access_rights(struct kvm_segment *var);
903 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
904 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
905 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
906 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
907                                             u16 error_code);
908 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
909
910 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
911 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
912 /*
913  * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is
914  * needed when a CPU is brought down, so we can VMCLEAR all VMCSs loaded on it.
915  */
916 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
917
918 /*
919  * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
920  * can find which vCPU should be woken up.
921  */
922 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
923 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
924
925 enum {
926         VMX_VMREAD_BITMAP,
927         VMX_VMWRITE_BITMAP,
928         VMX_BITMAP_NR
929 };
930
931 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
932
933 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
934 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
935
936 static bool cpu_has_load_ia32_efer;
937 static bool cpu_has_load_perf_global_ctrl;
938
939 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
940 static DEFINE_SPINLOCK(vmx_vpid_lock);
941
942 static struct vmcs_config {
943         int size;
944         int order;
945         u32 basic_cap;
946         u32 revision_id;
947         u32 pin_based_exec_ctrl;
948         u32 cpu_based_exec_ctrl;
949         u32 cpu_based_2nd_exec_ctrl;
950         u32 vmexit_ctrl;
951         u32 vmentry_ctrl;
952 } vmcs_config;
953
954 static struct vmx_capability {
955         u32 ept;
956         u32 vpid;
957 } vmx_capability;
958
959 #define VMX_SEGMENT_FIELD(seg)                                  \
960         [VCPU_SREG_##seg] = {                                   \
961                 .selector = GUEST_##seg##_SELECTOR,             \
962                 .base = GUEST_##seg##_BASE,                     \
963                 .limit = GUEST_##seg##_LIMIT,                   \
964                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
965         }
966
967 static const struct kvm_vmx_segment_field {
968         unsigned selector;
969         unsigned base;
970         unsigned limit;
971         unsigned ar_bytes;
972 } kvm_vmx_segment_fields[] = {
973         VMX_SEGMENT_FIELD(CS),
974         VMX_SEGMENT_FIELD(DS),
975         VMX_SEGMENT_FIELD(ES),
976         VMX_SEGMENT_FIELD(FS),
977         VMX_SEGMENT_FIELD(GS),
978         VMX_SEGMENT_FIELD(SS),
979         VMX_SEGMENT_FIELD(TR),
980         VMX_SEGMENT_FIELD(LDTR),
981 };
982
983 static u64 host_efer;
984
985 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
986
987 /*
988  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
989  * away by decrementing the array size.
990  */
991 static const u32 vmx_msr_index[] = {
992 #ifdef CONFIG_X86_64
993         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
994 #endif
995         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
996 };
997
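/*
 * The helpers below decode the VM-exit interruption-information field.  Per
 * the SDM, bits 7:0 hold the vector, bits 10:8 the event type, and bit 31
 * the valid bit; masking all three at once lets each helper match "valid
 * hardware exception with this vector" in a single compare.
 */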
998 static inline bool is_exception_n(u32 intr_info, u8 vector)
999 {
1000         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1001                              INTR_INFO_VALID_MASK)) ==
1002                 (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
1003 }
1004
1005 static inline bool is_debug(u32 intr_info)
1006 {
1007         return is_exception_n(intr_info, DB_VECTOR);
1008 }
1009
1010 static inline bool is_breakpoint(u32 intr_info)
1011 {
1012         return is_exception_n(intr_info, BP_VECTOR);
1013 }
1014
1015 static inline bool is_page_fault(u32 intr_info)
1016 {
1017         return is_exception_n(intr_info, PF_VECTOR);
1018 }
1019
1020 static inline bool is_no_device(u32 intr_info)
1021 {
1022         return is_exception_n(intr_info, NM_VECTOR);
1023 }
1024
1025 static inline bool is_invalid_opcode(u32 intr_info)
1026 {
1027         return is_exception_n(intr_info, UD_VECTOR);
1028 }
1029
1030 static inline bool is_external_interrupt(u32 intr_info)
1031 {
1032         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1033                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1034 }
1035
1036 static inline bool is_machine_check(u32 intr_info)
1037 {
1038         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1039                              INTR_INFO_VALID_MASK)) ==
1040                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
1041 }
1042
1043 static inline bool cpu_has_vmx_msr_bitmap(void)
1044 {
1045         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
1046 }
1047
1048 static inline bool cpu_has_vmx_tpr_shadow(void)
1049 {
1050         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
1051 }
1052
1053 static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
1054 {
1055         return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
1056 }
1057
1058 static inline bool cpu_has_secondary_exec_ctrls(void)
1059 {
1060         return vmcs_config.cpu_based_exec_ctrl &
1061                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1062 }
1063
1064 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
1065 {
1066         return vmcs_config.cpu_based_2nd_exec_ctrl &
1067                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1068 }
1069
1070 static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
1071 {
1072         return vmcs_config.cpu_based_2nd_exec_ctrl &
1073                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1074 }
1075
1076 static inline bool cpu_has_vmx_apic_register_virt(void)
1077 {
1078         return vmcs_config.cpu_based_2nd_exec_ctrl &
1079                 SECONDARY_EXEC_APIC_REGISTER_VIRT;
1080 }
1081
1082 static inline bool cpu_has_vmx_virtual_intr_delivery(void)
1083 {
1084         return vmcs_config.cpu_based_2nd_exec_ctrl &
1085                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1086 }
1087
1088 /*
1089  * Comment format: document - errata name - stepping - processor name.
1090  * Taken from
1091  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1092  */
1093 static u32 vmx_preemption_cpu_tfms[] = {
1094 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
1095 0x000206E6,
1096 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
1097 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1098 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
1099 0x00020652,
1100 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
1101 0x00020655,
1102 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
1103 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
1104 /*
1105  * 320767.pdf - AAP86  - B1 -
1106  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1107  */
1108 0x000106E5,
1109 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
1110 0x000106A0,
1111 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
1112 0x000106A1,
1113 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
1114 0x000106A4,
1115  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
1116  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
1117  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
1118 0x000106A5,
1119 };
1120
1121 static inline bool cpu_has_broken_vmx_preemption_timer(void)
1122 {
1123         u32 eax = cpuid_eax(0x00000001), i;
1124
1125         /* Clear the reserved bits (15:14 and 31:28) of the CPUID.1 signature */
1126         eax &= ~(0x3U << 14 | 0xfU << 28);
1127         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
1128                 if (eax == vmx_preemption_cpu_tfms[i])
1129                         return true;
1130
1131         return false;
1132 }
1133
1134 static inline bool cpu_has_vmx_preemption_timer(void)
1135 {
1136         return vmcs_config.pin_based_exec_ctrl &
1137                 PIN_BASED_VMX_PREEMPTION_TIMER;
1138 }
1139
1140 static inline bool cpu_has_vmx_posted_intr(void)
1141 {
1142         return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1143                 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
1144 }
1145
1146 static inline bool cpu_has_vmx_apicv(void)
1147 {
1148         return cpu_has_vmx_apic_register_virt() &&
1149                 cpu_has_vmx_virtual_intr_delivery() &&
1150                 cpu_has_vmx_posted_intr();
1151 }
1152
1153 static inline bool cpu_has_vmx_flexpriority(void)
1154 {
1155         return cpu_has_vmx_tpr_shadow() &&
1156                 cpu_has_vmx_virtualize_apic_accesses();
1157 }
1158
1159 static inline bool cpu_has_vmx_ept_execute_only(void)
1160 {
1161         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
1162 }
1163
1164 static inline bool cpu_has_vmx_ept_2m_page(void)
1165 {
1166         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
1167 }
1168
1169 static inline bool cpu_has_vmx_ept_1g_page(void)
1170 {
1171         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
1172 }
1173
1174 static inline bool cpu_has_vmx_ept_4levels(void)
1175 {
1176         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1177 }
1178
1179 static inline bool cpu_has_vmx_ept_mt_wb(void)
1180 {
1181         return vmx_capability.ept & VMX_EPTP_WB_BIT;
1182 }
1183
1184 static inline bool cpu_has_vmx_ept_5levels(void)
1185 {
1186         return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
1187 }
1188
1189 static inline bool cpu_has_vmx_ept_ad_bits(void)
1190 {
1191         return vmx_capability.ept & VMX_EPT_AD_BIT;
1192 }
1193
1194 static inline bool cpu_has_vmx_invept_context(void)
1195 {
1196         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
1197 }
1198
1199 static inline bool cpu_has_vmx_invept_global(void)
1200 {
1201         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
1202 }
1203
1204 static inline bool cpu_has_vmx_invvpid_single(void)
1205 {
1206         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1207 }
1208
1209 static inline bool cpu_has_vmx_invvpid_global(void)
1210 {
1211         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1212 }
1213
1214 static inline bool cpu_has_vmx_invvpid(void)
1215 {
1216         return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
1217 }
1218
1219 static inline bool cpu_has_vmx_ept(void)
1220 {
1221         return vmcs_config.cpu_based_2nd_exec_ctrl &
1222                 SECONDARY_EXEC_ENABLE_EPT;
1223 }
1224
1225 static inline bool cpu_has_vmx_unrestricted_guest(void)
1226 {
1227         return vmcs_config.cpu_based_2nd_exec_ctrl &
1228                 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1229 }
1230
1231 static inline bool cpu_has_vmx_ple(void)
1232 {
1233         return vmcs_config.cpu_based_2nd_exec_ctrl &
1234                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1235 }
1236
1237 static inline bool cpu_has_vmx_basic_inout(void)
1238 {
1239         return  (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
1240 }
1241
1242 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1243 {
1244         return flexpriority_enabled && lapic_in_kernel(vcpu);
1245 }
1246
1247 static inline bool cpu_has_vmx_vpid(void)
1248 {
1249         return vmcs_config.cpu_based_2nd_exec_ctrl &
1250                 SECONDARY_EXEC_ENABLE_VPID;
1251 }
1252
1253 static inline bool cpu_has_vmx_rdtscp(void)
1254 {
1255         return vmcs_config.cpu_based_2nd_exec_ctrl &
1256                 SECONDARY_EXEC_RDTSCP;
1257 }
1258
1259 static inline bool cpu_has_vmx_invpcid(void)
1260 {
1261         return vmcs_config.cpu_based_2nd_exec_ctrl &
1262                 SECONDARY_EXEC_ENABLE_INVPCID;
1263 }
1264
1265 static inline bool cpu_has_virtual_nmis(void)
1266 {
1267         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1268 }
1269
1270 static inline bool cpu_has_vmx_wbinvd_exit(void)
1271 {
1272         return vmcs_config.cpu_based_2nd_exec_ctrl &
1273                 SECONDARY_EXEC_WBINVD_EXITING;
1274 }
1275
1276 static inline bool cpu_has_vmx_shadow_vmcs(void)
1277 {
1278         u64 vmx_msr;
1279         rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1280         /* check if the cpu supports writing r/o exit information fields */
1281         if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1282                 return false;
1283
1284         return vmcs_config.cpu_based_2nd_exec_ctrl &
1285                 SECONDARY_EXEC_SHADOW_VMCS;
1286 }
1287
1288 static inline bool cpu_has_vmx_pml(void)
1289 {
1290         return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1291 }
1292
1293 static inline bool cpu_has_vmx_tsc_scaling(void)
1294 {
1295         return vmcs_config.cpu_based_2nd_exec_ctrl &
1296                 SECONDARY_EXEC_TSC_SCALING;
1297 }
1298
1299 static inline bool cpu_has_vmx_vmfunc(void)
1300 {
1301         return vmcs_config.cpu_based_2nd_exec_ctrl &
1302                 SECONDARY_EXEC_ENABLE_VMFUNC;
1303 }
1304
1305 static inline bool report_flexpriority(void)
1306 {
1307         return flexpriority_enabled;
1308 }
1309
1310 static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
1311 {
1312         return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low);
1313 }
1314
1315 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1316 {
1317         return vmcs12->cpu_based_vm_exec_control & bit;
1318 }
1319
1320 static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1321 {
1322         return (vmcs12->cpu_based_vm_exec_control &
1323                         CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1324                 (vmcs12->secondary_vm_exec_control & bit);
1325 }
1326
1327 static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1328 {
1329         return vmcs12->pin_based_vm_exec_control &
1330                 PIN_BASED_VMX_PREEMPTION_TIMER;
1331 }
1332
1333 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1334 {
1335         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1336 }
1337
1338 static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1339 {
1340         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
1341 }
1342
1343 static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
1344 {
1345         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
1346 }
1347
1348 static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1349 {
1350         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1351 }
1352
1353 static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
1354 {
1355         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
1356 }
1357
1358 static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1359 {
1360         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
1361 }
1362
1363 static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
1364 {
1365         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1366 }
1367
1368 static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1369 {
1370         return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1371 }
1372
1373 static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
1374 {
1375         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
1376 }
1377
1378 static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
1379 {
1380         return nested_cpu_has_vmfunc(vmcs12) &&
1381                 (vmcs12->vm_function_control &
1382                  VMX_VMFUNC_EPTP_SWITCHING);
1383 }
1384
1385 static inline bool is_nmi(u32 intr_info)
1386 {
1387         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1388                 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
1389 }
1390
1391 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1392                               u32 exit_intr_info,
1393                               unsigned long exit_qualification);
1394 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1395                         struct vmcs12 *vmcs12,
1396                         u32 reason, unsigned long qualification);
1397
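/*
 * Note (derived from the vcpu setup code later in this file): the "index"
 * field of a struct shared_msr_entry is an index into the vmx_msr_index[]
 * table above, not a raw MSR number, which is why the lookup below goes
 * through vmx_msr_index[] before comparing against "msr".
 */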
1398 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1399 {
1400         int i;
1401
1402         for (i = 0; i < vmx->nmsrs; ++i)
1403                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1404                         return i;
1405         return -1;
1406 }
1407
1408 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
1409 {
1410         struct {
1411                 u64 vpid : 16;
1412                 u64 rsvd : 48;
1413                 u64 gva;
1414         } operand = { vpid, 0, gva };
1415
1416         asm volatile (__ex(ASM_VMX_INVVPID)
1417                       /* CF==1 or ZF==1 --> rc = -1 */
1418                       "; ja 1f ; ud2 ; 1:"
1419                       : : "a"(&operand), "c"(ext) : "cc", "memory");
1420 }
1421
1422 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
1423 {
1424         struct {
1425                 u64 eptp, gpa;
1426         } operand = {eptp, gpa};
1427
1428         asm volatile (__ex(ASM_VMX_INVEPT)
1429                         /* CF==1 or ZF==1 --> rc = -1 */
1430                         "; ja 1f ; ud2 ; 1:\n"
1431                         : : "a" (&operand), "c" (ext) : "cc", "memory");
1432 }
1433
1434 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1435 {
1436         int i;
1437
1438         i = __find_msr_index(vmx, msr);
1439         if (i >= 0)
1440                 return &vmx->guest_msrs[i];
1441         return NULL;
1442 }
1443
1444 static void vmcs_clear(struct vmcs *vmcs)
1445 {
1446         u64 phys_addr = __pa(vmcs);
1447         u8 error;
1448
1449         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
1450                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1451                       : "cc", "memory");
1452         if (error)
1453                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
1454                        vmcs, phys_addr);
1455 }
1456
1457 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1458 {
1459         vmcs_clear(loaded_vmcs->vmcs);
1460         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
1461                 vmcs_clear(loaded_vmcs->shadow_vmcs);
1462         loaded_vmcs->cpu = -1;
1463         loaded_vmcs->launched = 0;
1464 }
1465
1466 static void vmcs_load(struct vmcs *vmcs)
1467 {
1468         u64 phys_addr = __pa(vmcs);
1469         u8 error;
1470
1471         asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
1472                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1473                         : "cc", "memory");
1474         if (error)
1475                 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
1476                        vmcs, phys_addr);
1477 }
1478
1479 #ifdef CONFIG_KEXEC_CORE
1480 /*
1481  * This bitmap indicates, per CPU, whether VMCLEAR of the VMCSs loaded
1482  * on that CPU is enabled in the crash/kexec path. It is disabled on
1483  * all CPUs by default.
1484  */
1485 static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1486
1487 static inline void crash_enable_local_vmclear(int cpu)
1488 {
1489         cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1490 }
1491
1492 static inline void crash_disable_local_vmclear(int cpu)
1493 {
1494         cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1495 }
1496
1497 static inline int crash_local_vmclear_enabled(int cpu)
1498 {
1499         return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1500 }
1501
1502 static void crash_vmclear_local_loaded_vmcss(void)
1503 {
1504         int cpu = raw_smp_processor_id();
1505         struct loaded_vmcs *v;
1506
1507         if (!crash_local_vmclear_enabled(cpu))
1508                 return;
1509
1510         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1511                             loaded_vmcss_on_cpu_link)
1512                 vmcs_clear(v->vmcs);
1513 }
1514 #else
1515 static inline void crash_enable_local_vmclear(int cpu) { }
1516 static inline void crash_disable_local_vmclear(int cpu) { }
1517 #endif /* CONFIG_KEXEC_CORE */
1518
1519 static void __loaded_vmcs_clear(void *arg)
1520 {
1521         struct loaded_vmcs *loaded_vmcs = arg;
1522         int cpu = raw_smp_processor_id();
1523
1524         if (loaded_vmcs->cpu != cpu)
1525                 return; /* vcpu migration can race with cpu offline */
1526         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1527                 per_cpu(current_vmcs, cpu) = NULL;
1528         crash_disable_local_vmclear(cpu);
1529         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1530
1531         /*
1532          * We should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
1533          * happens before setting loaded_vmcs->cpu to -1, which is done in
1534          * loaded_vmcs_init. Otherwise, another cpu could see cpu == -1 first
1535          * and then add the vmcs to the per-cpu list before it is deleted here.
1536          */
1537         smp_wmb();
1538
1539         loaded_vmcs_init(loaded_vmcs);
1540         crash_enable_local_vmclear(cpu);
1541 }
1542
1543 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1544 {
1545         int cpu = loaded_vmcs->cpu;
1546
1547         if (cpu != -1)
1548                 smp_call_function_single(cpu,
1549                          __loaded_vmcs_clear, loaded_vmcs, 1);
1550 }
1551
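/*
 * The vpid_sync_*() helpers below invalidate TLB entries tagged with the
 * given VPID via INVVPID, preferring the single-context (per-VPID) flush
 * when the CPU supports it and falling back to an all-context flush
 * otherwise.
 */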
1552 static inline void vpid_sync_vcpu_single(int vpid)
1553 {
1554         if (vpid == 0)
1555                 return;
1556
1557         if (cpu_has_vmx_invvpid_single())
1558                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
1559 }
1560
1561 static inline void vpid_sync_vcpu_global(void)
1562 {
1563         if (cpu_has_vmx_invvpid_global())
1564                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1565 }
1566
1567 static inline void vpid_sync_context(int vpid)
1568 {
1569         if (cpu_has_vmx_invvpid_single())
1570                 vpid_sync_vcpu_single(vpid);
1571         else
1572                 vpid_sync_vcpu_global();
1573 }
1574
1575 static inline void ept_sync_global(void)
1576 {
1577         __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1578 }
1579
1580 static inline void ept_sync_context(u64 eptp)
1581 {
1582         if (cpu_has_vmx_invept_context())
1583                 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1584         else
1585                 ept_sync_global();
1586 }
1587
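/*
 * VMCS field encodings use bits 14:13 for the field width (0 = 16-bit,
 * 1 = 64-bit, 2 = 32-bit, 3 = natural width) and bit 0 for the "high" half
 * of a 64-bit field; the 0x6000/0x6001 masks in the compile-time checks
 * below test exactly those bits.
 */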
1588 static __always_inline void vmcs_check16(unsigned long field)
1589 {
1590         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
1591                          "16-bit accessor invalid for 64-bit field");
1592         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1593                          "16-bit accessor invalid for 64-bit high field");
1594         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1595                          "16-bit accessor invalid for 32-bit high field");
1596         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1597                          "16-bit accessor invalid for natural width field");
1598 }
1599
1600 static __always_inline void vmcs_check32(unsigned long field)
1601 {
1602         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1603                          "32-bit accessor invalid for 16-bit field");
1604         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1605                          "32-bit accessor invalid for natural width field");
1606 }
1607
1608 static __always_inline void vmcs_check64(unsigned long field)
1609 {
1610         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1611                          "64-bit accessor invalid for 16-bit field");
1612         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1613                          "64-bit accessor invalid for 64-bit high field");
1614         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1615                          "64-bit accessor invalid for 32-bit field");
1616         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1617                          "64-bit accessor invalid for natural width field");
1618 }
1619
1620 static __always_inline void vmcs_checkl(unsigned long field)
1621 {
1622         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1623                          "Natural width accessor invalid for 16-bit field");
1624         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
1625                          "Natural width accessor invalid for 64-bit field");
1626         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1627                          "Natural width accessor invalid for 64-bit high field");
1628         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1629                          "Natural width accessor invalid for 32-bit field");
1630 }
1631
1632 static __always_inline unsigned long __vmcs_readl(unsigned long field)
1633 {
1634         unsigned long value;
1635
1636         asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
1637                       : "=a"(value) : "d"(field) : "cc");
1638         return value;
1639 }
1640
1641 static __always_inline u16 vmcs_read16(unsigned long field)
1642 {
1643         vmcs_check16(field);
1644         return __vmcs_readl(field);
1645 }
1646
1647 static __always_inline u32 vmcs_read32(unsigned long field)
1648 {
1649         vmcs_check32(field);
1650         return __vmcs_readl(field);
1651 }
1652
1653 static __always_inline u64 vmcs_read64(unsigned long field)
1654 {
1655         vmcs_check64(field);
1656 #ifdef CONFIG_X86_64
1657         return __vmcs_readl(field);
1658 #else
1659         return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
1660 #endif
1661 }
1662
1663 static __always_inline unsigned long vmcs_readl(unsigned long field)
1664 {
1665         vmcs_checkl(field);
1666         return __vmcs_readl(field);
1667 }
1668
1669 static noinline void vmwrite_error(unsigned long field, unsigned long value)
1670 {
1671         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
1672                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1673         dump_stack();
1674 }
1675
1676 static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
1677 {
1678         u8 error;
1679
1680         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
1681                        : "=q"(error) : "a"(value), "d"(field) : "cc");
1682         if (unlikely(error))
1683                 vmwrite_error(field, value);
1684 }
1685
1686 static __always_inline void vmcs_write16(unsigned long field, u16 value)
1687 {
1688         vmcs_check16(field);
1689         __vmcs_writel(field, value);
1690 }
1691
1692 static __always_inline void vmcs_write32(unsigned long field, u32 value)
1693 {
1694         vmcs_check32(field);
1695         __vmcs_writel(field, value);
1696 }
1697
1698 static __always_inline void vmcs_write64(unsigned long field, u64 value)
1699 {
1700         vmcs_check64(field);
1701         __vmcs_writel(field, value);
1702 #ifndef CONFIG_X86_64
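        /*
         * On 32-bit hosts a 64-bit field is written as two 32-bit
         * VMWRITEs; the empty asm below appears to serve only as a
         * compiler barrier between the two halves.
         */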
1703         asm volatile ("");
1704         __vmcs_writel(field+1, value >> 32);
1705 #endif
1706 }
1707
1708 static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
1709 {
1710         vmcs_checkl(field);
1711         __vmcs_writel(field, value);
1712 }
1713
1714 static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
1715 {
1716         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
1717                          "vmcs_clear_bits does not support 64-bit fields");
1718         __vmcs_writel(field, __vmcs_readl(field) & ~mask);
1719 }
1720
1721 static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
1722 {
1723         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
1724                          "vmcs_set_bits does not support 64-bit fields");
1725         __vmcs_writel(field, __vmcs_readl(field) | mask);
1726 }
1727
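/*
 * VM_ENTRY_CONTROLS and VM_EXIT_CONTROLS are shadowed in struct vcpu_vmx so
 * that the read/modify/write helpers below can skip redundant VMREADs and
 * VMWRITEs.
 */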
1728 static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
1729 {
1730         vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
1731 }
1732
1733 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
1734 {
1735         vmcs_write32(VM_ENTRY_CONTROLS, val);
1736         vmx->vm_entry_controls_shadow = val;
1737 }
1738
1739 static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
1740 {
1741         if (vmx->vm_entry_controls_shadow != val)
1742                 vm_entry_controls_init(vmx, val);
1743 }
1744
1745 static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
1746 {
1747         return vmx->vm_entry_controls_shadow;
1748 }
1749
1750
1751 static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1752 {
1753         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
1754 }
1755
1756 static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1757 {
1758         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
1759 }
1760
1761 static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
1762 {
1763         vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
1764 }
1765
1766 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
1767 {
1768         vmcs_write32(VM_EXIT_CONTROLS, val);
1769         vmx->vm_exit_controls_shadow = val;
1770 }
1771
1772 static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
1773 {
1774         if (vmx->vm_exit_controls_shadow != val)
1775                 vm_exit_controls_init(vmx, val);
1776 }
1777
1778 static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
1779 {
1780         return vmx->vm_exit_controls_shadow;
1781 }
1782
1783
1784 static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1785 {
1786         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
1787 }
1788
1789 static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1790 {
1791         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
1792 }
1793
1794 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1795 {
1796         vmx->segment_cache.bitmask = 0;
1797 }
1798
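/*
 * Returns whether the given segment field is already cached, and marks it
 * cached either way; the whole cache is dropped whenever VCPU_EXREG_SEGMENTS
 * is not present in regs_avail.
 */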
1799 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1800                                        unsigned field)
1801 {
1802         bool ret;
1803         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1804
1805         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
1806                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
1807                 vmx->segment_cache.bitmask = 0;
1808         }
1809         ret = vmx->segment_cache.bitmask & mask;
1810         vmx->segment_cache.bitmask |= mask;
1811         return ret;
1812 }
1813
1814 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1815 {
1816         u16 *p = &vmx->segment_cache.seg[seg].selector;
1817
1818         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1819                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1820         return *p;
1821 }
1822
1823 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1824 {
1825         ulong *p = &vmx->segment_cache.seg[seg].base;
1826
1827         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1828                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1829         return *p;
1830 }
1831
1832 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1833 {
1834         u32 *p = &vmx->segment_cache.seg[seg].limit;
1835
1836         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1837                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1838         return *p;
1839 }
1840
1841 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1842 {
1843         u32 *p = &vmx->segment_cache.seg[seg].ar;
1844
1845         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1846                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1847         return *p;
1848 }
1849
1850 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1851 {
1852         u32 eb;
1853
1854         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1855              (1u << DB_VECTOR) | (1u << AC_VECTOR);
1856         if ((vcpu->guest_debug &
1857              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1858             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1859                 eb |= 1u << BP_VECTOR;
1860         if (to_vmx(vcpu)->rmode.vm86_active)
1861                 eb = ~0;
1862         if (enable_ept)
1863                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1864
1865         /* When we are running a nested L2 guest and L1 specified for it a
1866          * certain exception bitmap, we must trap the same exceptions and pass
1867          * them to L1. When running L2, we will only handle the exceptions
1868          * specified above if L1 did not want them.
1869          */
1870         if (is_guest_mode(vcpu))
1871                 eb |= get_vmcs12(vcpu)->exception_bitmap;
1872
1873         vmcs_write32(EXCEPTION_BITMAP, eb);
1874 }
1875
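/*
 * The helpers below manage the VM-entry/VM-exit MSR-load lists, which make
 * the CPU switch a small set of MSRs atomically on VM entry and VM exit.
 * MSRs that have dedicated VM-entry/VM-exit "load" controls (EFER,
 * IA32_PERF_GLOBAL_CTRL) use those controls instead when available.
 */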
1876 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1877                 unsigned long entry, unsigned long exit)
1878 {
1879         vm_entry_controls_clearbit(vmx, entry);
1880         vm_exit_controls_clearbit(vmx, exit);
1881 }
1882
1883 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1884 {
1885         unsigned i;
1886         struct msr_autoload *m = &vmx->msr_autoload;
1887
1888         switch (msr) {
1889         case MSR_EFER:
1890                 if (cpu_has_load_ia32_efer) {
1891                         clear_atomic_switch_msr_special(vmx,
1892                                         VM_ENTRY_LOAD_IA32_EFER,
1893                                         VM_EXIT_LOAD_IA32_EFER);
1894                         return;
1895                 }
1896                 break;
1897         case MSR_CORE_PERF_GLOBAL_CTRL:
1898                 if (cpu_has_load_perf_global_ctrl) {
1899                         clear_atomic_switch_msr_special(vmx,
1900                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1901                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1902                         return;
1903                 }
1904                 break;
1905         }
1906
1907         for (i = 0; i < m->nr; ++i)
1908                 if (m->guest[i].index == msr)
1909                         break;
1910
1911         if (i == m->nr)
1912                 return;
1913         --m->nr;
1914         m->guest[i] = m->guest[m->nr];
1915         m->host[i] = m->host[m->nr];
1916         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1917         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1918 }
1919
1920 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1921                 unsigned long entry, unsigned long exit,
1922                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1923                 u64 guest_val, u64 host_val)
1924 {
1925         vmcs_write64(guest_val_vmcs, guest_val);
1926         vmcs_write64(host_val_vmcs, host_val);
1927         vm_entry_controls_setbit(vmx, entry);
1928         vm_exit_controls_setbit(vmx, exit);
1929 }
1930
1931 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1932                                   u64 guest_val, u64 host_val)
1933 {
1934         unsigned i;
1935         struct msr_autoload *m = &vmx->msr_autoload;
1936
1937         switch (msr) {
1938         case MSR_EFER:
1939                 if (cpu_has_load_ia32_efer) {
1940                         add_atomic_switch_msr_special(vmx,
1941                                         VM_ENTRY_LOAD_IA32_EFER,
1942                                         VM_EXIT_LOAD_IA32_EFER,
1943                                         GUEST_IA32_EFER,
1944                                         HOST_IA32_EFER,
1945                                         guest_val, host_val);
1946                         return;
1947                 }
1948                 break;
1949         case MSR_CORE_PERF_GLOBAL_CTRL:
1950                 if (cpu_has_load_perf_global_ctrl) {
1951                         add_atomic_switch_msr_special(vmx,
1952                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1953                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1954                                         GUEST_IA32_PERF_GLOBAL_CTRL,
1955                                         HOST_IA32_PERF_GLOBAL_CTRL,
1956                                         guest_val, host_val);
1957                         return;
1958                 }
1959                 break;
1960         case MSR_IA32_PEBS_ENABLE:
1961                 /* PEBS needs a quiescent period after being disabled (to write
1962                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
1963                  * provide that period, so a CPU could write host's record into
1964                  * guest's memory.
1965                  */
1966                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1967         }
1968
1969         for (i = 0; i < m->nr; ++i)
1970                 if (m->guest[i].index == msr)
1971                         break;
1972
1973         if (i == NR_AUTOLOAD_MSRS) {
1974                 printk_once(KERN_WARNING "Not enough msr switch entries. "
1975                                 "Can't add msr %x\n", msr);
1976                 return;
1977         } else if (i == m->nr) {
1978                 ++m->nr;
1979                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1980                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1981         }
1982
1983         m->guest[i].index = msr;
1984         m->guest[i].value = guest_val;
1985         m->host[i].index = msr;
1986         m->host[i].value = host_val;
1987 }
1988
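/*
 * Decide how EFER is switched across VM entry/exit: returns true when EFER
 * can be handled via the shared (user-return) MSR mechanism, with the bits
 * that may safely stay at the host value masked out of the guest value, and
 * false when EFER must instead be switched atomically through the VMCS.
 */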
1989 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1990 {
1991         u64 guest_efer = vmx->vcpu.arch.efer;
1992         u64 ignore_bits = 0;
1993
1994         if (!enable_ept) {
1995                 /*
1996                  * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
1997                  * host CPUID is more efficient than testing guest CPUID
1998                  * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
1999                  */
2000                 if (boot_cpu_has(X86_FEATURE_SMEP))
2001                         guest_efer |= EFER_NX;
2002                 else if (!(guest_efer & EFER_NX))
2003                         ignore_bits |= EFER_NX;
2004         }
2005
2006         /*
2007          * LMA and LME handled by hardware; SCE meaningless outside long mode.
2008          */
2009         ignore_bits |= EFER_SCE;
2010 #ifdef CONFIG_X86_64
2011         ignore_bits |= EFER_LMA | EFER_LME;
2012         /* SCE is meaningful only in long mode on Intel */
2013         if (guest_efer & EFER_LMA)
2014                 ignore_bits &= ~(u64)EFER_SCE;
2015 #endif
2016
2017         clear_atomic_switch_msr(vmx, MSR_EFER);
2018
2019         /*
2020          * On EPT, we can't emulate NX, so we must switch EFER atomically.
2021          * On CPUs that support "load IA32_EFER", always switch EFER
2022          * atomically, since it's faster than switching it manually.
2023          */
2024         if (cpu_has_load_ia32_efer ||
2025             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
2026                 if (!(guest_efer & EFER_LMA))
2027                         guest_efer &= ~EFER_LME;
2028                 if (guest_efer != host_efer)
2029                         add_atomic_switch_msr(vmx, MSR_EFER,
2030                                               guest_efer, host_efer);
2031                 return false;
2032         } else {
2033                 guest_efer &= ~ignore_bits;
2034                 guest_efer |= host_efer & ignore_bits;
2035
2036                 vmx->guest_msrs[efer_offset].data = guest_efer;
2037                 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
2038
2039                 return true;
2040         }
2041 }
2042
2043 #ifdef CONFIG_X86_32
2044 /*
2045  * On 32-bit kernels, VM exits still load the FS and GS bases from the
2046  * VMCS rather than the segment table.  KVM uses this helper to figure
2047  * out the current bases to poke them into the VMCS before entry.
2048  */
2049 static unsigned long segment_base(u16 selector)
2050 {
2051         struct desc_struct *table;
2052         unsigned long v;
2053
2054         if (!(selector & ~SEGMENT_RPL_MASK))
2055                 return 0;
2056
2057         table = get_current_gdt_ro();
2058
2059         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2060                 u16 ldt_selector = kvm_read_ldt();
2061
2062                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2063                         return 0;
2064
2065                 table = (struct desc_struct *)segment_base(ldt_selector);
2066         }
2067         v = get_desc_base(&table[selector >> 3]);
2068         return v;
2069 }
2070 #endif
2071
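/*
 * Save the host segment selectors and MSRs that have to be restored by hand
 * after a VM exit, and switch the shared (user-return) MSRs to their guest
 * values.
 */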
2072 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
2073 {
2074         struct vcpu_vmx *vmx = to_vmx(vcpu);
2075         int i;
2076
2077         if (vmx->host_state.loaded)
2078                 return;
2079
2080         vmx->host_state.loaded = 1;
2081         /*
2082          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2083          * allow segment selectors with cpl > 0 or ti == 1.
2084          */
2085         vmx->host_state.ldt_sel = kvm_read_ldt();
2086         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
2087         savesegment(fs, vmx->host_state.fs_sel);
2088         if (!(vmx->host_state.fs_sel & 7)) {
2089                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
2090                 vmx->host_state.fs_reload_needed = 0;
2091         } else {
2092                 vmcs_write16(HOST_FS_SELECTOR, 0);
2093                 vmx->host_state.fs_reload_needed = 1;
2094         }
2095         savesegment(gs, vmx->host_state.gs_sel);
2096         if (!(vmx->host_state.gs_sel & 7))
2097                 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
2098         else {
2099                 vmcs_write16(HOST_GS_SELECTOR, 0);
2100                 vmx->host_state.gs_ldt_reload_needed = 1;
2101         }
2102
2103 #ifdef CONFIG_X86_64
2104         savesegment(ds, vmx->host_state.ds_sel);
2105         savesegment(es, vmx->host_state.es_sel);
2106 #endif
2107
2108 #ifdef CONFIG_X86_64
2109         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
2110         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2111 #else
2112         vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
2113         vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
2114 #endif
2115
2116 #ifdef CONFIG_X86_64
2117         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2118         if (is_long_mode(&vmx->vcpu))
2119                 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2120 #endif
2121         if (boot_cpu_has(X86_FEATURE_MPX))
2122                 rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
2123         for (i = 0; i < vmx->save_nmsrs; ++i)
2124                 kvm_set_shared_msr(vmx->guest_msrs[i].index,
2125                                    vmx->guest_msrs[i].data,
2126                                    vmx->guest_msrs[i].mask);
2127 }
2128
2129 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
2130 {
2131         if (!vmx->host_state.loaded)
2132                 return;
2133
2134         ++vmx->vcpu.stat.host_state_reload;
2135         vmx->host_state.loaded = 0;
2136 #ifdef CONFIG_X86_64
2137         if (is_long_mode(&vmx->vcpu))
2138                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2139 #endif
2140         if (vmx->host_state.gs_ldt_reload_needed) {
2141                 kvm_load_ldt(vmx->host_state.ldt_sel);
2142 #ifdef CONFIG_X86_64
2143                 load_gs_index(vmx->host_state.gs_sel);
2144 #else
2145                 loadsegment(gs, vmx->host_state.gs_sel);
2146 #endif
2147         }
2148         if (vmx->host_state.fs_reload_needed)
2149                 loadsegment(fs, vmx->host_state.fs_sel);
2150 #ifdef CONFIG_X86_64
2151         if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
2152                 loadsegment(ds, vmx->host_state.ds_sel);
2153                 loadsegment(es, vmx->host_state.es_sel);
2154         }
2155 #endif
2156         invalidate_tss_limit();
2157 #ifdef CONFIG_X86_64
2158         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2159 #endif
2160         if (vmx->host_state.msr_host_bndcfgs)
2161                 wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
2162         load_fixmap_gdt(raw_smp_processor_id());
2163 }
2164
2165 static void vmx_load_host_state(struct vcpu_vmx *vmx)
2166 {
2167         preempt_disable();
2168         __vmx_load_host_state(vmx);
2169         preempt_enable();
2170 }
2171
2172 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
2173 {
2174         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2175         struct pi_desc old, new;
2176         unsigned int dest;
2177
2178         /*
2179          * In case of hot-plug or hot-unplug, we may have to undo
2180          * vmx_vcpu_pi_put even if there is no assigned device.  And we
2181          * always keep PI.NDST up to date for simplicity: it makes the
2182          * code easier, and CPU migration is not a fast path.
2183          */
2184         if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
2185                 return;
2186
2187         /*
2188          * First handle the simple case where no cmpxchg is necessary; just
2189          * allow posting non-urgent interrupts.
2190          *
2191          * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
2192          * PI.NDST: pi_post_block will do it for us and the wakeup_handler
2193          * expects the VCPU to be on the blocked_vcpu_list that matches
2194          * PI.NDST.
2195          */
2196         if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
2197             vcpu->cpu == cpu) {
2198                 pi_clear_sn(pi_desc);
2199                 return;
2200         }
2201
2202         /* The full case.  */
2203         do {
2204                 old.control = new.control = pi_desc->control;
2205
2206                 dest = cpu_physical_id(cpu);
2207
2208                 if (x2apic_enabled())
2209                         new.ndst = dest;
2210                 else
2211                         new.ndst = (dest << 8) & 0xFF00;
2212
2213                 new.sn = 0;
2214         } while (cmpxchg64(&pi_desc->control, old.control,
2215                            new.control) != old.control);
2216 }
2217
2218 static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
2219 {
2220         vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
2221         vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
2222 }
2223
2224 /*
2225  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
2226  * vcpu mutex is already taken.
2227  */
2228 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2229 {
2230         struct vcpu_vmx *vmx = to_vmx(vcpu);
2231         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
2232
2233         if (!already_loaded) {
2234                 loaded_vmcs_clear(vmx->loaded_vmcs);
2235                 local_irq_disable();
2236                 crash_disable_local_vmclear(cpu);
2237
2238                 /*
2239                  * The read of loaded_vmcs->cpu must happen before fetching
2240                  * loaded_vmcs->loaded_vmcss_on_cpu_link.
2241                  * See the comments in __loaded_vmcs_clear().
2242                  */
2243                 smp_rmb();
2244
2245                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
2246                          &per_cpu(loaded_vmcss_on_cpu, cpu));
2247                 crash_enable_local_vmclear(cpu);
2248                 local_irq_enable();
2249         }
2250
2251         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2252                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2253                 vmcs_load(vmx->loaded_vmcs->vmcs);
2254         }
2255
2256         if (!already_loaded) {
2257                 void *gdt = get_current_gdt_ro();
2258                 unsigned long sysenter_esp;
2259
2260                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2261
2262                 /*
2263                  * Linux uses per-cpu TSS and GDT, so set these when switching
2264                  * processors.  See 22.2.4.
2265                  */
2266                 vmcs_writel(HOST_TR_BASE,
2267                             (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
2268                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
2269
2270                 /*
2271                  * VM exits change the host TR limit to 0x67 after a VM
2272                  * exit.  This is okay, since 0x67 covers everything except
2273                  * the IO bitmap and we have code to handle the IO bitmap
2274                  * being lost after a VM exit.
2275                  */
2276                 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
2277
2278                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
2279                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
2280
2281                 vmx->loaded_vmcs->cpu = cpu;
2282         }
2283
2284         /* Setup TSC multiplier */
2285         if (kvm_has_tsc_control &&
2286             vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
2287                 decache_tsc_multiplier(vmx);
2288
2289         vmx_vcpu_pi_load(vcpu, cpu);
2290         vmx->host_pkru = read_pkru();
2291         vmx->host_debugctlmsr = get_debugctlmsr();
2292 }
2293
2294 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
2295 {
2296         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2297
2298         if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
2299                 !irq_remapping_cap(IRQ_POSTING_CAP)  ||
2300                 !kvm_vcpu_apicv_active(vcpu))
2301                 return;
2302
2303         /* Set SN when the vCPU is preempted */
2304         if (vcpu->preempted)
2305                 pi_set_sn(pi_desc);
2306 }
2307
2308 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
2309 {
2310         vmx_vcpu_pi_put(vcpu);
2311
2312         __vmx_load_host_state(to_vmx(vcpu));
2313 }
2314
2315 static bool emulation_required(struct kvm_vcpu *vcpu)
2316 {
2317         return emulate_invalid_guest_state && !guest_state_valid(vcpu);
2318 }
2319
2320 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
2321
2322 /*
2323  * Return the cr0 value that a nested guest would read. This is a combination
2324  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
2325  * its hypervisor (cr0_read_shadow).
2326  */
2327 static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
2328 {
2329         return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
2330                 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
2331 }
2332 static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
2333 {
2334         return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
2335                 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
2336 }
2337
2338 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2339 {
2340         unsigned long rflags, save_rflags;
2341
2342         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
2343                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2344                 rflags = vmcs_readl(GUEST_RFLAGS);
2345                 if (to_vmx(vcpu)->rmode.vm86_active) {
2346                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2347                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
2348                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2349                 }
2350                 to_vmx(vcpu)->rflags = rflags;
2351         }
2352         return to_vmx(vcpu)->rflags;
2353 }
2354
2355 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
2356 {
2357         unsigned long old_rflags = vmx_get_rflags(vcpu);
2358
2359         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2360         to_vmx(vcpu)->rflags = rflags;
2361         if (to_vmx(vcpu)->rmode.vm86_active) {
2362                 to_vmx(vcpu)->rmode.save_rflags = rflags;
2363                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2364         }
2365         vmcs_writel(GUEST_RFLAGS, rflags);
2366
2367         if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
2368                 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
2369 }
2370
2371 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2372 {
2373         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2374         int ret = 0;
2375
2376         if (interruptibility & GUEST_INTR_STATE_STI)
2377                 ret |= KVM_X86_SHADOW_INT_STI;
2378         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
2379                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
2380
2381         return ret;
2382 }
2383
2384 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
2385 {
2386         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2387         u32 interruptibility = interruptibility_old;
2388
2389         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
2390
2391         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2392                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
2393         else if (mask & KVM_X86_SHADOW_INT_STI)
2394                 interruptibility |= GUEST_INTR_STATE_STI;
2395
2396         if ((interruptibility != interruptibility_old))
2397                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
2398 }
2399
2400 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
2401 {
2402         unsigned long rip;
2403
2404         rip = kvm_rip_read(vcpu);
2405         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2406         kvm_rip_write(vcpu, rip);
2407
2408         /* skipping an emulated instruction also counts */
2409         vmx_set_interrupt_shadow(vcpu, 0);
2410 }
2411
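/*
 * Deliver an exception from L2 to L1 as an EXIT_REASON_EXCEPTION_NMI vmexit,
 * building the VM-exit interruption info (and error code, if any) that L1
 * expects to see.
 */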
2412 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
2413                                                unsigned long exit_qual)
2414 {
2415         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2416         unsigned int nr = vcpu->arch.exception.nr;
2417         u32 intr_info = nr | INTR_INFO_VALID_MASK;
2418
2419         if (vcpu->arch.exception.has_error_code) {
2420                 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
2421                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2422         }
2423
2424         if (kvm_exception_is_soft(nr))
2425                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2426         else
2427                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2428
2429         if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
2430             vmx_get_nmi_mask(vcpu))
2431                 intr_info |= INTR_INFO_UNBLOCK_NMI;
2432
2433         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
2434 }
2435
2436 /*
2437  * KVM wants to inject the page faults it received into the guest. This
2438  * function checks whether, in a nested guest, they should go to L1 or L2.
2439  */
2440 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
2441 {
2442         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2443         unsigned int nr = vcpu->arch.exception.nr;
2444
2445         if (nr == PF_VECTOR) {
2446                 if (vcpu->arch.exception.nested_apf) {
2447                         *exit_qual = vcpu->arch.apf.nested_apf_token;
2448                         return 1;
2449                 }
2450                 /*
2451                  * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
2452                  * The fix is to add the ancillary datum (CR2 or DR6) to structs
2453                  * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
2454                  * can be written only when inject_pending_event runs.  This should be
2455                  * conditional on a new capability---if the capability is disabled,
2456                  * kvm_multiple_exception would write the ancillary information to
2457                  * CR2 or DR6, for backwards ABI-compatibility.
2458                  */
2459                 if (nested_vmx_is_page_fault_vmexit(vmcs12,
2460                                                     vcpu->arch.exception.error_code)) {
2461                         *exit_qual = vcpu->arch.cr2;
2462                         return 1;
2463                 }
2464         } else {
2465                 if (vmcs12->exception_bitmap & (1u << nr)) {
2466                         if (nr == DB_VECTOR)
2467                                 *exit_qual = vcpu->arch.dr6;
2468                         else
2469                                 *exit_qual = 0;
2470                         return 1;
2471                 }
2472         }
2473
2474         return 0;
2475 }
2476
2477 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
2478 {
2479         struct vcpu_vmx *vmx = to_vmx(vcpu);
2480         unsigned nr = vcpu->arch.exception.nr;
2481         bool has_error_code = vcpu->arch.exception.has_error_code;
2482         u32 error_code = vcpu->arch.exception.error_code;
2483         u32 intr_info = nr | INTR_INFO_VALID_MASK;
2484
2485         if (has_error_code) {
2486                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2487                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2488         }
2489
2490         if (vmx->rmode.vm86_active) {
2491                 int inc_eip = 0;
2492                 if (kvm_exception_is_soft(nr))
2493                         inc_eip = vcpu->arch.event_exit_inst_len;
2494                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
2495                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2496                 return;
2497         }
2498
2499         if (kvm_exception_is_soft(nr)) {
2500                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2501                              vmx->vcpu.arch.event_exit_inst_len);
2502                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2503         } else
2504                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2505
2506         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2507 }
2508
2509 static bool vmx_rdtscp_supported(void)
2510 {
2511         return cpu_has_vmx_rdtscp();
2512 }
2513
2514 static bool vmx_invpcid_supported(void)
2515 {
2516         return cpu_has_vmx_invpcid() && enable_ept;
2517 }
2518
2519 /*
2520  * Swap MSR entry in host/guest MSR entry array.
2521  */
2522 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2523 {
2524         struct shared_msr_entry tmp;
2525
2526         tmp = vmx->guest_msrs[to];
2527         vmx->guest_msrs[to] = vmx->guest_msrs[from];
2528         vmx->guest_msrs[from] = tmp;
2529 }
2530
2531 /*
2532  * Set up the vmcs to automatically save and restore system
2533  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
2534  * mode, as fiddling with msrs is very expensive.
2535  */
2536 static void setup_msrs(struct vcpu_vmx *vmx)
2537 {
2538         int save_nmsrs, index;
2539
2540         save_nmsrs = 0;
2541 #ifdef CONFIG_X86_64
2542         if (is_long_mode(&vmx->vcpu)) {
2543                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2544                 if (index >= 0)
2545                         move_msr_up(vmx, index, save_nmsrs++);
2546                 index = __find_msr_index(vmx, MSR_LSTAR);
2547                 if (index >= 0)
2548                         move_msr_up(vmx, index, save_nmsrs++);
2549                 index = __find_msr_index(vmx, MSR_CSTAR);
2550                 if (index >= 0)
2551                         move_msr_up(vmx, index, save_nmsrs++);
2552                 index = __find_msr_index(vmx, MSR_TSC_AUX);
2553                 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
2554                         move_msr_up(vmx, index, save_nmsrs++);
2555                 /*
2556                  * MSR_STAR is only needed on long mode guests, and only
2557                  * if efer.sce is enabled.
2558                  */
2559                 index = __find_msr_index(vmx, MSR_STAR);
2560                 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
2561                         move_msr_up(vmx, index, save_nmsrs++);
2562         }
2563 #endif
2564         index = __find_msr_index(vmx, MSR_EFER);
2565         if (index >= 0 && update_transition_efer(vmx, index))
2566                 move_msr_up(vmx, index, save_nmsrs++);
2567
2568         vmx->save_nmsrs = save_nmsrs;
2569
2570         if (cpu_has_vmx_msr_bitmap())
2571                 vmx_update_msr_bitmap(&vmx->vcpu);
2572 }
2573
2574 /*
2575  * reads and returns guest's timestamp counter "register"
2576  * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
2577  * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
2578  */
2579 static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
2580 {
2581         u64 host_tsc, tsc_offset;
2582
2583         host_tsc = rdtsc();
2584         tsc_offset = vmcs_read64(TSC_OFFSET);
2585         return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
2586 }
2587
2588 /*
2589  * writes 'offset' into guest's timestamp counter offset register
2590  */
2591 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2592 {
2593         if (is_guest_mode(vcpu)) {
2594                 /*
2595                  * We're here if L1 chose not to trap WRMSR to TSC. According
2596                  * to the spec, this should set L1's TSC; The offset that L1
2597                  * to the spec, this should set L1's TSC; the offset that L1
2598                  * to the newly set TSC to get L2's TSC.
2599                  */
2600                 struct vmcs12 *vmcs12;
2601                 /* recalculate vmcs02.TSC_OFFSET: */
2602                 vmcs12 = get_vmcs12(vcpu);
2603                 vmcs_write64(TSC_OFFSET, offset +
2604                         (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2605                          vmcs12->tsc_offset : 0));
2606         } else {
2607                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2608                                            vmcs_read64(TSC_OFFSET), offset);
2609                 vmcs_write64(TSC_OFFSET, offset);
2610         }
2611 }
2612
2613 /*
2614  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2615  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2616  * all guests if the "nested" module option is off, and can also be disabled
2617  * for a single guest by disabling its VMX cpuid bit.
2618  */
2619 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2620 {
2621         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
2622 }
2623
2624 /*
2625  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
2626  * returned for the various VMX controls MSRs when nested VMX is enabled.
2627  * The same values should also be used to verify that vmcs12 control fields are
2628  * valid during nested entry from L1 to L2.
2629  * Each of these control msrs has a low and high 32-bit half: A low bit is on
2630  * if the corresponding bit in the (32-bit) control field *must* be on, and a
2631  * bit in the high half is on if the corresponding bit in the control field
2632  * may be on. See also vmx_control_verify().
2633  */
2634 static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2635 {
2636         /*
2637          * Note that as a general rule, the high half of the MSRs (bits in
2638          * the control fields which may be 1) should be initialized by the
2639          * intersection of the underlying hardware's MSR (i.e., features which
2640          * can be supported) and the list of features we want to expose -
2641          * because they are known to be properly supported in our code.
2642          * Also, usually, the low half of the MSRs (bits which must be 1) can
2643          * be set to 0, meaning that L1 may turn off any of these bits. The
2644          * reason is that if one of these bits is necessary, it will appear
2645          * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
2646          * fields of vmcs01 and vmcs12, will never turn these bits off - and
2647          * nested_vmx_exit_reflected() will not pass related exits to L1.
2648          * These rules have exceptions below.
2649          */
2650
2651         /* pin-based controls */
2652         rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2653                 vmx->nested.nested_vmx_pinbased_ctls_low,
2654                 vmx->nested.nested_vmx_pinbased_ctls_high);
2655         vmx->nested.nested_vmx_pinbased_ctls_low |=
2656                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2657         vmx->nested.nested_vmx_pinbased_ctls_high &=
2658                 PIN_BASED_EXT_INTR_MASK |
2659                 PIN_BASED_NMI_EXITING |
2660                 PIN_BASED_VIRTUAL_NMIS;
2661         vmx->nested.nested_vmx_pinbased_ctls_high |=
2662                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2663                 PIN_BASED_VMX_PREEMPTION_TIMER;
2664         if (kvm_vcpu_apicv_active(&vmx->vcpu))
2665                 vmx->nested.nested_vmx_pinbased_ctls_high |=
2666                         PIN_BASED_POSTED_INTR;
2667
2668         /* exit controls */
2669         rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2670                 vmx->nested.nested_vmx_exit_ctls_low,
2671                 vmx->nested.nested_vmx_exit_ctls_high);
2672         vmx->nested.nested_vmx_exit_ctls_low =
2673                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2674
2675         vmx->nested.nested_vmx_exit_ctls_high &=
2676 #ifdef CONFIG_X86_64
2677                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2678 #endif
2679                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2680         vmx->nested.nested_vmx_exit_ctls_high |=
2681                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2682                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2683                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2684
2685         if (kvm_mpx_supported())
2686                 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2687
2688         /* We support free control of debug control saving. */
2689         vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2690
2691         /* entry controls */
2692         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2693                 vmx->nested.nested_vmx_entry_ctls_low,
2694                 vmx->nested.nested_vmx_entry_ctls_high);
2695         vmx->nested.nested_vmx_entry_ctls_low =
2696                 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2697         vmx->nested.nested_vmx_entry_ctls_high &=
2698 #ifdef CONFIG_X86_64
2699                 VM_ENTRY_IA32E_MODE |
2700 #endif
2701                 VM_ENTRY_LOAD_IA32_PAT;
2702         vmx->nested.nested_vmx_entry_ctls_high |=
2703                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
2704         if (kvm_mpx_supported())
2705                 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2706
2707         /* We support free control of debug control loading. */
2708         vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2709
2710         /* cpu-based controls */
2711         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2712                 vmx->nested.nested_vmx_procbased_ctls_low,
2713                 vmx->nested.nested_vmx_procbased_ctls_high);
2714         vmx->nested.nested_vmx_procbased_ctls_low =
2715                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2716         vmx->nested.nested_vmx_procbased_ctls_high &=
2717                 CPU_BASED_VIRTUAL_INTR_PENDING |
2718                 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2719                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
2720                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
2721                 CPU_BASED_CR3_STORE_EXITING |
2722 #ifdef CONFIG_X86_64
2723                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
2724 #endif
2725                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2726                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
2727                 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
2728                 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
2729                 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2730         /*
2731          * We can allow some features even when not supported by the
2732          * hardware. For example, L1 can specify an MSR bitmap - and we
2733          * can use it to avoid exits to L1 - even when L0 runs L2
2734          * without MSR bitmaps.
2735          */
2736         vmx->nested.nested_vmx_procbased_ctls_high |=
2737                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2738                 CPU_BASED_USE_MSR_BITMAPS;
2739
2740         /* We support free control of CR3 access interception. */
2741         vmx->nested.nested_vmx_procbased_ctls_low &=
2742                 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2743
2744         /*
2745          * secondary cpu-based controls.  Do not include those that
2746          * depend on CPUID bits, they are added later by vmx_cpuid_update.
2747          */
2748         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2749                 vmx->nested.nested_vmx_secondary_ctls_low,
2750                 vmx->nested.nested_vmx_secondary_ctls_high);
2751         vmx->nested.nested_vmx_secondary_ctls_low = 0;
2752         vmx->nested.nested_vmx_secondary_ctls_high &=
2753                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2754                 SECONDARY_EXEC_DESC |
2755                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2756                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2757                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2758                 SECONDARY_EXEC_WBINVD_EXITING;
2759
2760         if (enable_ept) {
2761                 /* nested EPT: emulate EPT also to L1 */
2762                 vmx->nested.nested_vmx_secondary_ctls_high |=
2763                         SECONDARY_EXEC_ENABLE_EPT;
2764                 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2765                          VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
2766                 if (cpu_has_vmx_ept_execute_only())
2767                         vmx->nested.nested_vmx_ept_caps |=
2768                                 VMX_EPT_EXECUTE_ONLY_BIT;
2769                 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
2770                 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2771                         VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
2772                         VMX_EPT_1GB_PAGE_BIT;
2773                 if (enable_ept_ad_bits) {
2774                         vmx->nested.nested_vmx_secondary_ctls_high |=
2775                                 SECONDARY_EXEC_ENABLE_PML;
2776                         vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
2777                 }
2778         }
2779
2780         if (cpu_has_vmx_vmfunc()) {
2781                 vmx->nested.nested_vmx_secondary_ctls_high |=
2782                         SECONDARY_EXEC_ENABLE_VMFUNC;
2783                 /*
2784                  * Advertise EPTP switching unconditionally
2785                  * since we emulate it
2786                  */
2787                 if (enable_ept)
2788                         vmx->nested.nested_vmx_vmfunc_controls =
2789                                 VMX_VMFUNC_EPTP_SWITCHING;
2790         }
2791
2792         /*
2793          * Old versions of KVM use the single-context version without
2794          * checking for support, so declare that it is supported even
2795          * though it is treated as global context.  The alternative is
2796          * not failing the single-context invvpid, and it is worse.
2797          */
2798         if (enable_vpid) {
2799                 vmx->nested.nested_vmx_secondary_ctls_high |=
2800                         SECONDARY_EXEC_ENABLE_VPID;
2801                 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
2802                         VMX_VPID_EXTENT_SUPPORTED_MASK;
2803         }
2804
2805         if (enable_unrestricted_guest)
2806                 vmx->nested.nested_vmx_secondary_ctls_high |=
2807                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
2808
2809         /* miscellaneous data */
2810         rdmsr(MSR_IA32_VMX_MISC,
2811                 vmx->nested.nested_vmx_misc_low,
2812                 vmx->nested.nested_vmx_misc_high);
2813         vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
2814         vmx->nested.nested_vmx_misc_low |=
2815                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2816                 VMX_MISC_ACTIVITY_HLT;
2817         vmx->nested.nested_vmx_misc_high = 0;
2818
2819         /*
2820          * This MSR reports some information about VMX support. We
2821          * should return information about the VMX we emulate for the
2822          * guest, and the VMCS structure we give it - not about the
2823          * VMX support of the underlying hardware.
2824          */
2825         vmx->nested.nested_vmx_basic =
2826                 VMCS12_REVISION |
2827                 VMX_BASIC_TRUE_CTLS |
2828                 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
2829                 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
2830
2831         if (cpu_has_vmx_basic_inout())
2832                 vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;
2833
2834         /*
2835          * These MSRs specify bits which the guest must keep fixed on
2836          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
2837          * We picked the standard core2 setting.
2838          */
2839 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
2840 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
2841         vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
2842         vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
2843
2844         /* These MSRs specify bits which the guest must keep fixed off. */
2845         rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1);
2846         rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
2847
2848         /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2849         vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
2850 }
2851
2852 /*
2853  * if fixed0[i] == 1: val[i] must be 1
2854  * if fixed1[i] == 0: val[i] must be 0
2855  */
2856 static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
2857 {
2858         return ((val & fixed1) | fixed0) == val;
2859 }
2860
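/*
 * A VMX capability MSR encodes, in its low 32 bits, the control bits that
 * must be 1 and, in its high 32 bits, the control bits that may be 1; a
 * control value is therefore valid iff it is a superset of the low half and
 * a subset of the high half, which is exactly the fixed0/fixed1 check above.
 */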
2861 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
2862 {
2863         return fixed_bits_valid(control, low, high);
2864 }
2865
2866 static inline u64 vmx_control_msr(u32 low, u32 high)
2867 {
2868         return low | ((u64)high << 32);
2869 }
2870
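/* True iff, within @mask, every bit set in @subset is also set in @superset. */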
2871 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
2872 {
2873         superset &= mask;
2874         subset &= mask;
2875
2876         return (superset | subset) == superset;
2877 }
2878
2879 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
2880 {
2881         const u64 feature_and_reserved =
2882                 /* feature (except bit 48; see below) */
2883                 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
2884                 /* reserved */
2885                 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
2886         u64 vmx_basic = vmx->nested.nested_vmx_basic;
2887
2888         if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
2889                 return -EINVAL;
2890
2891         /*
2892          * KVM does not emulate a version of VMX that constrains physical
2893          * addresses of VMX structures (e.g. VMCS) to 32 bits.
2894          */
2895         if (data & BIT_ULL(48))
2896                 return -EINVAL;
2897
2898         if (vmx_basic_vmcs_revision_id(vmx_basic) !=
2899             vmx_basic_vmcs_revision_id(data))
2900                 return -EINVAL;
2901
2902         if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
2903                 return -EINVAL;
2904
2905         vmx->nested.nested_vmx_basic = data;
2906         return 0;
2907 }
2908
2909 static int
2910 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
2911 {
2912         u64 supported;
2913         u32 *lowp, *highp;
2914
2915         switch (msr_index) {
2916         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2917                 lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
2918                 highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
2919                 break;
2920         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2921                 lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
2922                 highp = &vmx->nested.nested_vmx_procbased_ctls_high;
2923                 break;
2924         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2925                 lowp = &vmx->nested.nested_vmx_exit_ctls_low;
2926                 highp = &vmx->nested.nested_vmx_exit_ctls_high;
2927                 break;
2928         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2929                 lowp = &vmx->nested.nested_vmx_entry_ctls_low;
2930                 highp = &vmx->nested.nested_vmx_entry_ctls_high;
2931                 break;
2932         case MSR_IA32_VMX_PROCBASED_CTLS2:
2933                 lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
2934                 highp = &vmx->nested.nested_vmx_secondary_ctls_high;
2935                 break;
2936         default:
2937                 BUG();
2938         }
2939
2940         supported = vmx_control_msr(*lowp, *highp);
2941
2942         /* Check must-be-1 bits are still 1. */
2943         if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
2944                 return -EINVAL;
2945
2946         /* Check must-be-0 bits are still 0. */
2947         if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
2948                 return -EINVAL;
2949
2950         *lowp = data;
2951         *highp = data >> 32;
2952         return 0;
2953 }
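
/*
 * For illustration (example values only): in this MSR class the low
 * 32 bits advertise the allowed 0-settings (a bit set there must be 1 in
 * the control) and the high 32 bits the allowed 1-settings (a bit clear
 * there must stay 0).  If the current value is vmx_control_msr(0x16, 0xff),
 * restoring vmx_control_msr(0x16, 0x7f) succeeds (fewer optional features),
 * while vmx_control_msr(0x06, 0xff) (a must-be-1 bit dropped) and
 * vmx_control_msr(0x16, 0x1ff) (a new allowed-1 bit added) are rejected.
 */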
2954
2955 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
2956 {
2957         const u64 feature_and_reserved_bits =
2958                 /* feature */
2959                 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
2960                 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
2961                 /* reserved */
2962                 GENMASK_ULL(13, 9) | BIT_ULL(31);
2963         u64 vmx_misc;
2964
2965         vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
2966                                    vmx->nested.nested_vmx_misc_high);
2967
2968         if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
2969                 return -EINVAL;
2970
2971         if ((vmx->nested.nested_vmx_pinbased_ctls_high &
2972              PIN_BASED_VMX_PREEMPTION_TIMER) &&
2973             vmx_misc_preemption_timer_rate(data) !=
2974             vmx_misc_preemption_timer_rate(vmx_misc))
2975                 return -EINVAL;
2976
2977         if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
2978                 return -EINVAL;
2979
2980         if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
2981                 return -EINVAL;
2982
2983         if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
2984                 return -EINVAL;
2985
2986         vmx->nested.nested_vmx_misc_low = data;
2987         vmx->nested.nested_vmx_misc_high = data >> 32;
2988         return 0;
2989 }
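
/*
 * In plain terms: userspace may lower the advertised CR3-target count or
 * MSR-list limit but not raise them, and the preemption-timer rate and
 * MSEG revision identifier must match the values KVM reported exactly.
 */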
2990
2991 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
2992 {
2993         u64 vmx_ept_vpid_cap;
2994
2995         vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
2996                                            vmx->nested.nested_vmx_vpid_caps);
2997
2998         /* Every bit is either reserved or a feature bit. */
2999         if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
3000                 return -EINVAL;
3001
3002         vmx->nested.nested_vmx_ept_caps = data;
3003         vmx->nested.nested_vmx_vpid_caps = data >> 32;
3004         return 0;
3005 }
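
/*
 * For illustration: since the mask is ~0ULL, userspace may only clear
 * EPT/VPID capability bits relative to what KVM reported (e.g. hide
 * VMX_EPT_2MB_PAGE_BIT); advertising a capability KVM did not expose
 * fails the subset check above.
 */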
3006
3007 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3008 {
3009         u64 *msr;
3010
3011         switch (msr_index) {
3012         case MSR_IA32_VMX_CR0_FIXED0:
3013                 msr = &vmx->nested.nested_vmx_cr0_fixed0;
3014                 break;
3015         case MSR_IA32_VMX_CR4_FIXED0:
3016                 msr = &vmx->nested.nested_vmx_cr4_fixed0;
3017                 break;
3018         default:
3019                 BUG();
3020         }
3021
3022         /*
3023          * Bits that are set here (i.e. bits that "must be 1" during VMX
3024          * operation) must also be set in the restored value.
3025          */
3026         if (!is_bitwise_subset(data, *msr, -1ULL))
3027                 return -EINVAL;
3028
3029         *msr = data;
3030         return 0;
3031 }
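
/*
 * For illustration: if the current CR0_FIXED0 value is PE | PG | NE (the
 * VMXON_CR0_ALWAYSON default set up above), userspace may restore a
 * stricter value such as PE | PG | NE | ET, but a value that drops PE
 * fails the subset check and returns -EINVAL.
 */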
3032
3033 /*
3034  * Called when userspace is restoring VMX MSRs.
3035  *
3036  * Returns 0 on success, non-0 otherwise.
3037  */
3038 static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
3039 {
3040         struct vcpu_vmx *vmx = to_vmx(vcpu);
3041
3042         switch (msr_index) {
3043         case MSR_IA32_VMX_BASIC:
3044                 return vmx_restore_vmx_basic(vmx, data);
3045         case MSR_IA32_VMX_PINBASED_CTLS:
3046         case MSR_IA32_VMX_PROCBASED_CTLS:
3047         case MSR_IA32_VMX_EXIT_CTLS:
3048         case MSR_IA32_VMX_ENTRY_CTLS:
3049                 /*
3050                  * The "non-true" VMX capability MSRs are generated from the
3051                  * "true" MSRs, so we do not support restoring them directly.
3052                  *
3053                  * If userspace wants to emulate VMX_BASIC[55]=0, userspace
3054                  * should restore the "true" MSRs with the must-be-1 bits
3055                  * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
3056                  * DEFAULT SETTINGS".
3057                  */
3058                 return -EINVAL;
3059         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3060         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3061         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3062         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3063         case MSR_IA32_VMX_PROCBASED_CTLS2:
3064                 return vmx_restore_control_msr(vmx, msr_index, data);
3065         case MSR_IA32_VMX_MISC:
3066                 return vmx_restore_vmx_misc(vmx, data);
3067         case MSR_IA32_VMX_CR0_FIXED0:
3068         case MSR_IA32_VMX_CR4_FIXED0:
3069                 return vmx_restore_fixed0_msr(vmx, msr_index, data);
3070         case MSR_IA32_VMX_CR0_FIXED1:
3071         case MSR_IA32_VMX_CR4_FIXED1:
3072                 /*
3073                  * These MSRs are generated based on the vCPU's CPUID, so we
3074                  * do not support restoring them directly.
3075                  */
3076                 return -EINVAL;
3077         case MSR_IA32_VMX_EPT_VPID_CAP:
3078                 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
3079         case MSR_IA32_VMX_VMCS_ENUM:
3080                 vmx->nested.nested_vmx_vmcs_enum = data;
3081                 return 0;
3082         default:
3083                 /*
3084                  * The rest of the VMX capability MSRs do not support restore.
3085                  */
3086                 return -EINVAL;
3087         }
3088 }
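
/*
 * A minimal userspace sketch of the restore path (illustrative only;
 * vcpu_fd and saved_true_pinbased_ctls are placeholders): the VMX
 * capability MSRs are written through the regular KVM_SET_MSRS vCPU
 * ioctl, which reaches vmx_set_vmx_msr() via vmx_set_msr() with
 * msr_info->host_initiated set:
 *
 *	struct {
 *		struct kvm_msrs hdr;
 *		struct kvm_msr_entry entry;
 *	} m = {
 *		.hdr.nmsrs   = 1,
 *		.entry.index = MSR_IA32_VMX_TRUE_PINBASED_CTLS,
 *		.entry.data  = saved_true_pinbased_ctls,
 *	};
 *	ioctl(vcpu_fd, KVM_SET_MSRS, &m);	(returns 1 on success)
 *
 * A WRMSR from the guest never gets here: vmx_set_msr() below rejects
 * guest writes to this MSR range because they are read-only to the guest.
 */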
3089
3090 /* Returns 0 on success, non-0 otherwise. */
3091 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
3092 {
3093         struct vcpu_vmx *vmx = to_vmx(vcpu);
3094
3095         switch (msr_index) {
3096         case MSR_IA32_VMX_BASIC:
3097                 *pdata = vmx->nested.nested_vmx_basic;
3098                 break;
3099         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3100         case MSR_IA32_VMX_PINBASED_CTLS:
3101                 *pdata = vmx_control_msr(
3102                         vmx->nested.nested_vmx_pinbased_ctls_low,
3103                         vmx->nested.nested_vmx_pinbased_ctls_high);
3104                 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
3105                         *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3106                 break;
3107         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3108         case MSR_IA32_VMX_PROCBASED_CTLS:
3109                 *pdata = vmx_control_msr(
3110                         vmx->nested.nested_vmx_procbased_ctls_low,
3111                         vmx->nested.nested_vmx_procbased_ctls_high);
3112                 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
3113                         *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3114                 break;
3115         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3116         case MSR_IA32_VMX_EXIT_CTLS:
3117                 *pdata = vmx_control_msr(
3118                         vmx->nested.nested_vmx_exit_ctls_low,
3119                         vmx->nested.nested_vmx_exit_ctls_high);
3120                 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
3121                         *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
3122                 break;
3123         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3124         case MSR_IA32_VMX_ENTRY_CTLS:
3125                 *pdata = vmx_control_msr(
3126                         vmx->nested.nested_vmx_entry_ctls_low,
3127                         vmx->nested.nested_vmx_entry_ctls_high);
3128                 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
3129                         *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
3130                 break;
3131         case MSR_IA32_VMX_MISC:
3132                 *pdata = vmx_control_msr(
3133                         vmx->nested.nested_vmx_misc_low,
3134                         vmx->nested.nested_vmx_misc_high);
3135                 break;
3136         case MSR_IA32_VMX_CR0_FIXED0:
3137                 *pdata = vmx->nested.nested_vmx_cr0_fixed0;
3138                 break;
3139         case MSR_IA32_VMX_CR0_FIXED1:
3140                 *pdata = vmx->nested.nested_vmx_cr0_fixed1;
3141                 break;
3142         case MSR_IA32_VMX_CR4_FIXED0:
3143                 *pdata = vmx->nested.nested_vmx_cr4_fixed0;
3144                 break;
3145         case MSR_IA32_VMX_CR4_FIXED1:
3146                 *pdata = vmx->nested.nested_vmx_cr4_fixed1;
3147                 break;
3148         case MSR_IA32_VMX_VMCS_ENUM:
3149                 *pdata = vmx->nested.nested_vmx_vmcs_enum;
3150                 break;
3151         case MSR_IA32_VMX_PROCBASED_CTLS2:
3152                 *pdata = vmx_control_msr(
3153                         vmx->nested.nested_vmx_secondary_ctls_low,
3154                         vmx->nested.nested_vmx_secondary_ctls_high);
3155                 break;
3156         case MSR_IA32_VMX_EPT_VPID_CAP:
3157                 *pdata = vmx->nested.nested_vmx_ept_caps |
3158                         ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
3159                 break;
3160         case MSR_IA32_VMX_VMFUNC:
3161                 *pdata = vmx->nested.nested_vmx_vmfunc_controls;
3162                 break;
3163         default:
3164                 return 1;
3165         }
3166
3167         return 0;
3168 }
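
/*
 * Note on the read side: the non-TRUE control MSRs above are derived from
 * the TRUE variants by OR-ing in the *_ALWAYSON_WITHOUT_TRUE_MSR bits, so
 * e.g. MSR_IA32_VMX_PINBASED_CTLS always reports the default1 controls as
 * must-be-1 even when the TRUE MSR would allow them to be relaxed.
 */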
3169
3170 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
3171                                                  uint64_t val)
3172 {
3173         uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
3174
3175         return !(val & ~valid_bits);
3176 }
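
/*
 * For illustration: if valid_bits is FEATURE_CONTROL_LOCKED |
 * FEATURE_CONTROL_LMCE, a write of FEATURE_CONTROL_LOCKED passes this
 * check, while any write that sets a bit outside those two is rejected.
 */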
3177
3178 /*
3179  * Reads the MSR specified by msr_info->index into msr_info->data.
3180  * Returns 0 on success, non-0 otherwise.
3181  * Assumes vcpu_load() was already called.
3182  */
3183 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3184 {
3185         struct vcpu_vmx *vmx = to_vmx(vcpu);
3186         struct shared_msr_entry *msr;
3187
3188         switch (msr_info->index) {
3189 #ifdef CONFIG_X86_64
3190         case MSR_FS_BASE:
3191                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
3192                 break;
3193         case MSR_GS_BASE:
3194                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
3195                 break;
3196         case MSR_KERNEL_GS_BASE:
3197                 vmx_load_host_state(vmx);
3198                 msr_info->data = vmx->msr_guest_kernel_gs_base;
3199                 break;
3200 #endif
3201         case MSR_EFER:
3202                 return kvm_get_msr_common(vcpu, msr_info);
3203         case MSR_IA32_TSC:
3204                 msr_info->data = guest_read_tsc(vcpu);
3205                 break;
3206         case MSR_IA32_SYSENTER_CS:
3207                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
3208                 break;
3209         case MSR_IA32_SYSENTER_EIP:
3210                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
3211                 break;
3212         case MSR_IA32_SYSENTER_ESP:
3213                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
3214                 break;
3215         case MSR_IA32_BNDCFGS:
3216                 if (!kvm_mpx_supported() ||
3217                     (!msr_info->host_initiated &&
3218                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
3219                         return 1;
3220                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
3221                 break;
3222         case MSR_IA32_MCG_EXT_CTL:
3223                 if (!msr_info->host_initiated &&
3224                     !(vmx->msr_ia32_feature_control &
3225                       FEATURE_CONTROL_LMCE))
3226                         return 1;
3227                 msr_info->data = vcpu->arch.mcg_ext_ctl;
3228                 break;
3229         case MSR_IA32_FEATURE_CONTROL:
3230                 msr_info->data = vmx->msr_ia32_feature_control;
3231                 break;
3232         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3233                 if (!nested_vmx_allowed(vcpu))
3234                         return 1;
3235                 return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
3236         case MSR_IA32_XSS:
3237                 if (!vmx_xsaves_supported())
3238                         return 1;
3239                 msr_info->data = vcpu->arch.ia32_xss;
3240                 break;
3241         case MSR_TSC_AUX:
3242                 if (!msr_info->host_initiated &&
3243                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
3244                         return 1;
3245                 /* Otherwise falls through */
3246         default:
3247                 msr = find_msr_entry(vmx, msr_info->index);
3248                 if (msr) {
3249                         msr_info->data = msr->data;
3250                         break;
3251                 }
3252                 return kvm_get_msr_common(vcpu, msr_info);
3253         }
3254
3255         return 0;
3256 }
3257
3258 static void vmx_leave_nested(struct kvm_vcpu *vcpu);
3259
3260 /*
3261  * Writes msr value into the appropriate "register".
3262  * Returns 0 on success, non-0 otherwise.
3263  * Assumes vcpu_load() was already called.
3264  */
3265 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3266 {
3267         struct vcpu_vmx *vmx = to_vmx(vcpu);
3268         struct shared_msr_entry *msr;
3269         int ret = 0;
3270         u32 msr_index = msr_info->index;
3271         u64 data = msr_info->data;
3272
3273         switch (msr_index) {
3274         case MSR_EFER:
3275                 ret = kvm_set_msr_common(vcpu, msr_info);
3276                 break;
3277 #ifdef CONFIG_X86_64
3278         case MSR_FS_BASE:
3279                 vmx_segment_cache_clear(vmx);
3280                 vmcs_writel(GUEST_FS_BASE, data);
3281                 break;
3282         case MSR_GS_BASE:
3283                 vmx_segment_cache_clear(vmx);
3284                 vmcs_writel(GUEST_GS_BASE, data);
3285                 break;
3286         case MSR_KERNEL_GS_BASE:
3287                 vmx_load_host_state(vmx);
3288                 vmx->msr_guest_kernel_gs_base = data;
3289                 break;
3290 #endif
3291         case MSR_IA32_SYSENTER_CS:
3292                 vmcs_write32(GUEST_SYSENTER_CS, data);
3293                 break;
3294         case MSR_IA32_SYSENTER_EIP:
3295                 vmcs_writel(GUEST_SYSENTER_EIP, data);
3296                 break;
3297         case MSR_IA32_SYSENTER_ESP:
3298                 vmcs_writel(GUEST_SYSENTER_ESP, data);
3299                 break;
3300         case MSR_IA32_BNDCFGS:
3301                 if (!kvm_mpx_supported() ||
3302                     (!msr_info->host_initiated &&
3303                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
3304                         return 1;
3305                 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
3306                     (data & MSR_IA32_BNDCFGS_RSVD))
3307                         return 1;
3308                 vmcs_write64(GUEST_BNDCFGS, data);
3309                 break;
3310         case MSR_IA32_TSC:
3311                 kvm_write_tsc(vcpu, msr_info);
3312                 break;
3313         case MSR_IA32_CR_PAT:
3314                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3315                         if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
3316                                 return 1;
3317                         vmcs_write64(GUEST_IA32_PAT, data);
3318                         vcpu->arch.pat = data;
3319                         break;
3320                 }
3321                 ret = kvm_set_msr_common(vcpu, msr_info);
3322                 break;
3323         case MSR_IA32_TSC_ADJUST:
3324                 ret = kvm_set_msr_common(vcpu, msr_info);
3325                 break;
3326         case MSR_IA32_MCG_EXT_CTL:
3327                 if ((!msr_info->host_initiated &&
3328                      !(to_vmx(vcpu)->msr_ia32_feature_control &
3329                        FEATURE_CONTROL_LMCE)) ||
3330                     (data & ~MCG_EXT_CTL_LMCE_EN))
3331                         return 1;
3332                 vcpu->arch.mcg_ext_ctl = data;
3333                 break;
3334         case MSR_IA32_FEATURE_CONTROL:
3335                 if (!vmx_feature_control_msr_valid(vcpu, data) ||
3336                     (to_vmx(vcpu)->msr_ia32_feature_control &
3337                      FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
3338                         return 1;
3339                 vmx->msr_ia32_feature_control = data;
3340                 if (msr_info->host_initiated && data == 0)
3341                         vmx_leave_nested(vcpu);
3342                 break;
3343         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3344                 if (!msr_info->host_initiated)
3345                         return 1; /* they are read-only */
3346                 if (!nested_vmx_allowed(vcpu))
3347                         return 1;
3348                 return vmx_set_vmx_msr(vcpu, msr_index, data);
3349         case MSR_IA32_XSS:
3350                 if (!vmx_xsaves_supported())
3351                         return 1;
3352                 /*
3353                  * The only bit supported by hardware as of Skylake is
3354                  * bit 8, which KVM does not support; only 0 is accepted.
3355                  */
3356                 if (data != 0)
3357                         return 1;
3358                 vcpu->arch.ia32_xss = data;
3359                 if (vcpu->arch.ia32_xss != host_xss)
3360                         add_atomic_switch_msr(vmx, MSR_IA32_XSS,
3361                                 vcpu->arch.ia32_xss, host_xss);
3362                 else
3363                         clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
3364                 break;
3365         case MSR_TSC_AUX:
3366                 if (!msr_info->host_initiated &&
3367                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
3368                         return 1;
3369                 /* Check reserved bits: the upper 32 bits must be zero */
3370                 if ((data >> 32) != 0)
3371                         return 1;
3372                 /* Otherwise falls through */
3373         default:
3374                 msr = find_msr_entry(vmx, msr_index);
3375                 if (msr) {
3376                         u64 old_msr_data = msr->data;
3377                         msr->data = data;
3378                         if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
3379                                 preempt_disable();
3380                                 ret = kvm_set_shared_msr(msr->index, msr->data,
3381                                                          msr->mask);
3382                                 preempt_enable();
3383                                 if (ret)
3384                                         msr->data = old_msr_data;
3385                         }
3386                         break;
3387                 }
3388                 ret = kvm_set_msr_common(vcpu, msr_info);
3389         }
3390
3391         return ret;
3392 }
3393
3394 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
3395 {
3396         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);