arch/x86/kvm/vmx.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *
14  * This work is licensed under the terms of the GNU GPL, version 2.  See
15  * the COPYING file in the top-level directory.
16  *
17  */
18
19 #include "irq.h"
20 #include "mmu.h"
21 #include "cpuid.h"
22 #include "lapic.h"
23
24 #include <linux/kvm_host.h>
25 #include <linux/module.h>
26 #include <linux/kernel.h>
27 #include <linux/mm.h>
28 #include <linux/highmem.h>
29 #include <linux/sched.h>
30 #include <linux/moduleparam.h>
31 #include <linux/mod_devicetable.h>
32 #include <linux/trace_events.h>
33 #include <linux/slab.h>
34 #include <linux/tboot.h>
35 #include <linux/hrtimer.h>
36 #include <linux/frame.h>
37 #include "kvm_cache_regs.h"
38 #include "x86.h"
39
40 #include <asm/cpu.h>
41 #include <asm/io.h>
42 #include <asm/desc.h>
43 #include <asm/vmx.h>
44 #include <asm/virtext.h>
45 #include <asm/mce.h>
46 #include <asm/fpu/internal.h>
47 #include <asm/perf_event.h>
48 #include <asm/debugreg.h>
49 #include <asm/kexec.h>
50 #include <asm/apic.h>
51 #include <asm/irq_remapping.h>
52 #include <asm/mmu_context.h>
53
54 #include "trace.h"
55 #include "pmu.h"
56
57 #define __ex(x) __kvm_handle_fault_on_reboot(x)
58 #define __ex_clear(x, reg) \
59         ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
60
61 MODULE_AUTHOR("Qumranet");
62 MODULE_LICENSE("GPL");
63
64 static const struct x86_cpu_id vmx_cpu_id[] = {
65         X86_FEATURE_MATCH(X86_FEATURE_VMX),
66         {}
67 };
68 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
69
70 static bool __read_mostly enable_vpid = 1;
71 module_param_named(vpid, enable_vpid, bool, 0444);
72
73 static bool __read_mostly flexpriority_enabled = 1;
74 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
75
76 static bool __read_mostly enable_ept = 1;
77 module_param_named(ept, enable_ept, bool, S_IRUGO);
78
79 static bool __read_mostly enable_unrestricted_guest = 1;
80 module_param_named(unrestricted_guest,
81                         enable_unrestricted_guest, bool, S_IRUGO);
82
83 static bool __read_mostly enable_ept_ad_bits = 1;
84 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
85
86 static bool __read_mostly emulate_invalid_guest_state = true;
87 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
88
89 static bool __read_mostly fasteoi = 1;
90 module_param(fasteoi, bool, S_IRUGO);
91
92 static bool __read_mostly enable_apicv = 1;
93 module_param(enable_apicv, bool, S_IRUGO);
94
95 static bool __read_mostly enable_shadow_vmcs = 1;
96 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
97 /*
98  * If nested=1, nested virtualization is supported, i.e., guests may use
99  * VMX and act as hypervisors for their own guests. If nested=0, guests may
100  * not use VMX instructions.
101  */
102 static bool __read_mostly nested = 0;
103 module_param(nested, bool, S_IRUGO);
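/*
 * Illustrative usage note (not part of the driver): with the declaration
 * above, nested support is off by default and is typically switched on at
 * module load time, e.g. "modprobe kvm_intel nested=1"; the current value is
 * then visible read-only under /sys/module/kvm_intel/parameters/nested.
 */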
104
105 static u64 __read_mostly host_xss;
106
107 static bool __read_mostly enable_pml = 1;
108 module_param_named(pml, enable_pml, bool, S_IRUGO);
109
110 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
111
112 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
113 static int __read_mostly cpu_preemption_timer_multi;
114 static bool __read_mostly enable_preemption_timer = 1;
115 #ifdef CONFIG_X86_64
116 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
117 #endif
118
119 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
120 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
121 #define KVM_VM_CR0_ALWAYS_ON                                            \
122         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
123 #define KVM_CR4_GUEST_OWNED_BITS                                      \
124         (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
125          | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)
126
127 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
128 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
129
130 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
131
132 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
133
134 /*
135  * Hyper-V requires all of these, so mark them as supported even though
136  * they are just treated the same as all-context.
137  */
138 #define VMX_VPID_EXTENT_SUPPORTED_MASK          \
139         (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
140         VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
141         VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
142         VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
143
144 /*
145  * These two parameters configure the controls for Pause-Loop Exiting:
146  * ple_gap:    upper bound on the amount of time between two successive
147  *             executions of PAUSE in a loop. Also indicates whether PLE is
148  *             enabled. According to tests, this time is usually smaller
149  *             than 128 cycles.
150  * ple_window: upper bound on the amount of time a guest is allowed to execute
151  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
152  *             less than 2^12 cycles.
153  * Time is measured on a counter that runs at the same rate as the TSC;
154  * refer to SDM volume 3B, sections 21.6.13 and 22.1.3.
154  */
155 #define KVM_VMX_DEFAULT_PLE_GAP           128
156 #define KVM_VMX_DEFAULT_PLE_WINDOW        4096
157 #define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
158 #define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
159 #define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
160                 INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
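/*
 * Worked example with the defaults above (illustration only): each PLE exit
 * scales an aggressively PAUSE-looping vcpu's window by
 * KVM_VMX_DEFAULT_PLE_WINDOW_GROW, i.e. 4096 -> 8192 -> 16384 -> ..., and
 * capping the window at INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW guarantees
 * the multiplication can never overflow an int.  A shrink value of 0 simply
 * resets the window back to ple_window.
 */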
161
162 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
163 module_param(ple_gap, int, S_IRUGO);
164
165 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
166 module_param(ple_window, int, S_IRUGO);
167
168 /* Default doubles per-vcpu window every exit. */
169 static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
170 module_param(ple_window_grow, int, S_IRUGO);
171
172 /* Default resets per-vcpu window every exit to ple_window. */
173 static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
174 module_param(ple_window_shrink, int, S_IRUGO);
175
176 /* Default is to compute the maximum so we can never overflow. */
177 static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
178 static int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
179 module_param(ple_window_max, int, S_IRUGO);
180
181 extern const ulong vmx_return;
182
183 #define NR_AUTOLOAD_MSRS 8
184 #define VMCS02_POOL_SIZE 1
185
186 struct vmcs {
187         u32 revision_id;
188         u32 abort;
189         char data[0];
190 };
191
192 /*
193  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
194  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
195  * loaded on this CPU (so we can clear them if the CPU goes down).
196  */
197 struct loaded_vmcs {
198         struct vmcs *vmcs;
199         struct vmcs *shadow_vmcs;
200         int cpu;
201         bool launched;
202         bool nmi_known_unmasked;
203         struct list_head loaded_vmcss_on_cpu_link;
204 };
205
206 struct shared_msr_entry {
207         unsigned index;
208         u64 data;
209         u64 mask;
210 };
211
212 /*
213  * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
214  * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
215  * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
216  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
217  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
218  * More than one of these structures may exist, if L1 runs multiple L2 guests.
219  * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
220  * underlying hardware which will be used to run L2.
221  * This structure is packed to ensure that its layout is identical across
222  * machines (necessary for live migration).
223  * If there are changes in this struct, VMCS12_REVISION must be changed.
224  */
225 typedef u64 natural_width;
226 struct __packed vmcs12 {
227         /* According to the Intel spec, a VMCS region must start with the
228          * following two fields. Then follow implementation-specific data.
229          */
230         u32 revision_id;
231         u32 abort;
232
233         u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
234         u32 padding[7]; /* room for future expansion */
235
236         u64 io_bitmap_a;
237         u64 io_bitmap_b;
238         u64 msr_bitmap;
239         u64 vm_exit_msr_store_addr;
240         u64 vm_exit_msr_load_addr;
241         u64 vm_entry_msr_load_addr;
242         u64 tsc_offset;
243         u64 virtual_apic_page_addr;
244         u64 apic_access_addr;
245         u64 posted_intr_desc_addr;
246         u64 ept_pointer;
247         u64 eoi_exit_bitmap0;
248         u64 eoi_exit_bitmap1;
249         u64 eoi_exit_bitmap2;
250         u64 eoi_exit_bitmap3;
251         u64 xss_exit_bitmap;
252         u64 guest_physical_address;
253         u64 vmcs_link_pointer;
254         u64 pml_address;
255         u64 guest_ia32_debugctl;
256         u64 guest_ia32_pat;
257         u64 guest_ia32_efer;
258         u64 guest_ia32_perf_global_ctrl;
259         u64 guest_pdptr0;
260         u64 guest_pdptr1;
261         u64 guest_pdptr2;
262         u64 guest_pdptr3;
263         u64 guest_bndcfgs;
264         u64 host_ia32_pat;
265         u64 host_ia32_efer;
266         u64 host_ia32_perf_global_ctrl;
267         u64 padding64[8]; /* room for future expansion */
268         /*
269          * To allow migration of L1 (complete with its L2 guests) between
270          * machines of different natural widths (32 or 64 bit), we cannot have
271          * unsigned long fields with no explicit size. We use u64 (aliased
272          * natural_width) instead. Luckily, x86 is little-endian.
273          */
274         natural_width cr0_guest_host_mask;
275         natural_width cr4_guest_host_mask;
276         natural_width cr0_read_shadow;
277         natural_width cr4_read_shadow;
278         natural_width cr3_target_value0;
279         natural_width cr3_target_value1;
280         natural_width cr3_target_value2;
281         natural_width cr3_target_value3;
282         natural_width exit_qualification;
283         natural_width guest_linear_address;
284         natural_width guest_cr0;
285         natural_width guest_cr3;
286         natural_width guest_cr4;
287         natural_width guest_es_base;
288         natural_width guest_cs_base;
289         natural_width guest_ss_base;
290         natural_width guest_ds_base;
291         natural_width guest_fs_base;
292         natural_width guest_gs_base;
293         natural_width guest_ldtr_base;
294         natural_width guest_tr_base;
295         natural_width guest_gdtr_base;
296         natural_width guest_idtr_base;
297         natural_width guest_dr7;
298         natural_width guest_rsp;
299         natural_width guest_rip;
300         natural_width guest_rflags;
301         natural_width guest_pending_dbg_exceptions;
302         natural_width guest_sysenter_esp;
303         natural_width guest_sysenter_eip;
304         natural_width host_cr0;
305         natural_width host_cr3;
306         natural_width host_cr4;
307         natural_width host_fs_base;
308         natural_width host_gs_base;
309         natural_width host_tr_base;
310         natural_width host_gdtr_base;
311         natural_width host_idtr_base;
312         natural_width host_ia32_sysenter_esp;
313         natural_width host_ia32_sysenter_eip;
314         natural_width host_rsp;
315         natural_width host_rip;
316         natural_width paddingl[8]; /* room for future expansion */
317         u32 pin_based_vm_exec_control;
318         u32 cpu_based_vm_exec_control;
319         u32 exception_bitmap;
320         u32 page_fault_error_code_mask;
321         u32 page_fault_error_code_match;
322         u32 cr3_target_count;
323         u32 vm_exit_controls;
324         u32 vm_exit_msr_store_count;
325         u32 vm_exit_msr_load_count;
326         u32 vm_entry_controls;
327         u32 vm_entry_msr_load_count;
328         u32 vm_entry_intr_info_field;
329         u32 vm_entry_exception_error_code;
330         u32 vm_entry_instruction_len;
331         u32 tpr_threshold;
332         u32 secondary_vm_exec_control;
333         u32 vm_instruction_error;
334         u32 vm_exit_reason;
335         u32 vm_exit_intr_info;
336         u32 vm_exit_intr_error_code;
337         u32 idt_vectoring_info_field;
338         u32 idt_vectoring_error_code;
339         u32 vm_exit_instruction_len;
340         u32 vmx_instruction_info;
341         u32 guest_es_limit;
342         u32 guest_cs_limit;
343         u32 guest_ss_limit;
344         u32 guest_ds_limit;
345         u32 guest_fs_limit;
346         u32 guest_gs_limit;
347         u32 guest_ldtr_limit;
348         u32 guest_tr_limit;
349         u32 guest_gdtr_limit;
350         u32 guest_idtr_limit;
351         u32 guest_es_ar_bytes;
352         u32 guest_cs_ar_bytes;
353         u32 guest_ss_ar_bytes;
354         u32 guest_ds_ar_bytes;
355         u32 guest_fs_ar_bytes;
356         u32 guest_gs_ar_bytes;
357         u32 guest_ldtr_ar_bytes;
358         u32 guest_tr_ar_bytes;
359         u32 guest_interruptibility_info;
360         u32 guest_activity_state;
361         u32 guest_sysenter_cs;
362         u32 host_ia32_sysenter_cs;
363         u32 vmx_preemption_timer_value;
364         u32 padding32[7]; /* room for future expansion */
365         u16 virtual_processor_id;
366         u16 posted_intr_nv;
367         u16 guest_es_selector;
368         u16 guest_cs_selector;
369         u16 guest_ss_selector;
370         u16 guest_ds_selector;
371         u16 guest_fs_selector;
372         u16 guest_gs_selector;
373         u16 guest_ldtr_selector;
374         u16 guest_tr_selector;
375         u16 guest_intr_status;
376         u16 guest_pml_index;
377         u16 host_es_selector;
378         u16 host_cs_selector;
379         u16 host_ss_selector;
380         u16 host_ds_selector;
381         u16 host_fs_selector;
382         u16 host_gs_selector;
383         u16 host_tr_selector;
384 };
385
386 /*
387  * VMCS12_REVISION is an arbitrary id that should be changed if the content or
388  * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
389  * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
390  */
391 #define VMCS12_REVISION 0x11e57ed0
392
393 /*
394  * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
395  * and any VMCS region. Although only sizeof(struct vmcs12) is used by the
396  * current implementation, 4K is reserved to avoid future complications.
397  */
398 #define VMCS12_SIZE 0x1000
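/*
 * A minimal build-time consistency check one could add here (a sketch only,
 * not wired into the init path; the function name is illustrative): the
 * emulated VMCS must keep fitting in the 4K region implied by VMCS12_SIZE.
 */
static inline void vmcs12_size_check(void)
{
        BUILD_BUG_ON(sizeof(struct vmcs12) > VMCS12_SIZE);
}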
399
400 /* Used to remember the last vmcs02 used for some recently used vmcs12s */
401 struct vmcs02_list {
402         struct list_head list;
403         gpa_t vmptr;
404         struct loaded_vmcs vmcs02;
405 };
406
407 /*
408  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
409  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
410  */
411 struct nested_vmx {
412         /* Has the level-1 (L1) guest executed VMXON? */
413         bool vmxon;
414         gpa_t vmxon_ptr;
415         bool pml_full;
416
417         /* The guest-physical address of the current VMCS L1 keeps for L2 */
418         gpa_t current_vmptr;
419         /*
420          * Cache of the guest's VMCS, existing outside of guest memory.
421          * Loaded from guest memory during VMPTRLD. Flushed to guest
422          * memory during VMCLEAR and VMPTRLD.
423          */
424         struct vmcs12 *cached_vmcs12;
425         /*
426          * Indicates if the shadow vmcs must be updated with the
427          * data held by vmcs12
428          */
429         bool sync_shadow_vmcs;
430
431         /* vmcs02_list cache of VMCSs recently used to run L2 guests */
432         struct list_head vmcs02_pool;
433         int vmcs02_num;
434         bool change_vmcs01_virtual_x2apic_mode;
435         /* L2 must run next, and mustn't decide to exit to L1. */
436         bool nested_run_pending;
437         /*
438          * Guest pages referred to in vmcs02 with host-physical pointers, so
439          * we must keep them pinned while L2 runs.
440          */
441         struct page *apic_access_page;
442         struct page *virtual_apic_page;
443         struct page *pi_desc_page;
444         struct pi_desc *pi_desc;
445         bool pi_pending;
446         u16 posted_intr_nv;
447
448         unsigned long *msr_bitmap;
449
450         struct hrtimer preemption_timer;
451         bool preemption_timer_expired;
452
453         /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
454         u64 vmcs01_debugctl;
455
456         u16 vpid02;
457         u16 last_vpid;
458
459         /*
460          * We only store the "true" versions of the VMX capability MSRs. We
461          * generate the "non-true" versions by setting the must-be-1 bits
462          * according to the SDM.
463          */
464         u32 nested_vmx_procbased_ctls_low;
465         u32 nested_vmx_procbased_ctls_high;
466         u32 nested_vmx_secondary_ctls_low;
467         u32 nested_vmx_secondary_ctls_high;
468         u32 nested_vmx_pinbased_ctls_low;
469         u32 nested_vmx_pinbased_ctls_high;
470         u32 nested_vmx_exit_ctls_low;
471         u32 nested_vmx_exit_ctls_high;
472         u32 nested_vmx_entry_ctls_low;
473         u32 nested_vmx_entry_ctls_high;
474         u32 nested_vmx_misc_low;
475         u32 nested_vmx_misc_high;
476         u32 nested_vmx_ept_caps;
477         u32 nested_vmx_vpid_caps;
478         u64 nested_vmx_basic;
479         u64 nested_vmx_cr0_fixed0;
480         u64 nested_vmx_cr0_fixed1;
481         u64 nested_vmx_cr4_fixed0;
482         u64 nested_vmx_cr4_fixed1;
483         u64 nested_vmx_vmcs_enum;
484 };
485
486 #define POSTED_INTR_ON  0
487 #define POSTED_INTR_SN  1
488
489 /* Posted-Interrupt Descriptor */
490 struct pi_desc {
491         u32 pir[8];     /* Posted interrupt requested */
492         union {
493                 struct {
494                                 /* bit 256 - Outstanding Notification */
495                         u16     on      : 1,
496                                 /* bit 257 - Suppress Notification */
497                                 sn      : 1,
498                                 /* bits 271:258 - Reserved */
499                                 rsvd_1  : 14;
500                                 /* bits 279:272 - Notification Vector */
501                         u8      nv;
502                                 /* bits 287:280 - Reserved */
503                         u8      rsvd_2;
504                                 /* bits 319:288 - Notification Destination */
505                         u32     ndst;
506                 };
507                 u64 control;
508         };
509         u32 rsvd[6];
510 } __aligned(64);
511
512 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
513 {
514         return test_and_set_bit(POSTED_INTR_ON,
515                         (unsigned long *)&pi_desc->control);
516 }
517
518 static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
519 {
520         return test_and_clear_bit(POSTED_INTR_ON,
521                         (unsigned long *)&pi_desc->control);
522 }
523
524 static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
525 {
526         return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
527 }
528
529 static inline void pi_clear_sn(struct pi_desc *pi_desc)
530 {
531         return clear_bit(POSTED_INTR_SN,
532                         (unsigned long *)&pi_desc->control);
533 }
534
535 static inline void pi_set_sn(struct pi_desc *pi_desc)
536 {
537         return set_bit(POSTED_INTR_SN,
538                         (unsigned long *)&pi_desc->control);
539 }
540
541 static inline void pi_clear_on(struct pi_desc *pi_desc)
542 {
543         clear_bit(POSTED_INTR_ON,
544                   (unsigned long *)&pi_desc->control);
545 }
546
547 static inline int pi_test_on(struct pi_desc *pi_desc)
548 {
549         return test_bit(POSTED_INTR_ON,
550                         (unsigned long *)&pi_desc->control);
551 }
552
553 static inline int pi_test_sn(struct pi_desc *pi_desc)
554 {
555         return test_bit(POSTED_INTR_SN,
556                         (unsigned long *)&pi_desc->control);
557 }
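/*
 * Sketch of how the helpers above combine to post an interrupt (the real
 * delivery path appears later in this file; the function name here is
 * illustrative only): record the vector in the PIR, then raise ON.  If ON
 * was already set, a notification event is already outstanding and the
 * caller does not need to send another one.
 */
static inline bool pi_post_interrupt_sketch(struct pi_desc *pi_desc, int vector)
{
        pi_test_and_set_pir(vector, pi_desc);
        return !pi_test_and_set_on(pi_desc);    /* true: notification still needed */
}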
558
559 struct vcpu_vmx {
560         struct kvm_vcpu       vcpu;
561         unsigned long         host_rsp;
562         u8                    fail;
563         u32                   exit_intr_info;
564         u32                   idt_vectoring_info;
565         ulong                 rflags;
566         struct shared_msr_entry *guest_msrs;
567         int                   nmsrs;
568         int                   save_nmsrs;
569         unsigned long         host_idt_base;
570 #ifdef CONFIG_X86_64
571         u64                   msr_host_kernel_gs_base;
572         u64                   msr_guest_kernel_gs_base;
573 #endif
574         u32 vm_entry_controls_shadow;
575         u32 vm_exit_controls_shadow;
576         /*
577          * loaded_vmcs points to the VMCS currently used in this vcpu. For a
578          * non-nested (L1) guest, it always points to vmcs01. For a nested
579          * guest (L2), it points to a different VMCS.
580          */
581         struct loaded_vmcs    vmcs01;
582         struct loaded_vmcs   *loaded_vmcs;
583         bool                  __launched; /* temporary, used in vmx_vcpu_run */
584         struct msr_autoload {
585                 unsigned nr;
586                 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
587                 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
588         } msr_autoload;
589         struct {
590                 int           loaded;
591                 u16           fs_sel, gs_sel, ldt_sel;
592 #ifdef CONFIG_X86_64
593                 u16           ds_sel, es_sel;
594 #endif
595                 int           gs_ldt_reload_needed;
596                 int           fs_reload_needed;
597                 u64           msr_host_bndcfgs;
598                 unsigned long vmcs_host_cr3;    /* May not match real cr3 */
599                 unsigned long vmcs_host_cr4;    /* May not match real cr4 */
600         } host_state;
601         struct {
602                 int vm86_active;
603                 ulong save_rflags;
604                 struct kvm_segment segs[8];
605         } rmode;
606         struct {
607                 u32 bitmask; /* 4 bits per segment (1 bit per field) */
608                 struct kvm_save_segment {
609                         u16 selector;
610                         unsigned long base;
611                         u32 limit;
612                         u32 ar;
613                 } seg[8];
614         } segment_cache;
615         int vpid;
616         bool emulation_required;
617
618         u32 exit_reason;
619
620         /* Posted interrupt descriptor */
621         struct pi_desc pi_desc;
622
623         /* Support for a guest hypervisor (nested VMX) */
624         struct nested_vmx nested;
625
626         /* Dynamic PLE window. */
627         int ple_window;
628         bool ple_window_dirty;
629
630         /* Support for PML */
631 #define PML_ENTITY_NUM          512
632         struct page *pml_pg;
633
634         /* apic deadline value in host tsc */
635         u64 hv_deadline_tsc;
636
637         u64 current_tsc_ratio;
638
639         bool guest_pkru_valid;
640         u32 guest_pkru;
641         u32 host_pkru;
642
643         /*
644          * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
645          * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
646          * in msr_ia32_feature_control_valid_bits.
647          */
648         u64 msr_ia32_feature_control;
649         u64 msr_ia32_feature_control_valid_bits;
650 };
651
652 enum segment_cache_field {
653         SEG_FIELD_SEL = 0,
654         SEG_FIELD_BASE = 1,
655         SEG_FIELD_LIMIT = 2,
656         SEG_FIELD_AR = 3,
657
658         SEG_FIELD_NR = 4
659 };
660
661 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
662 {
663         return container_of(vcpu, struct vcpu_vmx, vcpu);
664 }
665
666 static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
667 {
668         return &(to_vmx(vcpu)->pi_desc);
669 }
670
671 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
672 #define FIELD(number, name)     [number] = VMCS12_OFFSET(name)
673 #define FIELD64(number, name)   [number] = VMCS12_OFFSET(name), \
674                                 [number##_HIGH] = VMCS12_OFFSET(name)+4
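/*
 * For illustration: FIELD64(TSC_OFFSET, tsc_offset) produces two table
 * entries, [TSC_OFFSET] = offsetof(struct vmcs12, tsc_offset) and
 * [TSC_OFFSET_HIGH] = the same offset + 4, so a 32-bit access through the
 * "high" field encoding lands on the upper half of the u64 member.
 */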
675
676
677 static unsigned long shadow_read_only_fields[] = {
678         /*
679          * We do NOT shadow fields that are modified when L0
680          * traps and emulates any vmx instruction (e.g. VMPTRLD,
681          * VMXON...) executed by L1.
682          * For example, VM_INSTRUCTION_ERROR is read
683          * by L1 if a vmx instruction fails (part of the error path).
684          * Note the code assumes this logic. If for some reason
685          * we start shadowing these fields then we need to
686          * force a shadow sync when L0 emulates vmx instructions
687          * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
688          * by nested_vmx_failValid)
689          */
690         VM_EXIT_REASON,
691         VM_EXIT_INTR_INFO,
692         VM_EXIT_INSTRUCTION_LEN,
693         IDT_VECTORING_INFO_FIELD,
694         IDT_VECTORING_ERROR_CODE,
695         VM_EXIT_INTR_ERROR_CODE,
696         EXIT_QUALIFICATION,
697         GUEST_LINEAR_ADDRESS,
698         GUEST_PHYSICAL_ADDRESS
699 };
700 static int max_shadow_read_only_fields =
701         ARRAY_SIZE(shadow_read_only_fields);
702
703 static unsigned long shadow_read_write_fields[] = {
704         TPR_THRESHOLD,
705         GUEST_RIP,
706         GUEST_RSP,
707         GUEST_CR0,
708         GUEST_CR3,
709         GUEST_CR4,
710         GUEST_INTERRUPTIBILITY_INFO,
711         GUEST_RFLAGS,
712         GUEST_CS_SELECTOR,
713         GUEST_CS_AR_BYTES,
714         GUEST_CS_LIMIT,
715         GUEST_CS_BASE,
716         GUEST_ES_BASE,
717         GUEST_BNDCFGS,
718         CR0_GUEST_HOST_MASK,
719         CR0_READ_SHADOW,
720         CR4_READ_SHADOW,
721         TSC_OFFSET,
722         EXCEPTION_BITMAP,
723         CPU_BASED_VM_EXEC_CONTROL,
724         VM_ENTRY_EXCEPTION_ERROR_CODE,
725         VM_ENTRY_INTR_INFO_FIELD,
726         VM_ENTRY_INSTRUCTION_LEN,
728         HOST_FS_BASE,
729         HOST_GS_BASE,
730         HOST_FS_SELECTOR,
731         HOST_GS_SELECTOR
732 };
733 static int max_shadow_read_write_fields =
734         ARRAY_SIZE(shadow_read_write_fields);
735
736 static const unsigned short vmcs_field_to_offset_table[] = {
737         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
738         FIELD(POSTED_INTR_NV, posted_intr_nv),
739         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
740         FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
741         FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
742         FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
743         FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
744         FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
745         FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
746         FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
747         FIELD(GUEST_INTR_STATUS, guest_intr_status),
748         FIELD(GUEST_PML_INDEX, guest_pml_index),
749         FIELD(HOST_ES_SELECTOR, host_es_selector),
750         FIELD(HOST_CS_SELECTOR, host_cs_selector),
751         FIELD(HOST_SS_SELECTOR, host_ss_selector),
752         FIELD(HOST_DS_SELECTOR, host_ds_selector),
753         FIELD(HOST_FS_SELECTOR, host_fs_selector),
754         FIELD(HOST_GS_SELECTOR, host_gs_selector),
755         FIELD(HOST_TR_SELECTOR, host_tr_selector),
756         FIELD64(IO_BITMAP_A, io_bitmap_a),
757         FIELD64(IO_BITMAP_B, io_bitmap_b),
758         FIELD64(MSR_BITMAP, msr_bitmap),
759         FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
760         FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
761         FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
762         FIELD64(TSC_OFFSET, tsc_offset),
763         FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
764         FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
765         FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
766         FIELD64(EPT_POINTER, ept_pointer),
767         FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
768         FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
769         FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
770         FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
771         FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
772         FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
773         FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
774         FIELD64(PML_ADDRESS, pml_address),
775         FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
776         FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
777         FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
778         FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
779         FIELD64(GUEST_PDPTR0, guest_pdptr0),
780         FIELD64(GUEST_PDPTR1, guest_pdptr1),
781         FIELD64(GUEST_PDPTR2, guest_pdptr2),
782         FIELD64(GUEST_PDPTR3, guest_pdptr3),
783         FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
784         FIELD64(HOST_IA32_PAT, host_ia32_pat),
785         FIELD64(HOST_IA32_EFER, host_ia32_efer),
786         FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
787         FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
788         FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
789         FIELD(EXCEPTION_BITMAP, exception_bitmap),
790         FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
791         FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
792         FIELD(CR3_TARGET_COUNT, cr3_target_count),
793         FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
794         FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
795         FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
796         FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
797         FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
798         FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
799         FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
800         FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
801         FIELD(TPR_THRESHOLD, tpr_threshold),
802         FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
803         FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
804         FIELD(VM_EXIT_REASON, vm_exit_reason),
805         FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
806         FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
807         FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
808         FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
809         FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
810         FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
811         FIELD(GUEST_ES_LIMIT, guest_es_limit),
812         FIELD(GUEST_CS_LIMIT, guest_cs_limit),
813         FIELD(GUEST_SS_LIMIT, guest_ss_limit),
814         FIELD(GUEST_DS_LIMIT, guest_ds_limit),
815         FIELD(GUEST_FS_LIMIT, guest_fs_limit),
816         FIELD(GUEST_GS_LIMIT, guest_gs_limit),
817         FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
818         FIELD(GUEST_TR_LIMIT, guest_tr_limit),
819         FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
820         FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
821         FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
822         FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
823         FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
824         FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
825         FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
826         FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
827         FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
828         FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
829         FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
830         FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
831         FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
832         FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
833         FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
834         FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
835         FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
836         FIELD(CR0_READ_SHADOW, cr0_read_shadow),
837         FIELD(CR4_READ_SHADOW, cr4_read_shadow),
838         FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
839         FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
840         FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
841         FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
842         FIELD(EXIT_QUALIFICATION, exit_qualification),
843         FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
844         FIELD(GUEST_CR0, guest_cr0),
845         FIELD(GUEST_CR3, guest_cr3),
846         FIELD(GUEST_CR4, guest_cr4),
847         FIELD(GUEST_ES_BASE, guest_es_base),
848         FIELD(GUEST_CS_BASE, guest_cs_base),
849         FIELD(GUEST_SS_BASE, guest_ss_base),
850         FIELD(GUEST_DS_BASE, guest_ds_base),
851         FIELD(GUEST_FS_BASE, guest_fs_base),
852         FIELD(GUEST_GS_BASE, guest_gs_base),
853         FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
854         FIELD(GUEST_TR_BASE, guest_tr_base),
855         FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
856         FIELD(GUEST_IDTR_BASE, guest_idtr_base),
857         FIELD(GUEST_DR7, guest_dr7),
858         FIELD(GUEST_RSP, guest_rsp),
859         FIELD(GUEST_RIP, guest_rip),
860         FIELD(GUEST_RFLAGS, guest_rflags),
861         FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
862         FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
863         FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
864         FIELD(HOST_CR0, host_cr0),
865         FIELD(HOST_CR3, host_cr3),
866         FIELD(HOST_CR4, host_cr4),
867         FIELD(HOST_FS_BASE, host_fs_base),
868         FIELD(HOST_GS_BASE, host_gs_base),
869         FIELD(HOST_TR_BASE, host_tr_base),
870         FIELD(HOST_GDTR_BASE, host_gdtr_base),
871         FIELD(HOST_IDTR_BASE, host_idtr_base),
872         FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
873         FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
874         FIELD(HOST_RSP, host_rsp),
875         FIELD(HOST_RIP, host_rip),
876 };
877
878 static inline short vmcs_field_to_offset(unsigned long field)
879 {
880         BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
881
882         if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
883             vmcs_field_to_offset_table[field] == 0)
884                 return -ENOENT;
885
886         return vmcs_field_to_offset_table[field];
887 }
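/*
 * Example (illustrative): vmcs_field_to_offset(GUEST_RIP) returns
 * offsetof(struct vmcs12, guest_rip), whereas an encoding without a vmcs12
 * backing field returns -ENOENT so the caller can fail the emulated
 * VMREAD/VMWRITE instead of touching a bogus offset.
 */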
888
889 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
890 {
891         return to_vmx(vcpu)->nested.cached_vmcs12;
892 }
893
894 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
895 {
896         struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT);
897         if (is_error_page(page))
898                 return NULL;
899
900         return page;
901 }
902
903 static void nested_release_page(struct page *page)
904 {
905         kvm_release_page_dirty(page);
906 }
907
908 static void nested_release_page_clean(struct page *page)
909 {
910         kvm_release_page_clean(page);
911 }
912
913 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
914 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
915 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
916 static bool vmx_xsaves_supported(void);
917 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
918 static void vmx_set_segment(struct kvm_vcpu *vcpu,
919                             struct kvm_segment *var, int seg);
920 static void vmx_get_segment(struct kvm_vcpu *vcpu,
921                             struct kvm_segment *var, int seg);
922 static bool guest_state_valid(struct kvm_vcpu *vcpu);
923 static u32 vmx_segment_access_rights(struct kvm_segment *var);
924 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
925 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
926 static int alloc_identity_pagetable(struct kvm *kvm);
927 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
928 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
929 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
930                                             u16 error_code);
931
932 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
933 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
934 /*
935  * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
936  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
937  */
938 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
939
940 /*
941  * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
942  * can find which vCPU should be woken up.
943  */
944 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
945 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
946
947 enum {
948         VMX_IO_BITMAP_A,
949         VMX_IO_BITMAP_B,
950         VMX_MSR_BITMAP_LEGACY,
951         VMX_MSR_BITMAP_LONGMODE,
952         VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
953         VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
954         VMX_MSR_BITMAP_LEGACY_X2APIC,
955         VMX_MSR_BITMAP_LONGMODE_X2APIC,
956         VMX_VMREAD_BITMAP,
957         VMX_VMWRITE_BITMAP,
958         VMX_BITMAP_NR
959 };
960
961 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
962
963 #define vmx_io_bitmap_a                      (vmx_bitmap[VMX_IO_BITMAP_A])
964 #define vmx_io_bitmap_b                      (vmx_bitmap[VMX_IO_BITMAP_B])
965 #define vmx_msr_bitmap_legacy                (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
966 #define vmx_msr_bitmap_longmode              (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
967 #define vmx_msr_bitmap_legacy_x2apic_apicv   (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
968 #define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
969 #define vmx_msr_bitmap_legacy_x2apic         (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
970 #define vmx_msr_bitmap_longmode_x2apic       (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
971 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
972 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
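/*
 * Design note (descriptive): keeping the bitmaps in one array indexed by the
 * enum lets allocation and cleanup walk all VMX_BITMAP_NR entries in a single
 * loop, while the #defines above keep the individual call sites readable.
 */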
973
974 static bool cpu_has_load_ia32_efer;
975 static bool cpu_has_load_perf_global_ctrl;
976
977 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
978 static DEFINE_SPINLOCK(vmx_vpid_lock);
979
980 static struct vmcs_config {
981         int size;
982         int order;
983         u32 basic_cap;
984         u32 revision_id;
985         u32 pin_based_exec_ctrl;
986         u32 cpu_based_exec_ctrl;
987         u32 cpu_based_2nd_exec_ctrl;
988         u32 vmexit_ctrl;
989         u32 vmentry_ctrl;
990 } vmcs_config;
991
992 static struct vmx_capability {
993         u32 ept;
994         u32 vpid;
995 } vmx_capability;
996
997 #define VMX_SEGMENT_FIELD(seg)                                  \
998         [VCPU_SREG_##seg] = {                                   \
999                 .selector = GUEST_##seg##_SELECTOR,             \
1000                 .base = GUEST_##seg##_BASE,                     \
1001                 .limit = GUEST_##seg##_LIMIT,                   \
1002                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
1003         }
1004
1005 static const struct kvm_vmx_segment_field {
1006         unsigned selector;
1007         unsigned base;
1008         unsigned limit;
1009         unsigned ar_bytes;
1010 } kvm_vmx_segment_fields[] = {
1011         VMX_SEGMENT_FIELD(CS),
1012         VMX_SEGMENT_FIELD(DS),
1013         VMX_SEGMENT_FIELD(ES),
1014         VMX_SEGMENT_FIELD(FS),
1015         VMX_SEGMENT_FIELD(GS),
1016         VMX_SEGMENT_FIELD(SS),
1017         VMX_SEGMENT_FIELD(TR),
1018         VMX_SEGMENT_FIELD(LDTR),
1019 };
1020
1021 static u64 host_efer;
1022
1023 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
1024
1025 /*
1026  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
1027  * away by decrementing the array size.
1028  */
1029 static const u32 vmx_msr_index[] = {
1030 #ifdef CONFIG_X86_64
1031         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
1032 #endif
1033         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
1034 };
1035
1036 static inline bool is_exception_n(u32 intr_info, u8 vector)
1037 {
1038         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1039                              INTR_INFO_VALID_MASK)) ==
1040                 (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
1041 }
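/*
 * For example (illustrative), is_exception_n(intr_info, PF_VECTOR) is true
 * only when the interruption info is valid, its type is "hardware exception"
 * and its vector is 14 (#PF); the wrappers below are exactly such
 * instantiations for the vectors this file cares about.
 */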
1042
1043 static inline bool is_debug(u32 intr_info)
1044 {
1045         return is_exception_n(intr_info, DB_VECTOR);
1046 }
1047
1048 static inline bool is_breakpoint(u32 intr_info)
1049 {
1050         return is_exception_n(intr_info, BP_VECTOR);
1051 }
1052
1053 static inline bool is_page_fault(u32 intr_info)
1054 {
1055         return is_exception_n(intr_info, PF_VECTOR);
1056 }
1057
1058 static inline bool is_no_device(u32 intr_info)
1059 {
1060         return is_exception_n(intr_info, NM_VECTOR);
1061 }
1062
1063 static inline bool is_invalid_opcode(u32 intr_info)
1064 {
1065         return is_exception_n(intr_info, UD_VECTOR);
1066 }
1067
1068 static inline bool is_external_interrupt(u32 intr_info)
1069 {
1070         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1071                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1072 }
1073
1074 static inline bool is_machine_check(u32 intr_info)
1075 {
1076         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1077                              INTR_INFO_VALID_MASK)) ==
1078                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
1079 }
1080
1081 static inline bool cpu_has_vmx_msr_bitmap(void)
1082 {
1083         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
1084 }
1085
1086 static inline bool cpu_has_vmx_tpr_shadow(void)
1087 {
1088         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
1089 }
1090
1091 static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
1092 {
1093         return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
1094 }
1095
1096 static inline bool cpu_has_secondary_exec_ctrls(void)
1097 {
1098         return vmcs_config.cpu_based_exec_ctrl &
1099                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1100 }
1101
1102 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
1103 {
1104         return vmcs_config.cpu_based_2nd_exec_ctrl &
1105                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1106 }
1107
1108 static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
1109 {
1110         return vmcs_config.cpu_based_2nd_exec_ctrl &
1111                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1112 }
1113
1114 static inline bool cpu_has_vmx_apic_register_virt(void)
1115 {
1116         return vmcs_config.cpu_based_2nd_exec_ctrl &
1117                 SECONDARY_EXEC_APIC_REGISTER_VIRT;
1118 }
1119
1120 static inline bool cpu_has_vmx_virtual_intr_delivery(void)
1121 {
1122         return vmcs_config.cpu_based_2nd_exec_ctrl &
1123                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1124 }
1125
1126 /*
1127  * Comment's format: document - errata name - stepping - processor name.
1128  * Taken from
1129  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1130  */
1131 static u32 vmx_preemption_cpu_tfms[] = {
1132 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
1133 0x000206E6,
1134 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
1135 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1136 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
1137 0x00020652,
1138 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
1139 0x00020655,
1140 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
1141 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
1142 /*
1143  * 320767.pdf - AAP86  - B1 -
1144  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1145  */
1146 0x000106E5,
1147 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
1148 0x000106A0,
1149 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
1150 0x000106A1,
1151 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
1152 0x000106A4,
1153  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
1154  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
1155  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
1156 0x000106A5,
1157 };
1158
1159 static inline bool cpu_has_broken_vmx_preemption_timer(void)
1160 {
1161         u32 eax = cpuid_eax(0x00000001), i;
1162
1163         /* Clear the reserved bits (15:14 and 31:28) of the CPUID signature */
1164         eax &= ~(0x3U << 14 | 0xfU << 28);
1165         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
1166                 if (eax == vmx_preemption_cpu_tfms[i])
1167                         return true;
1168
1169         return false;
1170 }
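/*
 * Worked decode of one table entry (illustrative): 0x000206E6 means
 * stepping 6, model 0xE, family 6, extended model 2, i.e. display model
 * 0x2E - the Xeon 7500 series (D0) entry listed first in the table above.
 */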
1171
1172 static inline bool cpu_has_vmx_preemption_timer(void)
1173 {
1174         return vmcs_config.pin_based_exec_ctrl &
1175                 PIN_BASED_VMX_PREEMPTION_TIMER;
1176 }
1177
1178 static inline bool cpu_has_vmx_posted_intr(void)
1179 {
1180         return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1181                 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
1182 }
1183
1184 static inline bool cpu_has_vmx_apicv(void)
1185 {
1186         return cpu_has_vmx_apic_register_virt() &&
1187                 cpu_has_vmx_virtual_intr_delivery() &&
1188                 cpu_has_vmx_posted_intr();
1189 }
1190
1191 static inline bool cpu_has_vmx_flexpriority(void)
1192 {
1193         return cpu_has_vmx_tpr_shadow() &&
1194                 cpu_has_vmx_virtualize_apic_accesses();
1195 }
1196
1197 static inline bool cpu_has_vmx_ept_execute_only(void)
1198 {
1199         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
1200 }
1201
1202 static inline bool cpu_has_vmx_ept_2m_page(void)
1203 {
1204         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
1205 }
1206
1207 static inline bool cpu_has_vmx_ept_1g_page(void)
1208 {
1209         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
1210 }
1211
1212 static inline bool cpu_has_vmx_ept_4levels(void)
1213 {
1214         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1215 }
1216
1217 static inline bool cpu_has_vmx_ept_ad_bits(void)
1218 {
1219         return vmx_capability.ept & VMX_EPT_AD_BIT;
1220 }
1221
1222 static inline bool cpu_has_vmx_invept_context(void)
1223 {
1224         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
1225 }
1226
1227 static inline bool cpu_has_vmx_invept_global(void)
1228 {
1229         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
1230 }
1231
1232 static inline bool cpu_has_vmx_invvpid_single(void)
1233 {
1234         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1235 }
1236
1237 static inline bool cpu_has_vmx_invvpid_global(void)
1238 {
1239         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1240 }
1241
1242 static inline bool cpu_has_vmx_invvpid(void)
1243 {
1244         return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
1245 }
1246
1247 static inline bool cpu_has_vmx_ept(void)
1248 {
1249         return vmcs_config.cpu_based_2nd_exec_ctrl &
1250                 SECONDARY_EXEC_ENABLE_EPT;
1251 }
1252
1253 static inline bool cpu_has_vmx_unrestricted_guest(void)
1254 {
1255         return vmcs_config.cpu_based_2nd_exec_ctrl &
1256                 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1257 }
1258
1259 static inline bool cpu_has_vmx_ple(void)
1260 {
1261         return vmcs_config.cpu_based_2nd_exec_ctrl &
1262                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1263 }
1264
1265 static inline bool cpu_has_vmx_basic_inout(void)
1266 {
1267         return  (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
1268 }
1269
1270 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1271 {
1272         return flexpriority_enabled && lapic_in_kernel(vcpu);
1273 }
1274
1275 static inline bool cpu_has_vmx_vpid(void)
1276 {
1277         return vmcs_config.cpu_based_2nd_exec_ctrl &
1278                 SECONDARY_EXEC_ENABLE_VPID;
1279 }
1280
1281 static inline bool cpu_has_vmx_rdtscp(void)
1282 {
1283         return vmcs_config.cpu_based_2nd_exec_ctrl &
1284                 SECONDARY_EXEC_RDTSCP;
1285 }
1286
1287 static inline bool cpu_has_vmx_invpcid(void)
1288 {
1289         return vmcs_config.cpu_based_2nd_exec_ctrl &
1290                 SECONDARY_EXEC_ENABLE_INVPCID;
1291 }
1292
1293 static inline bool cpu_has_vmx_wbinvd_exit(void)
1294 {
1295         return vmcs_config.cpu_based_2nd_exec_ctrl &
1296                 SECONDARY_EXEC_WBINVD_EXITING;
1297 }
1298
1299 static inline bool cpu_has_vmx_shadow_vmcs(void)
1300 {
1301         u64 vmx_msr;
1302         rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1303         /* check if the cpu supports writing r/o exit information fields */
1304         if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1305                 return false;
1306
1307         return vmcs_config.cpu_based_2nd_exec_ctrl &
1308                 SECONDARY_EXEC_SHADOW_VMCS;
1309 }
1310
1311 static inline bool cpu_has_vmx_pml(void)
1312 {
1313         return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1314 }
1315
1316 static inline bool cpu_has_vmx_tsc_scaling(void)
1317 {
1318         return vmcs_config.cpu_based_2nd_exec_ctrl &
1319                 SECONDARY_EXEC_TSC_SCALING;
1320 }
1321
1322 static inline bool report_flexpriority(void)
1323 {
1324         return flexpriority_enabled;
1325 }
1326
1327 static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
1328 {
1329         return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low);
1330 }
1331
1332 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1333 {
1334         return vmcs12->cpu_based_vm_exec_control & bit;
1335 }
1336
1337 static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1338 {
1339         return (vmcs12->cpu_based_vm_exec_control &
1340                         CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1341                 (vmcs12->secondary_vm_exec_control & bit);
1342 }
1343
1344 static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1345 {
1346         return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1347 }
1348
1349 static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1350 {
1351         return vmcs12->pin_based_vm_exec_control &
1352                 PIN_BASED_VMX_PREEMPTION_TIMER;
1353 }
1354
1355 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1356 {
1357         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1358 }
1359
1360 static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1361 {
1362         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) &&
1363                 vmx_xsaves_supported();
1364 }
1365
1366 static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
1367 {
1368         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
1369 }
1370
1371 static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1372 {
1373         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1374 }
1375
1376 static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
1377 {
1378         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
1379 }
1380
1381 static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1382 {
1383         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
1384 }
1385
1386 static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
1387 {
1388         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1389 }
1390
1391 static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1392 {
1393         return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1394 }
1395
1396 static inline bool is_nmi(u32 intr_info)
1397 {
1398         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1399                 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
1400 }
1401
1402 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1403                               u32 exit_intr_info,
1404                               unsigned long exit_qualification);
1405 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1406                         struct vmcs12 *vmcs12,
1407                         u32 reason, unsigned long qualification);
1408
1409 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1410 {
1411         int i;
1412
1413         for (i = 0; i < vmx->nmsrs; ++i)
1414                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1415                         return i;
1416         return -1;
1417 }
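/*
 * Note (descriptive): guest_msrs[i].index holds an index into vmx_msr_index[]
 * rather than the raw MSR number, hence the extra level of indirection in the
 * comparison above.
 */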
1418
1419 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
1420 {
1421         struct {
1422                 u64 vpid : 16;
1423                 u64 rsvd : 48;
1424                 u64 gva;
1425         } operand = { vpid, 0, gva };
1426
1427         asm volatile (__ex(ASM_VMX_INVVPID)
1428                         /* CF==1 or ZF==1 --> rc = -1 */
1429                         "; ja 1f ; ud2 ; 1:"
1430                         : : "a"(&operand), "c"(ext) : "cc", "memory");
1431 }
1432
1433 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
1434 {
1435         struct {
1436                 u64 eptp, gpa;
1437         } operand = {eptp, gpa};
1438
1439         asm volatile (__ex(ASM_VMX_INVEPT)
1440                         /* CF==1 or ZF==1 --> rc = -1 */
1441                         "; ja 1f ; ud2 ; 1:\n"
1442                         : : "a" (&operand), "c" (ext) : "cc", "memory");
1443 }
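/*
 * Both helpers above follow the same pattern: the invalidation descriptor is
 * built on the stack and the instruction gets a pointer to it in the "a"
 * register with the invalidation type in "c"; the "ja 1f; ud2" tail turns a
 * failed invalidation (CF or ZF set) into an immediate trap rather than
 * silently running on stale TLB entries.
 */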
1444
1445 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1446 {
1447         int i;
1448
1449         i = __find_msr_index(vmx, msr);
1450         if (i >= 0)
1451                 return &vmx->guest_msrs[i];
1452         return NULL;
1453 }
1454
1455 static void vmcs_clear(struct vmcs *vmcs)
1456 {
1457         u64 phys_addr = __pa(vmcs);
1458         u8 error;
1459
1460         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
1461                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1462                       : "cc", "memory");
1463         if (error)
1464                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
1465                        vmcs, phys_addr);
1466 }
1467
1468 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1469 {
1470         vmcs_clear(loaded_vmcs->vmcs);
1471         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
1472                 vmcs_clear(loaded_vmcs->shadow_vmcs);
1473         loaded_vmcs->cpu = -1;
1474         loaded_vmcs->launched = 0;
1475 }
1476
1477 static void vmcs_load(struct vmcs *vmcs)
1478 {
1479         u64 phys_addr = __pa(vmcs);
1480         u8 error;
1481
1482         asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
1483                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1484                         : "cc", "memory");
1485         if (error)
1486                 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
1487                        vmcs, phys_addr);
1488 }
1489
1490 #ifdef CONFIG_KEXEC_CORE
1491 /*
1492  * This bitmap is used to indicate whether the vmclear
1493  * operation is enabled on each cpu. All are disabled
1494  * by default.
1495  */
1496 static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1497
1498 static inline void crash_enable_local_vmclear(int cpu)
1499 {
1500         cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1501 }
1502
1503 static inline void crash_disable_local_vmclear(int cpu)
1504 {
1505         cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1506 }
1507
1508 static inline int crash_local_vmclear_enabled(int cpu)
1509 {
1510         return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1511 }
1512
1513 static void crash_vmclear_local_loaded_vmcss(void)
1514 {
1515         int cpu = raw_smp_processor_id();
1516         struct loaded_vmcs *v;
1517
1518         if (!crash_local_vmclear_enabled(cpu))
1519                 return;
1520
1521         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1522                             loaded_vmcss_on_cpu_link)
1523                 vmcs_clear(v->vmcs);
1524 }
1525 #else
1526 static inline void crash_enable_local_vmclear(int cpu) { }
1527 static inline void crash_disable_local_vmclear(int cpu) { }
1528 #endif /* CONFIG_KEXEC_CORE */
1529
1530 static void __loaded_vmcs_clear(void *arg)
1531 {
1532         struct loaded_vmcs *loaded_vmcs = arg;
1533         int cpu = raw_smp_processor_id();
1534
1535         if (loaded_vmcs->cpu != cpu)
1536                 return; /* vcpu migration can race with cpu offline */
1537         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1538                 per_cpu(current_vmcs, cpu) = NULL;
1539         crash_disable_local_vmclear(cpu);
1540         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1541
1542         /*
1543          * Make sure the update of loaded_vmcs->loaded_vmcss_on_cpu_link
1544          * happens before loaded_vmcs->cpu is set to -1 in loaded_vmcs_init.
1545          * Otherwise, another cpu could observe cpu == -1 first and add the
1546          * vmcs to the percpu list before it has been deleted here.
1547          */
1548         smp_wmb();
1549
1550         loaded_vmcs_init(loaded_vmcs);
1551         crash_enable_local_vmclear(cpu);
1552 }
1553
1554 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1555 {
1556         int cpu = loaded_vmcs->cpu;
1557
1558         if (cpu != -1)
1559                 smp_call_function_single(cpu,
1560                          __loaded_vmcs_clear, loaded_vmcs, 1);
1561 }
1562
1563 static inline void vpid_sync_vcpu_single(int vpid)
1564 {
1565         if (vpid == 0)
1566                 return;
1567
1568         if (cpu_has_vmx_invvpid_single())
1569                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
1570 }
1571
1572 static inline void vpid_sync_vcpu_global(void)
1573 {
1574         if (cpu_has_vmx_invvpid_global())
1575                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1576 }
1577
1578 static inline void vpid_sync_context(int vpid)
1579 {
1580         if (cpu_has_vmx_invvpid_single())
1581                 vpid_sync_vcpu_single(vpid);
1582         else
1583                 vpid_sync_vcpu_global();
1584 }
1585
1586 static inline void ept_sync_global(void)
1587 {
1588         if (cpu_has_vmx_invept_global())
1589                 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1590 }
1591
1592 static inline void ept_sync_context(u64 eptp)
1593 {
1594         if (enable_ept) {
1595                 if (cpu_has_vmx_invept_context())
1596                         __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1597                 else
1598                         ept_sync_global();
1599         }
1600 }
1601
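/*
 * VMCS field encodings store the field width in bits 14:13 and use bit 0
 * to select the high half of a 64-bit field.  The vmcs_check* helpers
 * below turn a mismatch between accessor size and field width into a
 * build-time error whenever the encoding is a compile-time constant.
 */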
1602 static __always_inline void vmcs_check16(unsigned long field)
1603 {
1604         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
1605                          "16-bit accessor invalid for 64-bit field");
1606         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1607                          "16-bit accessor invalid for 64-bit high field");
1608         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1609                          "16-bit accessor invalid for 32-bit field");
1610         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1611                          "16-bit accessor invalid for natural width field");
1612 }
1613
1614 static __always_inline void vmcs_check32(unsigned long field)
1615 {
1616         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1617                          "32-bit accessor invalid for 16-bit field");
1618         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1619                          "32-bit accessor invalid for natural width field");
1620 }
1621
1622 static __always_inline void vmcs_check64(unsigned long field)
1623 {
1624         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1625                          "64-bit accessor invalid for 16-bit field");
1626         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1627                          "64-bit accessor invalid for 64-bit high field");
1628         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1629                          "64-bit accessor invalid for 32-bit field");
1630         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1631                          "64-bit accessor invalid for natural width field");
1632 }
1633
1634 static __always_inline void vmcs_checkl(unsigned long field)
1635 {
1636         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1637                          "Natural width accessor invalid for 16-bit field");
1638         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
1639                          "Natural width accessor invalid for 64-bit field");
1640         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1641                          "Natural width accessor invalid for 64-bit high field");
1642         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1643                          "Natural width accessor invalid for 32-bit field");
1644 }
1645
1646 static __always_inline unsigned long __vmcs_readl(unsigned long field)
1647 {
1648         unsigned long value;
1649
1650         asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
1651                       : "=a"(value) : "d"(field) : "cc");
1652         return value;
1653 }
1654
1655 static __always_inline u16 vmcs_read16(unsigned long field)
1656 {
1657         vmcs_check16(field);
1658         return __vmcs_readl(field);
1659 }
1660
1661 static __always_inline u32 vmcs_read32(unsigned long field)
1662 {
1663         vmcs_check32(field);
1664         return __vmcs_readl(field);
1665 }
1666
1667 static __always_inline u64 vmcs_read64(unsigned long field)
1668 {
1669         vmcs_check64(field);
1670 #ifdef CONFIG_X86_64
1671         return __vmcs_readl(field);
1672 #else
1673         return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
1674 #endif
1675 }
1676
1677 static __always_inline unsigned long vmcs_readl(unsigned long field)
1678 {
1679         vmcs_checkl(field);
1680         return __vmcs_readl(field);
1681 }
1682
1683 static noinline void vmwrite_error(unsigned long field, unsigned long value)
1684 {
1685         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
1686                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1687         dump_stack();
1688 }
1689
1690 static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
1691 {
1692         u8 error;
1693
1694         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
1695                        : "=q"(error) : "a"(value), "d"(field) : "cc");
1696         if (unlikely(error))
1697                 vmwrite_error(field, value);
1698 }
1699
1700 static __always_inline void vmcs_write16(unsigned long field, u16 value)
1701 {
1702         vmcs_check16(field);
1703         __vmcs_writel(field, value);
1704 }
1705
1706 static __always_inline void vmcs_write32(unsigned long field, u32 value)
1707 {
1708         vmcs_check32(field);
1709         __vmcs_writel(field, value);
1710 }
1711
1712 static __always_inline void vmcs_write64(unsigned long field, u64 value)
1713 {
1714         vmcs_check64(field);
1715         __vmcs_writel(field, value);
1716 #ifndef CONFIG_X86_64
1717         asm volatile ("");
1718         __vmcs_writel(field+1, value >> 32);
1719 #endif
1720 }
1721
1722 static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
1723 {
1724         vmcs_checkl(field);
1725         __vmcs_writel(field, value);
1726 }
1727
1728 static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
1729 {
1730         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
1731                          "vmcs_clear_bits does not support 64-bit fields");
1732         __vmcs_writel(field, __vmcs_readl(field) & ~mask);
1733 }
1734
1735 static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
1736 {
1737         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
1738                          "vmcs_set_bits does not support 64-bit fields");
1739         __vmcs_writel(field, __vmcs_readl(field) | mask);
1740 }
1741
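/*
 * VM_ENTRY_CONTROLS and VM_EXIT_CONTROLS are shadowed in struct vcpu_vmx
 * so that writing back an unchanged value can skip the VMWRITE.  The
 * helpers below keep the shadow and the VMCS field in sync.
 */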
1742 static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
1743 {
1744         vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
1745 }
1746
1747 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
1748 {
1749         vmcs_write32(VM_ENTRY_CONTROLS, val);
1750         vmx->vm_entry_controls_shadow = val;
1751 }
1752
1753 static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
1754 {
1755         if (vmx->vm_entry_controls_shadow != val)
1756                 vm_entry_controls_init(vmx, val);
1757 }
1758
1759 static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
1760 {
1761         return vmx->vm_entry_controls_shadow;
1762 }
1763
1764
1765 static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1766 {
1767         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
1768 }
1769
1770 static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1771 {
1772         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
1773 }
1774
1775 static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
1776 {
1777         vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
1778 }
1779
1780 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
1781 {
1782         vmcs_write32(VM_EXIT_CONTROLS, val);
1783         vmx->vm_exit_controls_shadow = val;
1784 }
1785
1786 static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
1787 {
1788         if (vmx->vm_exit_controls_shadow != val)
1789                 vm_exit_controls_init(vmx, val);
1790 }
1791
1792 static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
1793 {
1794         return vmx->vm_exit_controls_shadow;
1795 }
1796
1797
1798 static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1799 {
1800         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
1801 }
1802
1803 static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1804 {
1805         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
1806 }
1807
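/*
 * Guest segment fields read from the VMCS are cached in vmx->segment_cache.
 * A bitmask records which (segment, field) pairs are valid; clearing the
 * bitmask invalidates the whole cache.
 */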
1808 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1809 {
1810         vmx->segment_cache.bitmask = 0;
1811 }
1812
1813 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1814                                        unsigned field)
1815 {
1816         bool ret;
1817         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1818
1819         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
1820                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
1821                 vmx->segment_cache.bitmask = 0;
1822         }
1823         ret = vmx->segment_cache.bitmask & mask;
1824         vmx->segment_cache.bitmask |= mask;
1825         return ret;
1826 }
1827
1828 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1829 {
1830         u16 *p = &vmx->segment_cache.seg[seg].selector;
1831
1832         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1833                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1834         return *p;
1835 }
1836
1837 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1838 {
1839         ulong *p = &vmx->segment_cache.seg[seg].base;
1840
1841         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1842                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1843         return *p;
1844 }
1845
1846 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1847 {
1848         u32 *p = &vmx->segment_cache.seg[seg].limit;
1849
1850         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1851                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1852         return *p;
1853 }
1854
1855 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1856 {
1857         u32 *p = &vmx->segment_cache.seg[seg].ar;
1858
1859         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1860                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1861         return *p;
1862 }
1863
1864 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1865 {
1866         u32 eb;
1867
1868         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1869              (1u << DB_VECTOR) | (1u << AC_VECTOR);
1870         if ((vcpu->guest_debug &
1871              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1872             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1873                 eb |= 1u << BP_VECTOR;
1874         if (to_vmx(vcpu)->rmode.vm86_active)
1875                 eb = ~0;
1876         if (enable_ept)
1877                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1878
1879         /* When we are running a nested L2 guest and L1 specified for it a
1880          * certain exception bitmap, we must trap the same exceptions and pass
1881          * them to L1. When running L2, we will only handle the exceptions
1882          * specified above if L1 did not want them.
1883          */
1884         if (is_guest_mode(vcpu))
1885                 eb |= get_vmcs12(vcpu)->exception_bitmap;
1886
1887         vmcs_write32(EXCEPTION_BITMAP, eb);
1888 }
1889
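/*
 * MSRs that must be switched between host and guest values are handled
 * either through a dedicated VM-entry/VM-exit control (the *_special
 * helpers) or, when no such control exists, through the VMCS MSR
 * autoload lists tracked in vmx->msr_autoload.
 */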
1890 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1891                 unsigned long entry, unsigned long exit)
1892 {
1893         vm_entry_controls_clearbit(vmx, entry);
1894         vm_exit_controls_clearbit(vmx, exit);
1895 }
1896
1897 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1898 {
1899         unsigned i;
1900         struct msr_autoload *m = &vmx->msr_autoload;
1901
1902         switch (msr) {
1903         case MSR_EFER:
1904                 if (cpu_has_load_ia32_efer) {
1905                         clear_atomic_switch_msr_special(vmx,
1906                                         VM_ENTRY_LOAD_IA32_EFER,
1907                                         VM_EXIT_LOAD_IA32_EFER);
1908                         return;
1909                 }
1910                 break;
1911         case MSR_CORE_PERF_GLOBAL_CTRL:
1912                 if (cpu_has_load_perf_global_ctrl) {
1913                         clear_atomic_switch_msr_special(vmx,
1914                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1915                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1916                         return;
1917                 }
1918                 break;
1919         }
1920
1921         for (i = 0; i < m->nr; ++i)
1922                 if (m->guest[i].index == msr)
1923                         break;
1924
1925         if (i == m->nr)
1926                 return;
1927         --m->nr;
1928         m->guest[i] = m->guest[m->nr];
1929         m->host[i] = m->host[m->nr];
1930         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1931         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1932 }
1933
1934 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1935                 unsigned long entry, unsigned long exit,
1936                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1937                 u64 guest_val, u64 host_val)
1938 {
1939         vmcs_write64(guest_val_vmcs, guest_val);
1940         vmcs_write64(host_val_vmcs, host_val);
1941         vm_entry_controls_setbit(vmx, entry);
1942         vm_exit_controls_setbit(vmx, exit);
1943 }
1944
1945 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1946                                   u64 guest_val, u64 host_val)
1947 {
1948         unsigned i;
1949         struct msr_autoload *m = &vmx->msr_autoload;
1950
1951         switch (msr) {
1952         case MSR_EFER:
1953                 if (cpu_has_load_ia32_efer) {
1954                         add_atomic_switch_msr_special(vmx,
1955                                         VM_ENTRY_LOAD_IA32_EFER,
1956                                         VM_EXIT_LOAD_IA32_EFER,
1957                                         GUEST_IA32_EFER,
1958                                         HOST_IA32_EFER,
1959                                         guest_val, host_val);
1960                         return;
1961                 }
1962                 break;
1963         case MSR_CORE_PERF_GLOBAL_CTRL:
1964                 if (cpu_has_load_perf_global_ctrl) {
1965                         add_atomic_switch_msr_special(vmx,
1966                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1967                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1968                                         GUEST_IA32_PERF_GLOBAL_CTRL,
1969                                         HOST_IA32_PERF_GLOBAL_CTRL,
1970                                         guest_val, host_val);
1971                         return;
1972                 }
1973                 break;
1974         case MSR_IA32_PEBS_ENABLE:
1975                 /* PEBS needs a quiescent period after being disabled (to write
1976                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
1977                  * provide that period, so a CPU could write host's record into
1978                  * guest's memory.
1979                  */
1980                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1981         }
1982
1983         for (i = 0; i < m->nr; ++i)
1984                 if (m->guest[i].index == msr)
1985                         break;
1986
1987         if (i == NR_AUTOLOAD_MSRS) {
1988                 printk_once(KERN_WARNING "Not enough msr switch entries. "
1989                                 "Can't add msr %x\n", msr);
1990                 return;
1991         } else if (i == m->nr) {
1992                 ++m->nr;
1993                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1994                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1995         }
1996
1997         m->guest[i].index = msr;
1998         m->guest[i].value = guest_val;
1999         m->host[i].index = msr;
2000         m->host[i].value = host_val;
2001 }
2002
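/*
 * Decide how EFER is switched across VM entry/exit: atomically via the
 * VMCS (return false) or through the shared-MSR slot at @efer_offset
 * (return true), in which case the caller keeps that entry in the list
 * of MSRs to save/restore.
 */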
2003 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2004 {
2005         u64 guest_efer = vmx->vcpu.arch.efer;
2006         u64 ignore_bits = 0;
2007
2008         if (!enable_ept) {
2009                 /*
2010                  * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
2011                  * host CPUID is more efficient than testing guest CPUID
2012                  * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
2013                  */
2014                 if (boot_cpu_has(X86_FEATURE_SMEP))
2015                         guest_efer |= EFER_NX;
2016                 else if (!(guest_efer & EFER_NX))
2017                         ignore_bits |= EFER_NX;
2018         }
2019
2020         /*
2021          * LMA and LME handled by hardware; SCE meaningless outside long mode.
2022          */
2023         ignore_bits |= EFER_SCE;
2024 #ifdef CONFIG_X86_64
2025         ignore_bits |= EFER_LMA | EFER_LME;
2026         /* SCE is meaningful only in long mode on Intel */
2027         if (guest_efer & EFER_LMA)
2028                 ignore_bits &= ~(u64)EFER_SCE;
2029 #endif
2030
2031         clear_atomic_switch_msr(vmx, MSR_EFER);
2032
2033         /*
2034          * On EPT, we can't emulate NX, so we must switch EFER atomically.
2035          * On CPUs that support "load IA32_EFER", always switch EFER
2036          * atomically, since it's faster than switching it manually.
2037          */
2038         if (cpu_has_load_ia32_efer ||
2039             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
2040                 if (!(guest_efer & EFER_LMA))
2041                         guest_efer &= ~EFER_LME;
2042                 if (guest_efer != host_efer)
2043                         add_atomic_switch_msr(vmx, MSR_EFER,
2044                                               guest_efer, host_efer);
2045                 return false;
2046         } else {
2047                 guest_efer &= ~ignore_bits;
2048                 guest_efer |= host_efer & ignore_bits;
2049
2050                 vmx->guest_msrs[efer_offset].data = guest_efer;
2051                 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
2052
2053                 return true;
2054         }
2055 }
2056
2057 #ifdef CONFIG_X86_32
2058 /*
2059  * On 32-bit kernels, VM exits still load the FS and GS bases from the
2060  * VMCS rather than the segment table.  KVM uses this helper to figure
2061  * out the current bases to poke them into the VMCS before entry.
2062  */
2063 static unsigned long segment_base(u16 selector)
2064 {
2065         struct desc_struct *table;
2066         unsigned long v;
2067
2068         if (!(selector & ~SEGMENT_RPL_MASK))
2069                 return 0;
2070
2071         table = get_current_gdt_ro();
2072
2073         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2074                 u16 ldt_selector = kvm_read_ldt();
2075
2076                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2077                         return 0;
2078
2079                 table = (struct desc_struct *)segment_base(ldt_selector);
2080         }
2081         v = get_desc_base(&table[selector >> 3]);
2082         return v;
2083 }
2084 #endif
2085
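/*
 * Save the host segment selectors and bases that running the guest may
 * clobber and load the guest values of the shared MSRs.  The 'loaded'
 * flag makes this idempotent until __vmx_load_host_state() restores the
 * host state.
 */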
2086 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
2087 {
2088         struct vcpu_vmx *vmx = to_vmx(vcpu);
2089         int i;
2090
2091         if (vmx->host_state.loaded)
2092                 return;
2093
2094         vmx->host_state.loaded = 1;
2095         /*
2096          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2097          * allow segment selectors with cpl > 0 or ti == 1.
2098          */
2099         vmx->host_state.ldt_sel = kvm_read_ldt();
2100         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
2101         savesegment(fs, vmx->host_state.fs_sel);
2102         if (!(vmx->host_state.fs_sel & 7)) {
2103                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
2104                 vmx->host_state.fs_reload_needed = 0;
2105         } else {
2106                 vmcs_write16(HOST_FS_SELECTOR, 0);
2107                 vmx->host_state.fs_reload_needed = 1;
2108         }
2109         savesegment(gs, vmx->host_state.gs_sel);
2110         if (!(vmx->host_state.gs_sel & 7))
2111                 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
2112         else {
2113                 vmcs_write16(HOST_GS_SELECTOR, 0);
2114                 vmx->host_state.gs_ldt_reload_needed = 1;
2115         }
2116
2117 #ifdef CONFIG_X86_64
2118         savesegment(ds, vmx->host_state.ds_sel);
2119         savesegment(es, vmx->host_state.es_sel);
2120 #endif
2121
2122 #ifdef CONFIG_X86_64
2123         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
2124         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2125 #else
2126         vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
2127         vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
2128 #endif
2129
2130 #ifdef CONFIG_X86_64
2131         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2132         if (is_long_mode(&vmx->vcpu))
2133                 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2134 #endif
2135         if (boot_cpu_has(X86_FEATURE_MPX))
2136                 rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
2137         for (i = 0; i < vmx->save_nmsrs; ++i)
2138                 kvm_set_shared_msr(vmx->guest_msrs[i].index,
2139                                    vmx->guest_msrs[i].data,
2140                                    vmx->guest_msrs[i].mask);
2141 }
2142
2143 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
2144 {
2145         if (!vmx->host_state.loaded)
2146                 return;
2147
2148         ++vmx->vcpu.stat.host_state_reload;
2149         vmx->host_state.loaded = 0;
2150 #ifdef CONFIG_X86_64
2151         if (is_long_mode(&vmx->vcpu))
2152                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2153 #endif
2154         if (vmx->host_state.gs_ldt_reload_needed) {
2155                 kvm_load_ldt(vmx->host_state.ldt_sel);
2156 #ifdef CONFIG_X86_64
2157                 load_gs_index(vmx->host_state.gs_sel);
2158 #else
2159                 loadsegment(gs, vmx->host_state.gs_sel);
2160 #endif
2161         }
2162         if (vmx->host_state.fs_reload_needed)
2163                 loadsegment(fs, vmx->host_state.fs_sel);
2164 #ifdef CONFIG_X86_64
2165         if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
2166                 loadsegment(ds, vmx->host_state.ds_sel);
2167                 loadsegment(es, vmx->host_state.es_sel);
2168         }
2169 #endif
2170         invalidate_tss_limit();
2171 #ifdef CONFIG_X86_64
2172         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2173 #endif
2174         if (vmx->host_state.msr_host_bndcfgs)
2175                 wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
2176         load_fixmap_gdt(raw_smp_processor_id());
2177 }
2178
2179 static void vmx_load_host_state(struct vcpu_vmx *vmx)
2180 {
2181         preempt_disable();
2182         __vmx_load_host_state(vmx);
2183         preempt_enable();
2184 }
2185
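/*
 * Update the posted-interrupt descriptor when a vcpu is loaded on a cpu:
 * retarget the notification destination to the new cpu and clear the
 * suppress-notification bit.  Only relevant with an assigned device,
 * posted-interrupt capable IRQ remapping and active APICv.
 */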
2186 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
2187 {
2188         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2189         struct pi_desc old, new;
2190         unsigned int dest;
2191
2192         if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
2193                 !irq_remapping_cap(IRQ_POSTING_CAP)  ||
2194                 !kvm_vcpu_apicv_active(vcpu))
2195                 return;
2196
2197         do {
2198                 old.control = new.control = pi_desc->control;
2199
2200                 /*
2201                  * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
2202                  * are two possible cases:
2203                  * 1. After running 'pre_block', a context switch
2204                  *    happened.  In this case, 'sn' was set in
2205                  *    vmx_vcpu_put(), so it needs to be cleared here.
2206                  * 2. After running 'pre_block', the vcpu blocked and was
2207                  *    later woken up by someone else.  In this case
2208                  *    nothing needs to be done; 'pi_post_block' handles
2209                  *    everything.  Case #1 and case #2 cannot be
2210                  *    distinguished here (and do not need to be), so
2211                  *    'sn' is simply cleared in both cases, which is
2212                  *    harmless.
2213                  */
2214                 if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
2215                         if (vcpu->cpu != cpu) {
2216                                 dest = cpu_physical_id(cpu);
2217
2218                                 if (x2apic_enabled())
2219                                         new.ndst = dest;
2220                                 else
2221                                         new.ndst = (dest << 8) & 0xFF00;
2222                         }
2223
2224                         /* set 'NV' to 'notification vector' */
2225                         new.nv = POSTED_INTR_VECTOR;
2226                 }
2227
2228                 /* Allow posting non-urgent interrupts */
2229                 new.sn = 0;
2230         } while (cmpxchg(&pi_desc->control, old.control,
2231                         new.control) != old.control);
2232 }
2233
2234 static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
2235 {
2236         vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
2237         vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
2238 }
2239
2240 /*
2241  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
2242  * vcpu mutex is already taken.
2243  */
2244 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2245 {
2246         struct vcpu_vmx *vmx = to_vmx(vcpu);
2247         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
2248
2249         if (!already_loaded) {
2250                 loaded_vmcs_clear(vmx->loaded_vmcs);
2251                 local_irq_disable();
2252                 crash_disable_local_vmclear(cpu);
2253
2254                 /*
2255                  * Read loaded_vmcs->cpu should be before fetching
2256                  * loaded_vmcs->loaded_vmcss_on_cpu_link.
2257                  * See the comments in __loaded_vmcs_clear().
2258                  */
2259                 smp_rmb();
2260
2261                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
2262                          &per_cpu(loaded_vmcss_on_cpu, cpu));
2263                 crash_enable_local_vmclear(cpu);
2264                 local_irq_enable();
2265         }
2266
2267         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2268                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2269                 vmcs_load(vmx->loaded_vmcs->vmcs);
2270         }
2271
2272         if (!already_loaded) {
2273                 void *gdt = get_current_gdt_ro();
2274                 unsigned long sysenter_esp;
2275
2276                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2277
2278                 /*
2279                  * Linux uses per-cpu TSS and GDT, so set these when switching
2280                  * processors.  See 22.2.4.
2281                  */
2282                 vmcs_writel(HOST_TR_BASE,
2283                             (unsigned long)this_cpu_ptr(&cpu_tss));
2284                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
2285
2286                 /*
2287                  * A VM exit changes the host TR limit to 0x67.  This is
2288                  * okay, since 0x67 covers everything except the IO bitmap
2289                  * and we have code to handle the IO bitmap being lost
2290                  * after a VM exit.
2291                  */
2292                 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
2293
2294                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
2295                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
2296
2297                 vmx->loaded_vmcs->cpu = cpu;
2298         }
2299
2300         /* Setup TSC multiplier */
2301         if (kvm_has_tsc_control &&
2302             vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
2303                 decache_tsc_multiplier(vmx);
2304
2305         vmx_vcpu_pi_load(vcpu, cpu);
2306         vmx->host_pkru = read_pkru();
2307 }
2308
2309 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
2310 {
2311         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2312
2313         if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
2314                 !irq_remapping_cap(IRQ_POSTING_CAP)  ||
2315                 !kvm_vcpu_apicv_active(vcpu))
2316                 return;
2317
2318         /* Set SN when the vCPU is preempted */
2319         if (vcpu->preempted)
2320                 pi_set_sn(pi_desc);
2321 }
2322
2323 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
2324 {
2325         vmx_vcpu_pi_put(vcpu);
2326
2327         __vmx_load_host_state(to_vmx(vcpu));
2328 }
2329
2330 static bool emulation_required(struct kvm_vcpu *vcpu)
2331 {
2332         return emulate_invalid_guest_state && !guest_state_valid(vcpu);
2333 }
2334
2335 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
2336
2337 /*
2338  * Return the cr0 value that a nested guest would read. This is a combination
2339  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
2340  * its hypervisor (cr0_read_shadow).
2341  */
2342 static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
2343 {
2344         return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
2345                 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
2346 }
2347 static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
2348 {
2349         return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
2350                 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
2351 }
2352
2353 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2354 {
2355         unsigned long rflags, save_rflags;
2356
2357         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
2358                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2359                 rflags = vmcs_readl(GUEST_RFLAGS);
2360                 if (to_vmx(vcpu)->rmode.vm86_active) {
2361                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2362                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
2363                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2364                 }
2365                 to_vmx(vcpu)->rflags = rflags;
2366         }
2367         return to_vmx(vcpu)->rflags;
2368 }
2369
2370 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
2371 {
2372         unsigned long old_rflags = vmx_get_rflags(vcpu);
2373
2374         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2375         to_vmx(vcpu)->rflags = rflags;
2376         if (to_vmx(vcpu)->rmode.vm86_active) {
2377                 to_vmx(vcpu)->rmode.save_rflags = rflags;
2378                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2379         }
2380         vmcs_writel(GUEST_RFLAGS, rflags);
2381
2382         if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
2383                 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
2384 }
2385
2386 static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
2387 {
2388         return to_vmx(vcpu)->guest_pkru;
2389 }
2390
2391 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2392 {
2393         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2394         int ret = 0;
2395
2396         if (interruptibility & GUEST_INTR_STATE_STI)
2397                 ret |= KVM_X86_SHADOW_INT_STI;
2398         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
2399                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
2400
2401         return ret;
2402 }
2403
2404 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
2405 {
2406         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2407         u32 interruptibility = interruptibility_old;
2408
2409         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
2410
2411         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2412                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
2413         else if (mask & KVM_X86_SHADOW_INT_STI)
2414                 interruptibility |= GUEST_INTR_STATE_STI;
2415
2416         if (interruptibility != interruptibility_old)
2417                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
2418 }
2419
2420 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
2421 {
2422         unsigned long rip;
2423
2424         rip = kvm_rip_read(vcpu);
2425         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2426         kvm_rip_write(vcpu, rip);
2427
2428         /* skipping an emulated instruction also counts */
2429         vmx_set_interrupt_shadow(vcpu, 0);
2430 }
2431
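/*
 * Reflect an exception that L1 intercepts into L1 as an EXCEPTION_NMI
 * vmexit, building the interruption info from the queued exception and
 * passing @exit_qual as the exit qualification.
 */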
2432 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
2433                                                unsigned long exit_qual)
2434 {
2435         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2436         unsigned int nr = vcpu->arch.exception.nr;
2437         u32 intr_info = nr | INTR_INFO_VALID_MASK;
2438
2439         if (vcpu->arch.exception.has_error_code) {
2440                 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
2441                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2442         }
2443
2444         if (kvm_exception_is_soft(nr))
2445                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2446         else
2447                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2448
2449         if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
2450             vmx_get_nmi_mask(vcpu))
2451                 intr_info |= INTR_INFO_UNBLOCK_NMI;
2452
2453         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
2454 }
2455
2456 /*
2457  * KVM wants to inject page faults that it received into the guest. This
2458  * function checks whether, in a nested guest, they need to go to L1 or L2.
2459  */
2460 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
2461 {
2462         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2463         unsigned int nr = vcpu->arch.exception.nr;
2464
2465         if (nr == PF_VECTOR) {
2466                 if (vcpu->arch.exception.nested_apf) {
2467                         nested_vmx_inject_exception_vmexit(vcpu,
2468                                                            vcpu->arch.apf.nested_apf_token);
2469                         return 1;
2470                 }
2471                 /*
2472                  * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
2473                  * The fix is to add the ancillary datum (CR2 or DR6) to structs
2474                  * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
2475                  * can be written only when inject_pending_event runs.  This should be
2476                  * conditional on a new capability---if the capability is disabled,
2477                  * kvm_multiple_exception would write the ancillary information to
2478                  * CR2 or DR6, for backwards ABI-compatibility.
2479                  */
2480                 if (nested_vmx_is_page_fault_vmexit(vmcs12,
2481                                                     vcpu->arch.exception.error_code)) {
2482                         nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
2483                         return 1;
2484                 }
2485         } else {
2486                 unsigned long exit_qual = 0;
2487                 if (nr == DB_VECTOR)
2488                         exit_qual = vcpu->arch.dr6;
2489
2490                 if (vmcs12->exception_bitmap & (1u << nr)) {
2491                         nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
2492                         return 1;
2493                 }
2494         }
2495
2496         return 0;
2497 }
2498
2499 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
2500 {
2501         struct vcpu_vmx *vmx = to_vmx(vcpu);
2502         unsigned nr = vcpu->arch.exception.nr;
2503         bool has_error_code = vcpu->arch.exception.has_error_code;
2504         bool reinject = vcpu->arch.exception.reinject;
2505         u32 error_code = vcpu->arch.exception.error_code;
2506         u32 intr_info = nr | INTR_INFO_VALID_MASK;
2507
2508         if (!reinject && is_guest_mode(vcpu) &&
2509             nested_vmx_check_exception(vcpu))
2510                 return;
2511
2512         if (has_error_code) {
2513                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2514                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2515         }
2516
2517         if (vmx->rmode.vm86_active) {
2518                 int inc_eip = 0;
2519                 if (kvm_exception_is_soft(nr))
2520                         inc_eip = vcpu->arch.event_exit_inst_len;
2521                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
2522                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2523                 return;
2524         }
2525
2526         if (kvm_exception_is_soft(nr)) {
2527                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2528                              vmx->vcpu.arch.event_exit_inst_len);
2529                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2530         } else
2531                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2532
2533         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2534 }
2535
2536 static bool vmx_rdtscp_supported(void)
2537 {
2538         return cpu_has_vmx_rdtscp();
2539 }
2540
2541 static bool vmx_invpcid_supported(void)
2542 {
2543         return cpu_has_vmx_invpcid() && enable_ept;
2544 }
2545
2546 /*
2547  * Swap MSR entry in host/guest MSR entry array.
2548  */
2549 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2550 {
2551         struct shared_msr_entry tmp;
2552
2553         tmp = vmx->guest_msrs[to];
2554         vmx->guest_msrs[to] = vmx->guest_msrs[from];
2555         vmx->guest_msrs[from] = tmp;
2556 }
2557
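/*
 * Point the hardware MSR bitmap at the variant matching the current mode:
 * the L1-provided bitmap while running a nested guest, otherwise one of
 * the legacy/long-mode bitmaps (with x2APIC and, if active, APICv
 * specific variants).
 */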
2558 static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2559 {
2560         unsigned long *msr_bitmap;
2561
2562         if (is_guest_mode(vcpu))
2563                 msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
2564         else if (cpu_has_secondary_exec_ctrls() &&
2565                  (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
2566                   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
2567                 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
2568                         if (is_long_mode(vcpu))
2569                                 msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
2570                         else
2571                                 msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
2572                 } else {
2573                         if (is_long_mode(vcpu))
2574                                 msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2575                         else
2576                                 msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
2577                 }
2578         } else {
2579                 if (is_long_mode(vcpu))
2580                         msr_bitmap = vmx_msr_bitmap_longmode;
2581                 else
2582                         msr_bitmap = vmx_msr_bitmap_legacy;
2583         }
2584
2585         vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
2586 }
2587
2588 /*
2589  * Set up the vmcs to automatically save and restore system
2590  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
2591  * mode, as fiddling with msrs is very expensive.
2592  */
2593 static void setup_msrs(struct vcpu_vmx *vmx)
2594 {
2595         int save_nmsrs, index;
2596
2597         save_nmsrs = 0;
2598 #ifdef CONFIG_X86_64
2599         if (is_long_mode(&vmx->vcpu)) {
2600                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2601                 if (index >= 0)
2602                         move_msr_up(vmx, index, save_nmsrs++);
2603                 index = __find_msr_index(vmx, MSR_LSTAR);
2604                 if (index >= 0)
2605                         move_msr_up(vmx, index, save_nmsrs++);
2606                 index = __find_msr_index(vmx, MSR_CSTAR);
2607                 if (index >= 0)
2608                         move_msr_up(vmx, index, save_nmsrs++);
2609                 index = __find_msr_index(vmx, MSR_TSC_AUX);
2610                 if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
2611                         move_msr_up(vmx, index, save_nmsrs++);
2612                 /*
2613                  * MSR_STAR is only needed on long mode guests, and only
2614                  * if efer.sce is enabled.
2615                  */
2616                 index = __find_msr_index(vmx, MSR_STAR);
2617                 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
2618                         move_msr_up(vmx, index, save_nmsrs++);
2619         }
2620 #endif
2621         index = __find_msr_index(vmx, MSR_EFER);
2622         if (index >= 0 && update_transition_efer(vmx, index))
2623                 move_msr_up(vmx, index, save_nmsrs++);
2624
2625         vmx->save_nmsrs = save_nmsrs;
2626
2627         if (cpu_has_vmx_msr_bitmap())
2628                 vmx_set_msr_bitmap(&vmx->vcpu);
2629 }
2630
2631 /*
2632  * reads and returns guest's timestamp counter "register"
2633  * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
2634  * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
2635  */
2636 static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
2637 {
2638         u64 host_tsc, tsc_offset;
2639
2640         host_tsc = rdtsc();
2641         tsc_offset = vmcs_read64(TSC_OFFSET);
2642         return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
2643 }
2644
2645 /*
2646  * writes 'offset' into guest's timestamp counter offset register
2647  */
2648 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2649 {
2650         if (is_guest_mode(vcpu)) {
2651                 /*
2652                  * We're here if L1 chose not to trap WRMSR to TSC. According
2653                  * to the spec, this should set L1's TSC; The offset that L1
2654                  * set for L2 remains unchanged, and still needs to be added
2655                  * to the newly set TSC to get L2's TSC.
2656                  */
2657                 struct vmcs12 *vmcs12;
2658                 /* recalculate vmcs02.TSC_OFFSET: */
2659                 vmcs12 = get_vmcs12(vcpu);
2660                 vmcs_write64(TSC_OFFSET, offset +
2661                         (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2662                          vmcs12->tsc_offset : 0));
2663         } else {
2664                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2665                                            vmcs_read64(TSC_OFFSET), offset);
2666                 vmcs_write64(TSC_OFFSET, offset);
2667         }
2668 }
2669
2670 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
2671 {
2672         struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
2673         return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
2674 }
2675
2676 /*
2677  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2678  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2679  * all guests if the "nested" module option is off, and can also be disabled
2680  * for a single guest by disabling its VMX cpuid bit.
2681  */
2682 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2683 {
2684         return nested && guest_cpuid_has_vmx(vcpu);
2685 }
2686
2687 /*
2688  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
2689  * returned for the various VMX controls MSRs when nested VMX is enabled.
2690  * The same values should also be used to verify that vmcs12 control fields are
2691  * valid during nested entry from L1 to L2.
2692  * Each of these control msrs has a low and high 32-bit half: A low bit is on
2693  * if the corresponding bit in the (32-bit) control field *must* be on, and a
2694  * bit in the high half is on if the corresponding bit in the control field
2695  * may be on. See also vmx_control_verify().
2696  */
2697 static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2698 {
2699         /*
2700          * Note that as a general rule, the high half of the MSRs (bits in
2701          * the control fields which may be 1) should be initialized by the
2702          * intersection of the underlying hardware's MSR (i.e., features which
2703          * can be supported) and the list of features we want to expose -
2704          * because they are known to be properly supported in our code.
2705          * Also, usually, the low half of the MSRs (bits which must be 1) can
2706          * be set to 0, meaning that L1 may turn off any of these bits. The
2707          * reason is that if one of these bits is necessary, it will appear
2708          * in vmcs01, and prepare_vmcs02, which bitwise-or's the control
2709          * fields of vmcs01 and vmcs12, will turn it back on in vmcs02 - and
2710          * nested_vmx_exit_reflected() will not pass the related exits to L1.
2711          * These rules have exceptions below.
2712          */
2713
2714         /* pin-based controls */
2715         rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2716                 vmx->nested.nested_vmx_pinbased_ctls_low,
2717                 vmx->nested.nested_vmx_pinbased_ctls_high);
2718         vmx->nested.nested_vmx_pinbased_ctls_low |=
2719                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2720         vmx->nested.nested_vmx_pinbased_ctls_high &=
2721                 PIN_BASED_EXT_INTR_MASK |
2722                 PIN_BASED_NMI_EXITING |
2723                 PIN_BASED_VIRTUAL_NMIS;
2724         vmx->nested.nested_vmx_pinbased_ctls_high |=
2725                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2726                 PIN_BASED_VMX_PREEMPTION_TIMER;
2727         if (kvm_vcpu_apicv_active(&vmx->vcpu))
2728                 vmx->nested.nested_vmx_pinbased_ctls_high |=
2729                         PIN_BASED_POSTED_INTR;
2730
2731         /* exit controls */
2732         rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2733                 vmx->nested.nested_vmx_exit_ctls_low,
2734                 vmx->nested.nested_vmx_exit_ctls_high);
2735         vmx->nested.nested_vmx_exit_ctls_low =
2736                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2737
2738         vmx->nested.nested_vmx_exit_ctls_high &=
2739 #ifdef CONFIG_X86_64
2740                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2741 #endif
2742                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2743         vmx->nested.nested_vmx_exit_ctls_high |=
2744                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2745                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2746                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2747
2748         if (kvm_mpx_supported())
2749                 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2750
2751         /* We support free control of debug control saving. */
2752         vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2753
2754         /* entry controls */
2755         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2756                 vmx->nested.nested_vmx_entry_ctls_low,
2757                 vmx->nested.nested_vmx_entry_ctls_high);
2758         vmx->nested.nested_vmx_entry_ctls_low =
2759                 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2760         vmx->nested.nested_vmx_entry_ctls_high &=
2761 #ifdef CONFIG_X86_64
2762                 VM_ENTRY_IA32E_MODE |
2763 #endif
2764                 VM_ENTRY_LOAD_IA32_PAT;
2765         vmx->nested.nested_vmx_entry_ctls_high |=
2766                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
2767         if (kvm_mpx_supported())
2768                 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2769
2770         /* We support free control of debug control loading. */
2771         vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2772
2773         /* cpu-based controls */
2774         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2775                 vmx->nested.nested_vmx_procbased_ctls_low,
2776                 vmx->nested.nested_vmx_procbased_ctls_high);
2777         vmx->nested.nested_vmx_procbased_ctls_low =
2778                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2779         vmx->nested.nested_vmx_procbased_ctls_high &=
2780                 CPU_BASED_VIRTUAL_INTR_PENDING |
2781                 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2782                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
2783                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
2784                 CPU_BASED_CR3_STORE_EXITING |
2785 #ifdef CONFIG_X86_64
2786                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
2787 #endif
2788                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2789                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
2790                 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
2791                 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
2792                 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2793         /*
2794          * We can allow some features even when not supported by the
2795          * hardware. For example, L1 can specify an MSR bitmap - and we
2796          * can use it to avoid exits to L1 - even when L0 runs L2
2797          * without MSR bitmaps.
2798          */
2799         vmx->nested.nested_vmx_procbased_ctls_high |=
2800                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2801                 CPU_BASED_USE_MSR_BITMAPS;
2802
2803         /* We support free control of CR3 access interception. */
2804         vmx->nested.nested_vmx_procbased_ctls_low &=
2805                 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2806
2807         /* secondary cpu-based controls */
2808         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2809                 vmx->nested.nested_vmx_secondary_ctls_low,
2810                 vmx->nested.nested_vmx_secondary_ctls_high);
2811         vmx->nested.nested_vmx_secondary_ctls_low = 0;
2812         vmx->nested.nested_vmx_secondary_ctls_high &=
2813                 SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED |
2814                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2815                 SECONDARY_EXEC_RDTSCP |
2816                 SECONDARY_EXEC_DESC |
2817                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2818                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2819                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2820                 SECONDARY_EXEC_WBINVD_EXITING |
2821                 SECONDARY_EXEC_XSAVES;
2822
2823         if (enable_ept) {
2824                 /* nested EPT: emulate EPT also to L1 */
2825                 vmx->nested.nested_vmx_secondary_ctls_high |=
2826                         SECONDARY_EXEC_ENABLE_EPT;
2827                 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2828                          VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
2829                 if (cpu_has_vmx_ept_execute_only())
2830                         vmx->nested.nested_vmx_ept_caps |=
2831                                 VMX_EPT_EXECUTE_ONLY_BIT;
2832                 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
2833                 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2834                         VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
2835                         VMX_EPT_1GB_PAGE_BIT;
2836                 if (enable_ept_ad_bits) {
2837                         vmx->nested.nested_vmx_secondary_ctls_high |=
2838                                 SECONDARY_EXEC_ENABLE_PML;
2839                         vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
2840                 }
2841         } else
2842                 vmx->nested.nested_vmx_ept_caps = 0;
2843
2844         /*
2845          * Old versions of KVM use the single-context version without
2846          * checking for support, so declare that it is supported even
2847          * though it is treated as global context.  The alternative is
2848          * not failing the single-context invvpid, and it is worse.
2849          */
2850         if (enable_vpid) {
2851                 vmx->nested.nested_vmx_secondary_ctls_high |=
2852                         SECONDARY_EXEC_ENABLE_VPID;
2853                 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
2854                         VMX_VPID_EXTENT_SUPPORTED_MASK;
2855         } else
2856                 vmx->nested.nested_vmx_vpid_caps = 0;
2857
2858         if (enable_unrestricted_guest)
2859                 vmx->nested.nested_vmx_secondary_ctls_high |=
2860                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
2861
2862         /* miscellaneous data */
2863         rdmsr(MSR_IA32_VMX_MISC,
2864                 vmx->nested.nested_vmx_misc_low,
2865                 vmx->nested.nested_vmx_misc_high);
2866         vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
2867         vmx->nested.nested_vmx_misc_low |=
2868                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2869                 VMX_MISC_ACTIVITY_HLT;
2870         vmx->nested.nested_vmx_misc_high = 0;
2871
2872         /*
2873          * This MSR reports some information about VMX support. We
2874          * should return information about the VMX we emulate for the
2875          * guest, and the VMCS structure we give it - not about the
2876          * VMX support of the underlying hardware.
2877          */
2878         vmx->nested.nested_vmx_basic =
2879                 VMCS12_REVISION |
2880                 VMX_BASIC_TRUE_CTLS |
2881                 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
2882                 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
2883
2884         if (cpu_has_vmx_basic_inout())
2885                 vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;
2886
2887         /*
2888          * These MSRs specify bits which the guest must keep fixed on
2889          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
2890          * We picked the standard core2 setting.
2891          */
2892 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
2893 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
2894         vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
2895         vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
2896
2897         /* These MSRs specify bits which the guest must keep fixed off. */
2898         rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1);
2899         rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
2900
2901         /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2902         vmx->nested.nested_vmx_vmcs_enum = 0x2e;
2903 }
2904
2905 /*
2906  * if fixed0[i] == 1: val[i] must be 1
2907  * if fixed1[i] == 0: val[i] must be 0
2908  */
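/*
 * A worked example with illustrative values (not taken from hardware):
 * for fixed0 = 0101b and fixed1 = 0111b, val = 0101b and val = 0111b
 * are valid, while val = 0100b (clears a must-be-1 bit) and
 * val = 1101b (sets a must-be-0 bit) are not.
 */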
2909 static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
2910 {
2911         return ((val & fixed1) | fixed0) == val;
2912 }
2913
2914 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
2915 {
2916         return fixed_bits_valid(control, low, high);
2917 }
2918
2919 static inline u64 vmx_control_msr(u32 low, u32 high)
2920 {
2921         return low | ((u64)high << 32);
2922 }
2923
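/*
 * Returns true if, considering only the bits in @mask, every bit set in
 * @subset is also set in @superset, i.e. @subset grants nothing beyond
 * @superset.
 */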
2924 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
2925 {
2926         superset &= mask;
2927         subset &= mask;
2928
2929         return (superset | subset) == superset;
2930 }
2931
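/*
 * In IA32_VMX_BASIC, bit 48 reports that VMX structure addresses are
 * limited to 32 bits, bit 49 reports dual-monitor SMM treatment, bit 54
 * reports INS/OUTS exit-information and bit 55 reports availability of
 * the "true" control capability MSRs.
 */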
2932 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
2933 {
2934         const u64 feature_and_reserved =
2935                 /* feature (except bit 48; see below) */
2936                 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
2937                 /* reserved */
2938                 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
2939         u64 vmx_basic = vmx->nested.nested_vmx_basic;
2940
2941         if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
2942                 return -EINVAL;
2943
2944         /*
2945          * KVM does not emulate a version of VMX that constrains physical
2946          * addresses of VMX structures (e.g. VMCS) to 32-bits.
2947          */
2948         if (data & BIT_ULL(48))
2949                 return -EINVAL;
2950
2951         if (vmx_basic_vmcs_revision_id(vmx_basic) !=
2952             vmx_basic_vmcs_revision_id(data))
2953                 return -EINVAL;
2954
2955         if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
2956                 return -EINVAL;
2957
2958         vmx->nested.nested_vmx_basic = data;
2959         return 0;
2960 }
2961
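/*
 * The "true" control capability MSRs put the allowed 0-settings in the
 * low 32 bits (a 1 there means the control must be 1) and the allowed
 * 1-settings in the high 32 bits (a 0 there means the control must be 0).
 * Userspace may only tighten what was advertised, never loosen it.
 */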
2962 static int
2963 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
2964 {
2965         u64 supported;
2966         u32 *lowp, *highp;
2967
2968         switch (msr_index) {
2969         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2970                 lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
2971                 highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
2972                 break;
2973         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2974                 lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
2975                 highp = &vmx->nested.nested_vmx_procbased_ctls_high;
2976                 break;
2977         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2978                 lowp = &vmx->nested.nested_vmx_exit_ctls_low;
2979                 highp = &vmx->nested.nested_vmx_exit_ctls_high;
2980                 break;
2981         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2982                 lowp = &vmx->nested.nested_vmx_entry_ctls_low;
2983                 highp = &vmx->nested.nested_vmx_entry_ctls_high;
2984                 break;
2985         case MSR_IA32_VMX_PROCBASED_CTLS2:
2986                 lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
2987                 highp = &vmx->nested.nested_vmx_secondary_ctls_high;
2988                 break;
2989         default:
2990                 BUG();
2991         }
2992
2993         supported = vmx_control_msr(*lowp, *highp);
2994
2995         /* Check must-be-1 bits are still 1. */
2996         if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
2997                 return -EINVAL;
2998
2999         /* Check must-be-0 bits are still 0. */
3000         if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
3001                 return -EINVAL;
3002
3003         *lowp = data;
3004         *highp = data >> 32;
3005         return 0;
3006 }
3007
3008 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
3009 {
3010         const u64 feature_and_reserved_bits =
3011                 /* feature */
3012                 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
3013                 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
3014                 /* reserved */
3015                 GENMASK_ULL(13, 9) | BIT_ULL(31);
3016         u64 vmx_misc;
3017
3018         vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
3019                                    vmx->nested.nested_vmx_misc_high);
3020
3021         if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
3022                 return -EINVAL;
3023
3024         if ((vmx->nested.nested_vmx_pinbased_ctls_high &
3025              PIN_BASED_VMX_PREEMPTION_TIMER) &&
3026             vmx_misc_preemption_timer_rate(data) !=
3027             vmx_misc_preemption_timer_rate(vmx_misc))
3028                 return -EINVAL;
3029
3030         if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
3031                 return -EINVAL;
3032
3033         if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
3034                 return -EINVAL;
3035
3036         if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
3037                 return -EINVAL;
3038
3039         vmx->nested.nested_vmx_misc_low = data;
3040         vmx->nested.nested_vmx_misc_high = data >> 32;
3041         return 0;
3042 }
3043
3044 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
3045 {
3046         u64 vmx_ept_vpid_cap;
3047
3048         vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
3049                                            vmx->nested.nested_vmx_vpid_caps);
3050
3051         /* Every bit is either reserved or a feature bit. */
3052         if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
3053                 return -EINVAL;
3054
3055         vmx->nested.nested_vmx_ept_caps = data;
3056         vmx->nested.nested_vmx_vpid_caps = data >> 32;
3057         return 0;
3058 }
3059
3060 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3061 {
3062         u64 *msr;
3063
3064         switch (msr_index) {
3065         case MSR_IA32_VMX_CR0_FIXED0:
3066                 msr = &vmx->nested.nested_vmx_cr0_fixed0;
3067                 break;
3068         case MSR_IA32_VMX_CR4_FIXED0:
3069                 msr = &vmx->nested.nested_vmx_cr4_fixed0;
3070                 break;
3071         default:
3072                 BUG();
3073         }
3074
3075         /*
3076          * Bits that are 1 here (i.e. the "must-be-1" bits during VMX
3077          * operation) must also be 1 in the restored value.
3078          */
3079         if (!is_bitwise_subset(data, *msr, -1ULL))
3080                 return -EINVAL;
3081
3082         *msr = data;
3083         return 0;
3084 }
3085
3086 /*
3087  * Called when userspace is restoring VMX MSRs.
3088  *
3089  * Returns 0 on success, non-0 otherwise.
3090  */
3091 static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
3092 {
3093         struct vcpu_vmx *vmx = to_vmx(vcpu);
3094
3095         switch (msr_index) {
3096         case MSR_IA32_VMX_BASIC:
3097                 return vmx_restore_vmx_basic(vmx, data);
3098         case MSR_IA32_VMX_PINBASED_CTLS:
3099         case MSR_IA32_VMX_PROCBASED_CTLS:
3100         case MSR_IA32_VMX_EXIT_CTLS:
3101         case MSR_IA32_VMX_ENTRY_CTLS:
3102                 /*
3103                  * The "non-true" VMX capability MSRs are generated from the
3104                  * "true" MSRs, so we do not support restoring them directly.
3105                  *
3106                  * If userspace wants to emulate VMX_BASIC[55]=0, userspace
3107                  * should restore the "true" MSRs with the must-be-1 bits
3108                  * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
3109                  * DEFAULT SETTINGS".
3110                  */
3111                 return -EINVAL;
3112         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3113         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3114         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3115         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3116         case MSR_IA32_VMX_PROCBASED_CTLS2:
3117                 return vmx_restore_control_msr(vmx, msr_index, data);
3118         case MSR_IA32_VMX_MISC:
3119                 return vmx_restore_vmx_misc(vmx, data);
3120         case MSR_IA32_VMX_CR0_FIXED0:
3121         case MSR_IA32_VMX_CR4_FIXED0:
3122                 return vmx_restore_fixed0_msr(vmx, msr_index, data);
3123         case MSR_IA32_VMX_CR0_FIXED1:
3124         case MSR_IA32_VMX_CR4_FIXED1:
3125                 /*
3126                  * These MSRs are generated based on the vCPU's CPUID, so we
3127                  * do not support restoring them directly.
3128                  */
3129                 return -EINVAL;
3130         case MSR_IA32_VMX_EPT_VPID_CAP:
3131                 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
3132         case MSR_IA32_VMX_VMCS_ENUM:
3133                 vmx->nested.nested_vmx_vmcs_enum = data;
3134                 return 0;
3135         default:
3136                 /*
3137                  * The rest of the VMX capability MSRs do not support restore.
3138                  */
3139                 return -EINVAL;
3140         }
3141 }
3142
3143 /* Returns 0 on success, non-0 otherwise. */
3144 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
3145 {
3146         struct vcpu_vmx *vmx = to_vmx(vcpu);
3147
3148         switch (msr_index) {
3149         case MSR_IA32_VMX_BASIC:
3150                 *pdata = vmx->nested.nested_vmx_basic;
3151                 break;
3152         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3153         case MSR_IA32_VMX_PINBASED_CTLS:
3154                 *pdata = vmx_control_msr(
3155                         vmx->nested.nested_vmx_pinbased_ctls_low,
3156                         vmx->nested.nested_vmx_pinbased_ctls_high);
3157                 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
3158                         *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3159                 break;
3160         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3161         case MSR_IA32_VMX_PROCBASED_CTLS:
3162                 *pdata = vmx_control_msr(
3163                         vmx->nested.nested_vmx_procbased_ctls_low,
3164                         vmx->nested.nested_vmx_procbased_ctls_high);
3165                 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
3166                         *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3167                 break;
3168         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3169         case MSR_IA32_VMX_EXIT_CTLS:
3170                 *pdata = vmx_control_msr(
3171                         vmx->nested.nested_vmx_exit_ctls_low,
3172                         vmx->nested.nested_vmx_exit_ctls_high);
3173                 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
3174                         *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
3175                 break;
3176         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3177         case MSR_IA32_VMX_ENTRY_CTLS:
3178                 *pdata = vmx_control_msr(
3179                         vmx->nested.nested_vmx_entry_ctls_low,
3180                         vmx->nested.nested_vmx_entry_ctls_high);
3181                 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
3182                         *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
3183                 break;
3184         case MSR_IA32_VMX_MISC:
3185                 *pdata = vmx_control_msr(
3186                         vmx->nested.nested_vmx_misc_low,
3187                         vmx->nested.nested_vmx_misc_high);
3188                 break;
3189         case MSR_IA32_VMX_CR0_FIXED0:
3190                 *pdata = vmx->nested.nested_vmx_cr0_fixed0;
3191                 break;
3192         case MSR_IA32_VMX_CR0_FIXED1:
3193                 *pdata = vmx->nested.nested_vmx_cr0_fixed1;
3194                 break;
3195         case MSR_IA32_VMX_CR4_FIXED0:
3196                 *pdata = vmx->nested.nested_vmx_cr4_fixed0;
3197                 break;
3198         case MSR_IA32_VMX_CR4_FIXED1:
3199                 *pdata = vmx->nested.nested_vmx_cr4_fixed1;
3200                 break;
3201         case MSR_IA32_VMX_VMCS_ENUM:
3202                 *pdata = vmx->nested.nested_vmx_vmcs_enum;
3203                 break;
3204         case MSR_IA32_VMX_PROCBASED_CTLS2:
3205                 *pdata = vmx_control_msr(
3206                         vmx->nested.nested_vmx_secondary_ctls_low,
3207                         vmx->nested.nested_vmx_secondary_ctls_high);
3208                 break;
3209         case MSR_IA32_VMX_EPT_VPID_CAP:
3210                 *pdata = vmx->nested.nested_vmx_ept_caps |
3211                         ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
3212                 break;
3213         default:
3214                 return 1;
3215         }
3216
3217         return 0;
3218 }
3219
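/*
 * Returns true only if @val sets no bits outside of those currently
 * permitted in IA32_FEATURE_CONTROL for this vcpu.
 */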
3220 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
3221                                                  uint64_t val)
3222 {
3223         uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
3224
3225         return !(val & ~valid_bits);
3226 }
3227
3228 /*
3229  * Reads the MSR given by msr_info->index into msr_info->data.
3230  * Returns 0 on success, non-0 otherwise.
3231  * Assumes vcpu_load() was already called.
3232  */
3233 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3234 {
3235         struct shared_msr_entry *msr;
3236
3237         switch (msr_info->index) {
3238 #ifdef CONFIG_X86_64
3239         case MSR_FS_BASE:
3240                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
3241                 break;
3242         case MSR_GS_BASE:
3243                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
3244                 break;
3245         case MSR_KERNEL_GS_BASE:
3246                 vmx_load_host_state(to_vmx(vcpu));
3247                 msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
3248                 break;
3249 #endif
3250         case MSR_EFER:
3251                 return kvm_get_msr_common(vcpu, msr_info);
3252         case MSR_IA32_TSC:
3253                 msr_info->data = guest_read_tsc(vcpu);
3254                 break;
3255         case MSR_IA32_SYSENTER_CS:
3256                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
3257                 break;
3258         case MSR_IA32_SYSENTER_EIP:
3259                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
3260                 break;
3261         case MSR_IA32_SYSENTER_ESP:
3262                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
3263                 break;
3264         case MSR_IA32_BNDCFGS:
3265                 if (!kvm_mpx_supported() ||
3266                     (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
3267                         return 1;
3268                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
3269                 break;
3270         case MSR_IA32_MCG_EXT_CTL:
3271                 if (!msr_info->host_initiated &&
3272                     !(to_vmx(vcpu)->msr_ia32_feature_control &
3273                       FEATURE_CONTROL_LMCE))
3274                         return 1;
3275                 msr_info->data = vcpu->arch.mcg_ext_ctl;
3276                 break;
3277         case MSR_IA32_FEATURE_CONTROL:
3278                 msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
3279                 break;
3280         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3281                 if (!nested_vmx_allowed(vcpu))
3282                         return 1;
3283                 return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
3284         case MSR_IA32_XSS:
3285                 if (!vmx_xsaves_supported())
3286                         return 1;
3287                 msr_info->data = vcpu->arch.ia32_xss;
3288                 break;
3289         case MSR_TSC_AUX:
3290                 if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
3291                         return 1;
3292                 /* Otherwise falls through */
3293         default:
3294                 msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
3295                 if (msr) {
3296                         msr_info->data = msr->data;
3297                         break;
3298                 }
3299                 return kvm_get_msr_common(vcpu, msr_info);
3300         }
3301
3302         return 0;
3303 }
3304
3305 static void vmx_leave_nested(struct kvm_vcpu *vcpu);
3306
3307 /*
3308  * Writes msr value into the appropriate "register".
3309  * Returns 0 on success, non-0 otherwise.
3310  * Assumes vcpu_load() was already called.
3311  */
3312 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3313 {
3314         struct vcpu_vmx *vmx = to_vmx(vcpu);
3315         struct shared_msr_entry *msr;
3316         int ret = 0;
3317         u32 msr_index = msr_info->index;
3318         u64 data = msr_info->data;
3319
3320         switch (msr_index) {
3321         case MSR_EFER:
3322                 ret = kvm_set_msr_common(vcpu, msr_info);
3323                 break;
3324 #ifdef CONFIG_X86_64
3325         case MSR_FS_BASE:
3326                 vmx_segment_cache_clear(vmx);
3327                 vmcs_writel(GUEST_FS_BASE, data);
3328                 break;
3329         case MSR_GS_BASE:
3330                 vmx_segment_cache_clear(vmx);
3331                 vmcs_writel(GUEST_GS_BASE, data);
3332                 break;
3333         case MSR_KERNEL_GS_BASE:
3334                 vmx_load_host_state(vmx);
3335                 vmx->msr_guest_kernel_gs_base = data;
3336                 break;
3337 #endif
3338         case MSR_IA32_SYSENTER_CS:
3339                 vmcs_write32(GUEST_SYSENTER_CS, data);
3340                 break;
3341         case MSR_IA32_SYSENTER_EIP:
3342                 vmcs_writel(GUEST_SYSENTER_EIP, data);
3343                 break;
3344         case MSR_IA32_SYSENTER_ESP:
3345                 vmcs_writel(GUEST_SYSENTER_ESP, data);
3346                 break;
3347         case MSR_IA32_BNDCFGS:
3348                 if (!kvm_mpx_supported() ||
3349                     (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
3350                         return 1;
3351                 if (is_noncanonical_address(data & PAGE_MASK) ||
3352                     (data & MSR_IA32_BNDCFGS_RSVD))
3353                         return 1;
3354                 vmcs_write64(GUEST_BNDCFGS, data);
3355                 break;
3356         case MSR_IA32_TSC:
3357                 kvm_write_tsc(vcpu, msr_info);
3358                 break;
3359         case MSR_IA32_CR_PAT:
3360                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3361                         if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
3362                                 return 1;
3363                         vmcs_write64(GUEST_IA32_PAT, data);
3364                         vcpu->arch.pat = data;
3365                         break;
3366                 }
3367                 ret = kvm_set_msr_common(vcpu, msr_info);
3368                 break;
3369         case MSR_IA32_TSC_ADJUST:
3370                 ret = kvm_set_msr_common(vcpu, msr_info);
3371                 break;
3372         case MSR_IA32_MCG_EXT_CTL:
3373                 if ((!msr_info->host_initiated &&
3374                      !(to_vmx(vcpu)->msr_ia32_feature_control &
3375                        FEATURE_CONTROL_LMCE)) ||
3376                     (data & ~MCG_EXT_CTL_LMCE_EN))
3377                         return 1;
3378                 vcpu->arch.mcg_ext_ctl = data;
3379                 break;
3380         case MSR_IA32_FEATURE_CONTROL:
3381                 if (!vmx_feature_control_msr_valid(vcpu, data) ||
3382                     (to_vmx(vcpu)->msr_ia32_feature_control &
3383                      FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
3384                         return 1;
3385                 vmx->msr_ia32_feature_control = data;
3386                 if (msr_info->host_initiated && data == 0)
3387                         vmx_leave_nested(vcpu);
3388                 break;
3389         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3390                 if (!msr_info->host_initiated)
3391                         return 1; /* they are read-only */
3392                 if (!nested_vmx_allowed(vcpu))
3393                         return 1;
3394                 return vmx_set_vmx_msr(vcpu, msr_index, data);
3395         case MSR_IA32_XSS:
3396                 if (!vmx_xsaves_supported())
3397                         return 1;
3398                 /*
3399                  * As of Skylake the only bit defined in IA32_XSS is bit 8,
3400                  * which KVM does not support, so only a value of 0 is allowed.
3401                  */
3402                 if (data != 0)
3403                         return 1;
3404                 vcpu->arch.ia32_xss = data;
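                /*
                 * If the guest value differs from the host's, switch
                 * IA32_XSS via the VM-entry/VM-exit MSR load lists;
                 * otherwise drop the switch entry to avoid the overhead.
                 */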
3405                 if (vcpu->arch.ia32_xss != host_xss)
3406                         add_atomic_switch_msr(vmx, MSR_IA32_XSS,
3407                                 vcpu->arch.ia32_xss, host_xss);
3408                 else
3409                         clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
3410                 break;
3411         case MSR_TSC_AUX:
3412                 if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
3413                         return 1;
3414                 /* Check reserved bits: the upper 32 bits must be zero */
3415                 if ((data >> 32) != 0)
3416                         return 1;
3417                 /* Otherwise falls through */
3418         default:
3419                 msr = find_msr_entry(vmx, msr_index);
3420                 if (msr) {
3421                         u64 old_msr_data = msr->data;
3422                         msr->data = data;
3423                         if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
3424                                 preempt_disable();
3425                                 ret = kvm_set_shared_msr(msr->index, msr->data,
3426                                                          msr->mask);
3427                                 preempt_enable();
3428                                 if (ret)
3429                                         msr->data = old_msr_data;
3430                         }
3431                         break;
3432                 }
3433                 ret = kvm_set_msr_common(vcpu, msr_info);
3434         }
3435
3436         return ret;
3437 }
3438
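/*
 * Mark @reg as available and, for registers kept in the VMCS, refresh
 * the cached value in vcpu->arch from the current VMCS.
 */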
3439 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
3440 {
3441         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
3442         switch (reg) {
3443         case VCPU_REGS_RSP:
3444                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
3445                 break;
3446         case VCPU_REGS_RIP:
3447                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
3448                 break;
3449         case VCPU_EXREG_PDPTR:
3450                 if (enable_ept)
3451                         ept_save_pdptrs(vcpu);
3452                 break;
3453         default:
3454                 break;
3455         }
3456 }
3457
3458 static __init int cpu_has_kvm_support(void)
3459 {
3460         return cpu_has_vmx();
3461 }
3462
3463 static __init int vmx_disabled_by_bios(void)
3464 {
3465         u64 msr;
3466
3467         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
3468         if (msr & FEATURE_CONTROL_LOCKED) {
3469                 /* launched w/ TXT and VMX disabled */
3470                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3471                         && tboot_enabled())
3472                         return 1;
3473                 /* launched w/o TXT and VMX only enabled w/ TXT */
3474                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3475                         && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3476                         && !tboot_enabled()) {
3477                         printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
3478                                 "activate TXT before enabling KVM\n");
3479                         return 1;
3480                 }
3481                 /* launched w/o TXT and VMX disabled */
3482                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3483                         && !tboot_enabled())
3484                         return 1;
3485         }
3486
3487         return 0;
3488 }
3489
3490 static void kvm_cpu_vmxon(u64 addr)
3491 {
3492         cr4_set_bits(X86_CR4_VMXE);
3493         intel_pt_handle_vmx(1);
3494
3495         asm volatile (ASM_VMX_VMXON_RAX
3496                         : : "a"(&addr), "m"(addr)
3497                         : "memory", "cc");
3498 }
3499
3500 static int hardware_enable(void)
3501 {
3502         int cpu = raw_smp_processor_id();
3503         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
3504         u64 old, test_bits;
3505
3506         if (cr4_read_shadow() & X86_CR4_VMXE)
3507                 return -EBUSY;
3508
3509         INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
3510         INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
3511         spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
3512
3513         /*
3514          * Now we can enable the vmclear operation in kdump
3515          * since the loaded_vmcss_on_cpu list on this cpu
3516          * has been initialized.
3517          *
3518          * Though the cpu is not in VMX operation now, it is safe
3519          * to enable the vmclear operation here because the
3520          * loaded_vmcss_on_cpu list is still empty.
3521          */
3522         crash_enable_local_vmclear(cpu);
3523
3524         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
3525
3526         test_bits = FEATURE_CONTROL_LOCKED;
3527         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
3528         if (tboot_enabled())
3529                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
3530
3531         if ((old & test_bits) != test_bits) {
3532                 /* enable and lock */
3533                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
3534         }
3535         kvm_cpu_vmxon(phys_addr);
3536         ept_sync_global();
3537
3538         return 0;
3539 }
3540
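/*
 * VMCLEAR every VMCS still loaded on this cpu so that none remains
 * active when VMX operation is later turned off (see hardware_disable()).
 */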
3541 static void vmclear_local_loaded_vmcss(void)
3542 {
3543         int cpu = raw_smp_processor_id();
3544         struct loaded_vmcs *v, *n;
3545
3546         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
3547                                  loaded_vmcss_on_cpu_link)
3548                 __loaded_vmcs_clear(v);
3549 }
3550
3551
3552 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
3553  * tricks.
3554  */
3555 static void kvm_cpu_vmxoff(void)
3556 {
3557         asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
3558
3559         intel_pt_handle_vmx(0);
3560         cr4_clear_bits(X86_CR4_VMXE);
3561 }
3562
3563 static void hardware_disable(void)
3564 {
3565         vmclear_local_loaded_vmcss();
3566         kvm_cpu_vmxoff();
3567 }
3568
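/*
 * Compute the final setting for one VMX control field: start from the
 * required bits (ctl_min) plus the optional bits (ctl_opt), force the
 * bits the capability MSR reports as fixed-1 and clear those reported
 * as fixed-0.  Fails only if a required bit cannot be set.
 */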
3569 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
3570                                       u32 msr, u32 *result)
3571 {
3572         u32 vmx_msr_low, vmx_msr_high;
3573         u32 ctl = ctl_min | ctl_opt;
3574
3575         rdmsr(msr, vmx_msr_low, vmx_msr_high);
3576
3577         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
3578         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
3579
3580         /* Ensure the minimum (required) set of control bits is supported. */
3581         if (ctl_min & ~ctl)
3582                 return -EIO;
3583
3584         *result = ctl;
3585         return 0;
3586 }
3587
3588 static __init bool allow_1_setting(u32 msr, u32 ctl)
3589 {
3590         u32 vmx_msr_low, vmx_msr_high;
3591
3592         rdmsr(msr, vmx_msr_low, vmx_msr_high);
3593         return vmx_msr_high & ctl;
3594 }
3595
3596 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3597 {
3598         u32 vmx_msr_low, vmx_msr_high;
3599         u32 min, opt, min2, opt2;
3600         u32 _pin_based_exec_control = 0;
3601         u32 _cpu_based_exec_control = 0;
3602         u32 _cpu_based_2nd_exec_control = 0;
3603         u32 _vmexit_control = 0;
3604         u32 _vmentry_control = 0;
3605
3606         min = CPU_BASED_HLT_EXITING |
3607 #ifdef CONFIG_X86_64
3608               CPU_BASED_CR8_LOAD_EXITING |
3609               CPU_BASED_CR8_STORE_EXITING |
3610 #endif
3611               CPU_BASED_CR3_LOAD_EXITING |
3612               CPU_BASED_CR3_STORE_EXITING |
3613               CPU_BASED_USE_IO_BITMAPS |
3614               CPU_BASED_MOV_DR_EXITING |
3615               CPU_BASED_USE_TSC_OFFSETING |
3616               CPU_BASED_INVLPG_EXITING |
3617               CPU_BASED_RDPMC_EXITING;
3618
3619         if (!kvm_mwait_in_guest())
3620                 min |= CPU_BASED_MWAIT_EXITING |
3621                         CPU_BASED_MONITOR_EXITING;
3622
3623         opt = CPU_BASED_TPR_SHADOW |
3624               CPU_BASED_USE_MSR_BITMAPS |
3625               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3626         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
3627                                 &_cpu_based_exec_control) < 0)
3628                 return -EIO;
3629 #ifdef CONFIG_X86_64
3630         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3631                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
3632                                            ~CPU_BASED_CR8_STORE_EXITING;
3633 #endif
3634         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
3635                 min2 = 0;
3636                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
3637                         SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3638                         SECONDARY_EXEC_WBINVD_EXITING |
3639                         SECONDARY_EXEC_ENABLE_VPID |
3640                         SECONDARY_EXEC_ENABLE_EPT |
3641                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
3642                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
3643                         SECONDARY_EXEC_RDTSCP |
3644                         SECONDARY_EXEC_ENABLE_INVPCID |
3645                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
3646                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3647                         SECONDARY_EXEC_SHADOW_VMCS |
3648                         SECONDARY_EXEC_XSAVES |
3649                         SECONDARY_EXEC_ENABLE_PML |
3650                         SECONDARY_EXEC_TSC_SCALING;
3651                 if (adjust_vmx_controls(min2, opt2,
3652                                         MSR_IA32_VMX_PROCBASED_CTLS2,
3653                                         &_cpu_based_2nd_exec_control) < 0)
3654                         return -EIO;
3655         }
3656 #ifndef CONFIG_X86_64
3657         if (!(_cpu_based_2nd_exec_control &
3658                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
3659                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
3660 #endif
3661
3662         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3663                 _cpu_based_2nd_exec_control &= ~(
3664                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
3665                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3666                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3667
3668         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
3669                 /* CR3 accesses and invlpg don't need to cause VM exits when
3670                    EPT is enabled */
3671                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
3672                                              CPU_BASED_CR3_STORE_EXITING |
3673                                              CPU_BASED_INVLPG_EXITING);
3674                 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
3675                       vmx_capability.ept, vmx_capability.vpid);
3676         }
3677
3678         min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
3679 #ifdef CONFIG_X86_64
3680         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
3681 #endif
3682         opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
3683                 VM_EXIT_CLEAR_BNDCFGS;
3684         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
3685                                 &_vmexit_control) < 0)
3686                 return -EIO;
3687
3688         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
3689                 PIN_BASED_VIRTUAL_NMIS;
3690         opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER;
3691         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
3692                                 &_pin_based_exec_control) < 0)
3693                 return -EIO;
3694
3695         if (cpu_has_broken_vmx_preemption_timer())
3696                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
3697         if (!(_cpu_based_2nd_exec_control &
3698                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
3699                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
3700
3701         min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
3702         opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
3703         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
3704                                 &_vmentry_control) < 0)
3705                 return -EIO;
3706
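        /*
         * vmx_msr_high holds bits 63:32 of IA32_VMX_BASIC: bits 44:32 give
         * the VMCS region size, bit 48 (bit 16 of the high word) the 32-bit
         * address limitation and bits 53:50 (bits 21:18 here) the required
         * memory type for the VMCS.
         */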
3707         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
3708
3709         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
3710         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
3711                 return -EIO;
3712
3713 #ifdef CONFIG_X86_64
3714         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
3715         if (vmx_msr_high & (1u<<16))
3716                 return -EIO;
3717 #endif
3718
3719         /* Require Write-Back (WB) memory type for VMCS accesses. */
3720         if (((vmx_msr_high >> 18) & 15) != 6)
3721                 return -EIO;
3722
3723         vmcs_conf->size = vmx_msr_high & 0x1fff;
3724         vmcs_conf->order = get_order(vmcs_conf->size);
3725         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
3726         vmcs_conf->revision_id = vmx_msr_low;
3727
3728         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
3729         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
3730         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
3731         vmcs_conf->vmexit_ctrl         = _vmexit_control;
3732         vmcs_conf->vmentry_ctrl        = _vmentry_control;
3733
3734         cpu_has_load_ia32_efer =
3735                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3736                                 VM_ENTRY_LOAD_IA32_EFER)
3737                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
3738                                    VM_EXIT_LOAD_IA32_EFER);
3739
3740         cpu_has_load_perf_global_ctrl =
3741                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3742                                 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
3743                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
3744                                    VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
3745
3746         /*
3747          * but due to the errata below it can't be used. The workaround is
3748          * to use the MSR-load mechanism to switch IA32_PERF_GLOBAL_CTRL.
3749          * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
3750          *
3751          * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
3752          *
3753          * AAK155             (model 26)
3754          * AAP115             (model 30)
3755          * AAT100             (model 37)
3756          * BC86,AAY89,BD102   (model 44)
3757          * BA97               (model 46)
3758          *
3759          */
3760         if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
3761                 switch (boot_cpu_data.x86_model) {
3762                 case 26:
3763                 case 30:
3764                 case 37:
3765                 case 44:
3766                 case 46:
3767                         cpu_has_load_perf_global_ctrl = false;
3768                         printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
3769                                         "does not work properly. Using workaround\n");
3770                         break;
3771                 default:
3772                         break;
3773                 }
3774         }
3775
3776         if (boot_cpu_has(X86_FEATURE_XSAVES))
3777                 rdmsrl(MSR_IA32_XSS, host_xss);
3778
3779         return 0;
3780 }
3781
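/*
 * Allocate a zeroed VMCS region from @cpu's NUMA node and stamp it with
 * the revision id discovered in setup_vmcs_config().
 */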
3782 static struct vmcs *alloc_vmcs_cpu(int cpu)
3783 {
3784         int node = cpu_to_node(cpu);
3785         struct page *pages;
3786         struct vmcs *vmcs;
3787
3788         pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
3789         if (!pages)
3790                 return NULL;
3791         vmcs = page_address(pages);
3792         memset(vmcs, 0, vmcs_config.size);
3793         vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
3794         return vmcs;
3795 }
3796
3797 static struct vmcs *alloc_vmcs(void)
3798 {
3799         return alloc_vmcs_cpu(raw_smp_processor_id());
3800 }
3801
3802 static void free_vmcs(struct vmcs *vmcs)
3803 {
3804         free_pages((unsigned long)vmcs, vmcs_config.order);
3805 }
3806
3807 /*
3808  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3809  */
3810 static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3811 {
3812         if (!loaded_vmcs->vmcs)
3813                 return;
3814         loaded_vmcs_clear(loaded_vmcs);
3815         free_vmcs(loaded_vmcs->vmcs);
3816         loaded_vmcs->vmcs = NULL;
3817         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
3818 }
3819
3820 static void free_kvm_area(void)
3821 {
3822         int cpu;
3823
3824         for_each_possible_cpu(cpu) {
3825                 free_vmcs(per_cpu(vmxarea, cpu));
3826                 per_cpu(vmxarea, cpu) = NULL;
3827         }
3828 }
3829
3830 enum vmcs_field_type {
3831         VMCS_FIELD_TYPE_U16 = 0,
3832         VMCS_FIELD_TYPE_U64 = 1,
3833         VMCS_FIELD_TYPE_U32 = 2,
3834         VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
3835 };
3836
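/*
 * Bits 14:13 of a VMCS field encoding give the field width; an odd
 * encoding selects the high half of a 64-bit field, which is accessed
 * as a 32-bit value.
 */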
3837 static inline int vmcs_field_type(unsigned long field)
3838 {
3839         if (0x1 & field)        /* the *_HIGH fields are all 32 bit */
3840                 return VMCS_FIELD_TYPE_U32;
3841         return (field >> 13) & 0x3;
3842 }
3843
3844 static inline int vmcs_field_readonly(unsigned long field)
3845 {
3846         return (((field >> 10) & 0x3) == 1);
3847 }
3848
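/*
 * Drop shadow VMCS fields this host cannot support, then clear their
 * bits in the VMREAD/VMWRITE intercept bitmaps so that L1 accesses the
 * remaining fields through the shadow VMCS without a VM exit.
 */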
3849 static void init_vmcs_shadow_fields(void)
3850 {
3851         int i, j;
3852
3853         /* No checks for read-only fields yet */
3854
3855         for (i = j = 0; i < max_shadow_read_write_fields; i++) {
3856                 switch (shadow_read_write_fields[i]) {
3857                 case GUEST_BNDCFGS:
3858                         if (!kvm_mpx_supported())
3859                                 continue;
3860                         break;
3861                 default:
3862                         break;
3863                 }
3864
3865                 if (j < i)
3866                         shadow_read_write_fields[j] =
3867                                 shadow_read_write_fields[i];
3868                 j++;
3869         }
3870         max_shadow_read_write_fields = j;
3871
3872         /* Shadowed fields that the guest can access without a vmexit */
3873         for (i = 0; i < max_shadow_read_write_fields; i++) {
3874                 unsigned long field = shadow_read_write_fields[i];
3875
3876                 clear_bit(field, vmx_vmwrite_bitmap);
3877                 clear_bit(field, vmx_vmread_bitmap);
3878                 if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) {
3879                         clear_bit(field + 1, vmx_vmwrite_bitmap);
3880                         clear_bit(field + 1, vmx_vmread_bitmap);
3881                 }
3882         }
3883         for (i = 0; i < max_shadow_read_only_fields; i++) {
3884                 unsigned long field = shadow_read_only_fields[i];
3885
3886                 clear_bit(field, vmx_vmread_bitmap);
3887                 if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64)
3888                         clear_bit(field + 1, vmx_vmread_bitmap);
3889         }
3890 }
3891
3892 static __init int alloc_kvm_area(void)
3893 {
3894         int cpu;
3895
3896         for_each_possible_cpu(cpu) {
3897                 struct vmcs *vmcs;
3898
3899                 vmcs = alloc_vmcs_cpu(cpu);
3900                 if (!vmcs) {
3901                         free_kvm_area();
3902                         return -ENOMEM;
3903                 }
3904
3905                 per_cpu(vmxarea, cpu) = vmcs;
3906         }
3907         return 0;
3908 }
3909
3910 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3911                 struct kvm_segment *save)
3912 {
3913         if (!emulate_invalid_guest_state) {
3914                 /*
3915                  * CS and SS RPL should be equal during guest entry according
3916                  * to the VMX spec, but in reality it is not always so. Since
3917                  * the vcpu is in the middle of the transition from real mode to
3918                  * protected mode, it is safe to assume that RPL 0 is a good
3919                  * default value.
3920                  */
3921                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3922                         save->selector &= ~SEGMENT_RPL_MASK;
3923                 save->dpl = save->selector & SEGMENT_RPL_MASK;
3924                 save->s = 1;
3925         }
3926         vmx_set_segment(vcpu, save, seg);
3927 }
3928
3929 static void enter_pmode(struct kvm_vcpu *vcpu)