KVM: x86: drop fpu_activate hook
[sfrench/cifs-2.6.git] / arch / x86 / kvm / vmx.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *
14  * This work is licensed under the terms of the GNU GPL, version 2.  See
15  * the COPYING file in the top-level directory.
16  *
17  */
18
19 #include "irq.h"
20 #include "mmu.h"
21 #include "cpuid.h"
22
23 #include <linux/kvm_host.h>
24 #include <linux/module.h>
25 #include <linux/kernel.h>
26 #include <linux/mm.h>
27 #include <linux/highmem.h>
28 #include <linux/sched.h>
29 #include <linux/moduleparam.h>
30 #include <linux/mod_devicetable.h>
31 #include <linux/ftrace_event.h>
32 #include <linux/slab.h>
33 #include <linux/tboot.h>
34 #include <linux/hrtimer.h>
35 #include "kvm_cache_regs.h"
36 #include "x86.h"
37
38 #include <asm/io.h>
39 #include <asm/desc.h>
40 #include <asm/vmx.h>
41 #include <asm/virtext.h>
42 #include <asm/mce.h>
43 #include <asm/i387.h>
44 #include <asm/xcr.h>
45 #include <asm/perf_event.h>
46 #include <asm/debugreg.h>
47 #include <asm/kexec.h>
48
49 #include "trace.h"
50
/*
 * Thin aliases for the fault-on-reboot helpers. __ex_clear() additionally
 * hands the helper an "xor reg, reg" string built from the given register
 * name.
 */
#define __ex_clear(x, reg)                                              \
        ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
#define __ex(x)         __kvm_handle_fault_on_reboot(x)
54
55 MODULE_AUTHOR("Qumranet");
56 MODULE_LICENSE("GPL");
57
58 static const struct x86_cpu_id vmx_cpu_id[] = {
59         X86_FEATURE_MATCH(X86_FEATURE_VMX),
60         {}
61 };
62 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
63
64 static bool __read_mostly enable_vpid = 1;
65 module_param_named(vpid, enable_vpid, bool, 0444);
66
67 static bool __read_mostly flexpriority_enabled = 1;
68 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
69
70 static bool __read_mostly enable_ept = 1;
71 module_param_named(ept, enable_ept, bool, S_IRUGO);
72
73 static bool __read_mostly enable_unrestricted_guest = 1;
74 module_param_named(unrestricted_guest,
75                         enable_unrestricted_guest, bool, S_IRUGO);
76
77 static bool __read_mostly enable_ept_ad_bits = 1;
78 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
79
80 static bool __read_mostly emulate_invalid_guest_state = true;
81 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
82
83 static bool __read_mostly vmm_exclusive = 1;
84 module_param(vmm_exclusive, bool, S_IRUGO);
85
86 static bool __read_mostly fasteoi = 1;
87 module_param(fasteoi, bool, S_IRUGO);
88
89 static bool __read_mostly enable_apicv = 1;
90 module_param(enable_apicv, bool, S_IRUGO);
91
92 static bool __read_mostly enable_shadow_vmcs = 1;
93 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
94 /*
95  * If nested=1, nested virtualization is supported, i.e., guests may use
96  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
97  * use VMX instructions.
98  */
99 static bool __read_mostly nested = 0;
100 module_param(nested, bool, S_IRUGO);
101
/* CR0 bit sets used by this file. */
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
        (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

/* CR4 bit sets used by this file. */
#define KVM_CR4_GUEST_OWNED_BITS \
        (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
         | X86_CR4_OSXMMEXCPT)

#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

/* Every EFLAGS bit except IOPL and VM. */
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
116
117 /*
118  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
119  * ple_gap:    upper bound on the amount of time between two successive
120  *             executions of PAUSE in a loop. Also indicate if ple enabled.
121  *             According to test, this time is usually smaller than 128 cycles.
122  * ple_window: upper bound on the amount of time a guest is allowed to execute
123  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
124  *             less than 2^12 cycles
125  * Time is measured based on a counter that runs at the same rate as the TSC,
126  * refer SDM volume 3b section 21.6.13 & 22.1.3.
127  */
128 #define KVM_VMX_DEFAULT_PLE_GAP    128
129 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096
130 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
131 module_param(ple_gap, int, S_IRUGO);
132
133 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
134 module_param(ple_window, int, S_IRUGO);
135
136 extern const ulong vmx_return;
137
138 #define NR_AUTOLOAD_MSRS 8
139 #define VMCS02_POOL_SIZE 1
140
141 struct vmcs {
142         u32 revision_id;
143         u32 abort;
144         char data[0];
145 };
146
147 /*
148  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
149  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
150  * loaded on this CPU (so we can clear them if the CPU goes down).
151  */
152 struct loaded_vmcs {
153         struct vmcs *vmcs;
154         int cpu;
155         int launched;
156         struct list_head loaded_vmcss_on_cpu_link;
157 };
158
159 struct shared_msr_entry {
160         unsigned index;
161         u64 data;
162         u64 mask;
163 };
164
165 /*
166  * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
167  * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
168  * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
169  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
170  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
171  * More than one of these structures may exist, if L1 runs multiple L2 guests.
172  * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
173  * underlying hardware which will be used to run L2.
174  * This structure is packed to ensure that its layout is identical across
175  * machines (necessary for live migration).
176  * If there are changes in this struct, VMCS12_REVISION must be changed.
177  */
178 typedef u64 natural_width;
179 struct __packed vmcs12 {
180         /* According to the Intel spec, a VMCS region must start with the
181          * following two fields. Then follow implementation-specific data.
182          */
183         u32 revision_id;
184         u32 abort;
185
186         u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
187         u32 padding[7]; /* room for future expansion */
188
189         u64 io_bitmap_a;
190         u64 io_bitmap_b;
191         u64 msr_bitmap;
192         u64 vm_exit_msr_store_addr;
193         u64 vm_exit_msr_load_addr;
194         u64 vm_entry_msr_load_addr;
195         u64 tsc_offset;
196         u64 virtual_apic_page_addr;
197         u64 apic_access_addr;
198         u64 ept_pointer;
199         u64 guest_physical_address;
200         u64 vmcs_link_pointer;
201         u64 guest_ia32_debugctl;
202         u64 guest_ia32_pat;
203         u64 guest_ia32_efer;
204         u64 guest_ia32_perf_global_ctrl;
205         u64 guest_pdptr0;
206         u64 guest_pdptr1;
207         u64 guest_pdptr2;
208         u64 guest_pdptr3;
209         u64 guest_bndcfgs;
210         u64 host_ia32_pat;
211         u64 host_ia32_efer;
212         u64 host_ia32_perf_global_ctrl;
213         u64 padding64[8]; /* room for future expansion */
214         /*
215          * To allow migration of L1 (complete with its L2 guests) between
216          * machines of different natural widths (32 or 64 bit), we cannot have
217          * unsigned long fields with no explict size. We use u64 (aliased
218          * natural_width) instead. Luckily, x86 is little-endian.
219          */
220         natural_width cr0_guest_host_mask;
221         natural_width cr4_guest_host_mask;
222         natural_width cr0_read_shadow;
223         natural_width cr4_read_shadow;
224         natural_width cr3_target_value0;
225         natural_width cr3_target_value1;
226         natural_width cr3_target_value2;
227         natural_width cr3_target_value3;
228         natural_width exit_qualification;
229         natural_width guest_linear_address;
230         natural_width guest_cr0;
231         natural_width guest_cr3;
232         natural_width guest_cr4;
233         natural_width guest_es_base;
234         natural_width guest_cs_base;
235         natural_width guest_ss_base;
236         natural_width guest_ds_base;
237         natural_width guest_fs_base;
238         natural_width guest_gs_base;
239         natural_width guest_ldtr_base;
240         natural_width guest_tr_base;
241         natural_width guest_gdtr_base;
242         natural_width guest_idtr_base;
243         natural_width guest_dr7;
244         natural_width guest_rsp;
245         natural_width guest_rip;
246         natural_width guest_rflags;
247         natural_width guest_pending_dbg_exceptions;
248         natural_width guest_sysenter_esp;
249         natural_width guest_sysenter_eip;
250         natural_width host_cr0;
251         natural_width host_cr3;
252         natural_width host_cr4;
253         natural_width host_fs_base;
254         natural_width host_gs_base;
255         natural_width host_tr_base;
256         natural_width host_gdtr_base;
257         natural_width host_idtr_base;
258         natural_width host_ia32_sysenter_esp;
259         natural_width host_ia32_sysenter_eip;
260         natural_width host_rsp;
261         natural_width host_rip;
262         natural_width paddingl[8]; /* room for future expansion */
263         u32 pin_based_vm_exec_control;
264         u32 cpu_based_vm_exec_control;
265         u32 exception_bitmap;
266         u32 page_fault_error_code_mask;
267         u32 page_fault_error_code_match;
268         u32 cr3_target_count;
269         u32 vm_exit_controls;
270         u32 vm_exit_msr_store_count;
271         u32 vm_exit_msr_load_count;
272         u32 vm_entry_controls;
273         u32 vm_entry_msr_load_count;
274         u32 vm_entry_intr_info_field;
275         u32 vm_entry_exception_error_code;
276         u32 vm_entry_instruction_len;
277         u32 tpr_threshold;
278         u32 secondary_vm_exec_control;
279         u32 vm_instruction_error;
280         u32 vm_exit_reason;
281         u32 vm_exit_intr_info;
282         u32 vm_exit_intr_error_code;
283         u32 idt_vectoring_info_field;
284         u32 idt_vectoring_error_code;
285         u32 vm_exit_instruction_len;
286         u32 vmx_instruction_info;
287         u32 guest_es_limit;
288         u32 guest_cs_limit;
289         u32 guest_ss_limit;
290         u32 guest_ds_limit;
291         u32 guest_fs_limit;
292         u32 guest_gs_limit;
293         u32 guest_ldtr_limit;
294         u32 guest_tr_limit;
295         u32 guest_gdtr_limit;
296         u32 guest_idtr_limit;
297         u32 guest_es_ar_bytes;
298         u32 guest_cs_ar_bytes;
299         u32 guest_ss_ar_bytes;
300         u32 guest_ds_ar_bytes;
301         u32 guest_fs_ar_bytes;
302         u32 guest_gs_ar_bytes;
303         u32 guest_ldtr_ar_bytes;
304         u32 guest_tr_ar_bytes;
305         u32 guest_interruptibility_info;
306         u32 guest_activity_state;
307         u32 guest_sysenter_cs;
308         u32 host_ia32_sysenter_cs;
309         u32 vmx_preemption_timer_value;
310         u32 padding32[7]; /* room for future expansion */
311         u16 virtual_processor_id;
312         u16 guest_es_selector;
313         u16 guest_cs_selector;
314         u16 guest_ss_selector;
315         u16 guest_ds_selector;
316         u16 guest_fs_selector;
317         u16 guest_gs_selector;
318         u16 guest_ldtr_selector;
319         u16 guest_tr_selector;
320         u16 host_es_selector;
321         u16 host_cs_selector;
322         u16 host_ss_selector;
323         u16 host_ds_selector;
324         u16 host_fs_selector;
325         u16 host_gs_selector;
326         u16 host_tr_selector;
327 };
328
/*
 * VMCS12_REVISION is an arbitrary id that must be changed whenever the
 * content or layout of struct vmcs12 changes. MSR_IA32_VMX_BASIC returns
 * this id, and VMPTRLD verifies that the VMCS region L1 is loading
 * contains it.
 */
#define VMCS12_REVISION 0x11e57ed0

/*
 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON
 * region and for any VMCS region. Only sizeof(struct vmcs12) bytes are used
 * by the current implementation, but 4K are reserved to avoid future
 * complications.
 */
#define VMCS12_SIZE 0x1000
342
343 /* Used to remember the last vmcs02 used for some recently used vmcs12s */
344 struct vmcs02_list {
345         struct list_head list;
346         gpa_t vmptr;
347         struct loaded_vmcs vmcs02;
348 };
349
350 /*
351  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
352  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
353  */
354 struct nested_vmx {
355         /* Has the level1 guest done vmxon? */
356         bool vmxon;
357         gpa_t vmxon_ptr;
358
359         /* The guest-physical address of the current VMCS L1 keeps for L2 */
360         gpa_t current_vmptr;
361         /* The host-usable pointer to the above */
362         struct page *current_vmcs12_page;
363         struct vmcs12 *current_vmcs12;
364         struct vmcs *current_shadow_vmcs;
365         /*
366          * Indicates if the shadow vmcs must be updated with the
367          * data hold by vmcs12
368          */
369         bool sync_shadow_vmcs;
370
371         /* vmcs02_list cache of VMCSs recently used to run L2 guests */
372         struct list_head vmcs02_pool;
373         int vmcs02_num;
374         u64 vmcs01_tsc_offset;
375         /* L2 must run next, and mustn't decide to exit to L1. */
376         bool nested_run_pending;
377         /*
378          * Guest pages referred to in vmcs02 with host-physical pointers, so
379          * we must keep them pinned while L2 runs.
380          */
381         struct page *apic_access_page;
382         u64 msr_ia32_feature_control;
383
384         struct hrtimer preemption_timer;
385         bool preemption_timer_expired;
386
387         /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
388         u64 vmcs01_debugctl;
389 };
390
391 #define POSTED_INTR_ON  0
392 /* Posted-Interrupt Descriptor */
393 struct pi_desc {
394         u32 pir[8];     /* Posted interrupt requested */
395         u32 control;    /* bit 0 of control is outstanding notification bit */
396         u32 rsvd[7];
397 } __aligned(64);
398
399 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
400 {
401         return test_and_set_bit(POSTED_INTR_ON,
402                         (unsigned long *)&pi_desc->control);
403 }
404
405 static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
406 {
407         return test_and_clear_bit(POSTED_INTR_ON,
408                         (unsigned long *)&pi_desc->control);
409 }
410
411 static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
412 {
413         return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
414 }
415
416 struct vcpu_vmx {
417         struct kvm_vcpu       vcpu;
418         unsigned long         host_rsp;
419         u8                    fail;
420         bool                  nmi_known_unmasked;
421         u32                   exit_intr_info;
422         u32                   idt_vectoring_info;
423         ulong                 rflags;
424         struct shared_msr_entry *guest_msrs;
425         int                   nmsrs;
426         int                   save_nmsrs;
427         unsigned long         host_idt_base;
428 #ifdef CONFIG_X86_64
429         u64                   msr_host_kernel_gs_base;
430         u64                   msr_guest_kernel_gs_base;
431 #endif
432         u32 vm_entry_controls_shadow;
433         u32 vm_exit_controls_shadow;
434         /*
435          * loaded_vmcs points to the VMCS currently used in this vcpu. For a
436          * non-nested (L1) guest, it always points to vmcs01. For a nested
437          * guest (L2), it points to a different VMCS.
438          */
439         struct loaded_vmcs    vmcs01;
440         struct loaded_vmcs   *loaded_vmcs;
441         bool                  __launched; /* temporary, used in vmx_vcpu_run */
442         struct msr_autoload {
443                 unsigned nr;
444                 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
445                 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
446         } msr_autoload;
447         struct {
448                 int           loaded;
449                 u16           fs_sel, gs_sel, ldt_sel;
450 #ifdef CONFIG_X86_64
451                 u16           ds_sel, es_sel;
452 #endif
453                 int           gs_ldt_reload_needed;
454                 int           fs_reload_needed;
455                 u64           msr_host_bndcfgs;
456         } host_state;
457         struct {
458                 int vm86_active;
459                 ulong save_rflags;
460                 struct kvm_segment segs[8];
461         } rmode;
462         struct {
463                 u32 bitmask; /* 4 bits per segment (1 bit per field) */
464                 struct kvm_save_segment {
465                         u16 selector;
466                         unsigned long base;
467                         u32 limit;
468                         u32 ar;
469                 } seg[8];
470         } segment_cache;
471         int vpid;
472         bool emulation_required;
473
474         /* Support for vnmi-less CPUs */
475         int soft_vnmi_blocked;
476         ktime_t entry_time;
477         s64 vnmi_blocked_time;
478         u32 exit_reason;
479
480         bool rdtscp_enabled;
481
482         /* Posted interrupt descriptor */
483         struct pi_desc pi_desc;
484
485         /* Support for a guest hypervisor (nested VMX) */
486         struct nested_vmx nested;
487 };
488
/*
 * Per-segment fields tracked by vcpu_vmx.segment_cache. SEG_FIELD_NR is the
 * number of fields, matching the "4 bits per segment" of the cache bitmask.
 */
enum segment_cache_field {
        SEG_FIELD_SEL,          /* 0 */
        SEG_FIELD_BASE,         /* 1 */
        SEG_FIELD_LIMIT,        /* 2 */
        SEG_FIELD_AR,           /* 3 */

        SEG_FIELD_NR            /* 4: count of the fields above */
};
497
498 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
499 {
500         return container_of(vcpu, struct vcpu_vmx, vcpu);
501 }
502
/*
 * Helpers for building a field-number -> vmcs12-offset table with designated
 * initializers. FIELD64() emits two entries: the field itself and its _HIGH
 * companion, which maps to the upper 4 bytes of the same member.
 */
#define VMCS12_OFFSET(x)        offsetof(struct vmcs12, x)
#define FIELD(number, name)     [number] = VMCS12_OFFSET(name)
#define FIELD64(number, name)                                   \
        [number] = VMCS12_OFFSET(name),                         \
        [number##_HIGH] = (VMCS12_OFFSET(name) + 4)
507
508
509 static unsigned long shadow_read_only_fields[] = {
510         /*
511          * We do NOT shadow fields that are modified when L0
512          * traps and emulates any vmx instruction (e.g. VMPTRLD,
513          * VMXON...) executed by L1.
514          * For example, VM_INSTRUCTION_ERROR is read
515          * by L1 if a vmx instruction fails (part of the error path).
516          * Note the code assumes this logic. If for some reason
517          * we start shadowing these fields then we need to
518          * force a shadow sync when L0 emulates vmx instructions
519          * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
520          * by nested_vmx_failValid)
521          */
522         VM_EXIT_REASON,
523         VM_EXIT_INTR_INFO,
524         VM_EXIT_INSTRUCTION_LEN,
525         IDT_VECTORING_INFO_FIELD,
526         IDT_VECTORING_ERROR_CODE,
527         VM_EXIT_INTR_ERROR_CODE,
528         EXIT_QUALIFICATION,
529         GUEST_LINEAR_ADDRESS,
530         GUEST_PHYSICAL_ADDRESS
531 };
532 static int max_shadow_read_only_fields =
533         ARRAY_SIZE(shadow_read_only_fields);
534
535 static unsigned long shadow_read_write_fields[] = {
536         GUEST_RIP,
537         GUEST_RSP,
538         GUEST_CR0,
539         GUEST_CR3,
540         GUEST_CR4,
541         GUEST_INTERRUPTIBILITY_INFO,
542         GUEST_RFLAGS,
543         GUEST_CS_SELECTOR,
544         GUEST_CS_AR_BYTES,
545         GUEST_CS_LIMIT,
546         GUEST_CS_BASE,
547         GUEST_ES_BASE,
548         GUEST_BNDCFGS,
549         CR0_GUEST_HOST_MASK,
550         CR0_READ_SHADOW,
551         CR4_READ_SHADOW,
552         TSC_OFFSET,
553         EXCEPTION_BITMAP,
554         CPU_BASED_VM_EXEC_CONTROL,
555         VM_ENTRY_EXCEPTION_ERROR_CODE,
556         VM_ENTRY_INTR_INFO_FIELD,
557         VM_ENTRY_INSTRUCTION_LEN,
558         VM_ENTRY_EXCEPTION_ERROR_CODE,
559         HOST_FS_BASE,
560         HOST_GS_BASE,
561         HOST_FS_SELECTOR,
562         HOST_GS_SELECTOR
563 };
564 static int max_shadow_read_write_fields =
565         ARRAY_SIZE(shadow_read_write_fields);
566
567 static const unsigned short vmcs_field_to_offset_table[] = {
568         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
569         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
570         FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
571         FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
572         FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
573         FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
574         FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
575         FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
576         FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
577         FIELD(HOST_ES_SELECTOR, host_es_selector),
578         FIELD(HOST_CS_SELECTOR, host_cs_selector),
579         FIELD(HOST_SS_SELECTOR, host_ss_selector),
580         FIELD(HOST_DS_SELECTOR, host_ds_selector),
581         FIELD(HOST_FS_SELECTOR, host_fs_selector),
582         FIELD(HOST_GS_SELECTOR, host_gs_selector),
583         FIELD(HOST_TR_SELECTOR, host_tr_selector),
584         FIELD64(IO_BITMAP_A, io_bitmap_a),
585         FIELD64(IO_BITMAP_B, io_bitmap_b),
586         FIELD64(MSR_BITMAP, msr_bitmap),
587         FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
588         FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
589         FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
590         FIELD64(TSC_OFFSET, tsc_offset),
591         FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
592         FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
593         FIELD64(EPT_POINTER, ept_pointer),
594         FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
595         FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
596         FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
597         FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
598         FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
599         FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
600         FIELD64(GUEST_PDPTR0, guest_pdptr0),
601         FIELD64(GUEST_PDPTR1, guest_pdptr1),
602         FIELD64(GUEST_PDPTR2, guest_pdptr2),
603         FIELD64(GUEST_PDPTR3, guest_pdptr3),
604         FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
605         FIELD64(HOST_IA32_PAT, host_ia32_pat),
606         FIELD64(HOST_IA32_EFER, host_ia32_efer),
607         FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
608         FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
609         FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
610         FIELD(EXCEPTION_BITMAP, exception_bitmap),
611         FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
612         FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
613         FIELD(CR3_TARGET_COUNT, cr3_target_count),
614         FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
615         FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
616         FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
617         FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
618         FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
619         FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
620         FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
621         FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
622         FIELD(TPR_THRESHOLD, tpr_threshold),
623         FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
624         FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
625         FIELD(VM_EXIT_REASON, vm_exit_reason),
626         FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
627         FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
628         FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
629         FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
630         FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
631         FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
632         FIELD(GUEST_ES_LIMIT, guest_es_limit),
633         FIELD(GUEST_CS_LIMIT, guest_cs_limit),
634         FIELD(GUEST_SS_LIMIT, guest_ss_limit),
635         FIELD(GUEST_DS_LIMIT, guest_ds_limit),
636         FIELD(GUEST_FS_LIMIT, guest_fs_limit),
637         FIELD(GUEST_GS_LIMIT, guest_gs_limit),
638         FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
639         FIELD(GUEST_TR_LIMIT, guest_tr_limit),
640         FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
641         FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
642         FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
643         FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
644         FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
645         FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
646         FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
647         FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
648         FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
649         FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
650         FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
651         FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
652         FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
653         FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
654         FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
655         FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
656         FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
657         FIELD(CR0_READ_SHADOW, cr0_read_shadow),
658         FIELD(CR4_READ_SHADOW, cr4_read_shadow),
659         FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
660         FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
661         FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
662         FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
663         FIELD(EXIT_QUALIFICATION, exit_qualification),
664         FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
665         FIELD(GUEST_CR0, guest_cr0),
666         FIELD(GUEST_CR3, guest_cr3),
667         FIELD(GUEST_CR4, guest_cr4),
668         FIELD(GUEST_ES_BASE, guest_es_base),
669         FIELD(GUEST_CS_BASE, guest_cs_base),
670         FIELD(GUEST_SS_BASE, guest_ss_base),
671         FIELD(GUEST_DS_BASE, guest_ds_base),
672         FIELD(GUEST_FS_BASE, guest_fs_base),
673         FIELD(GUEST_GS_BASE, guest_gs_base),
674         FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
675         FIELD(GUEST_TR_BASE, guest_tr_base),
676         FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
677         FIELD(GUEST_IDTR_BASE, guest_idtr_base),
678         FIELD(GUEST_DR7, guest_dr7),
679         FIELD(GUEST_RSP, guest_rsp),
680         FIELD(GUEST_RIP, guest_rip),
681         FIELD(GUEST_RFLAGS, guest_rflags),
682         FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
683         FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
684         FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
685         FIELD(HOST_CR0, host_cr0),
686         FIELD(HOST_CR3, host_cr3),
687         FIELD(HOST_CR4, host_cr4),
688         FIELD(HOST_FS_BASE, host_fs_base),
689         FIELD(HOST_GS_BASE, host_gs_base),
690         FIELD(HOST_TR_BASE, host_tr_base),
691         FIELD(HOST_GDTR_BASE, host_gdtr_base),
692         FIELD(HOST_IDTR_BASE, host_idtr_base),
693         FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
694         FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
695         FIELD(HOST_RSP, host_rsp),
696         FIELD(HOST_RIP, host_rip),
697 };
698 static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
699
700 static inline short vmcs_field_to_offset(unsigned long field)
701 {
702         if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
703                 return -1;
704         return vmcs_field_to_offset_table[field];
705 }
706
707 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
708 {
709         return to_vmx(vcpu)->nested.current_vmcs12;
710 }
711
712 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
713 {
714         struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
715         if (is_error_page(page))
716                 return NULL;
717
718         return page;
719 }
720
/* Release @page through kvm_release_page_dirty(). */
static void nested_release_page(struct page *page)
{
        kvm_release_page_dirty(page);
}
725
/* Release @page through kvm_release_page_clean(). */
static void nested_release_page_clean(struct page *page)
{
        kvm_release_page_clean(page);
}
730
731 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
732 static u64 construct_eptp(unsigned long root_hpa);
733 static void kvm_cpu_vmxon(u64 addr);
734 static void kvm_cpu_vmxoff(void);
735 static bool vmx_mpx_supported(void);
736 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
737 static void vmx_set_segment(struct kvm_vcpu *vcpu,
738                             struct kvm_segment *var, int seg);
739 static void vmx_get_segment(struct kvm_vcpu *vcpu,
740                             struct kvm_segment *var, int seg);
741 static bool guest_state_valid(struct kvm_vcpu *vcpu);
742 static u32 vmx_segment_access_rights(struct kvm_segment *var);
743 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
744 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
745 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
746
747 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
748 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
749 /*
750  * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
751  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
752  */
753 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
754 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
755
756 static unsigned long *vmx_io_bitmap_a;
757 static unsigned long *vmx_io_bitmap_b;
758 static unsigned long *vmx_msr_bitmap_legacy;
759 static unsigned long *vmx_msr_bitmap_longmode;
760 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
761 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
762 static unsigned long *vmx_vmread_bitmap;
763 static unsigned long *vmx_vmwrite_bitmap;
764
765 static bool cpu_has_load_ia32_efer;
766 static bool cpu_has_load_perf_global_ctrl;
767
768 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
769 static DEFINE_SPINLOCK(vmx_vpid_lock);
770
771 static struct vmcs_config {
772         int size;
773         int order;
774         u32 revision_id;
775         u32 pin_based_exec_ctrl;
776         u32 cpu_based_exec_ctrl;
777         u32 cpu_based_2nd_exec_ctrl;
778         u32 vmexit_ctrl;
779         u32 vmentry_ctrl;
780 } vmcs_config;
781
782 static struct vmx_capability {
783         u32 ept;
784         u32 vpid;
785 } vmx_capability;
786
/*
 * Expand to a designated initializer, indexed by VCPU_SREG_<seg>, that names
 * the four GUEST_<seg>_* VMCS fields of one segment register.
 */
#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base     = GUEST_##seg##_BASE,			\
		.limit    = GUEST_##seg##_LIMIT,		\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}
794
795 static const struct kvm_vmx_segment_field {
796         unsigned selector;
797         unsigned base;
798         unsigned limit;
799         unsigned ar_bytes;
800 } kvm_vmx_segment_fields[] = {
801         VMX_SEGMENT_FIELD(CS),
802         VMX_SEGMENT_FIELD(DS),
803         VMX_SEGMENT_FIELD(ES),
804         VMX_SEGMENT_FIELD(FS),
805         VMX_SEGMENT_FIELD(GS),
806         VMX_SEGMENT_FIELD(SS),
807         VMX_SEGMENT_FIELD(TR),
808         VMX_SEGMENT_FIELD(LDTR),
809 };
810
811 static u64 host_efer;
812
813 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
814
815 /*
816  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
817  * away by decrementing the array size.
818  */
819 static const u32 vmx_msr_index[] = {
820 #ifdef CONFIG_X86_64
821         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
822 #endif
823         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
824 };
825
826 static inline bool is_page_fault(u32 intr_info)
827 {
828         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
829                              INTR_INFO_VALID_MASK)) ==
830                 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
831 }
832
833 static inline bool is_no_device(u32 intr_info)
834 {
835         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
836                              INTR_INFO_VALID_MASK)) ==
837                 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
838 }
839
840 static inline bool is_invalid_opcode(u32 intr_info)
841 {
842         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
843                              INTR_INFO_VALID_MASK)) ==
844                 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
845 }
846
847 static inline bool is_external_interrupt(u32 intr_info)
848 {
849         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
850                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
851 }
852
853 static inline bool is_machine_check(u32 intr_info)
854 {
855         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
856                              INTR_INFO_VALID_MASK)) ==
857                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
858 }
859
860 static inline bool cpu_has_vmx_msr_bitmap(void)
861 {
862         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
863 }
864
865 static inline bool cpu_has_vmx_tpr_shadow(void)
866 {
867         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
868 }
869
870 static inline bool vm_need_tpr_shadow(struct kvm *kvm)
871 {
872         return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
873 }
874
875 static inline bool cpu_has_secondary_exec_ctrls(void)
876 {
877         return vmcs_config.cpu_based_exec_ctrl &
878                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
879 }
880
881 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
882 {
883         return vmcs_config.cpu_based_2nd_exec_ctrl &
884                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
885 }
886
887 static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
888 {
889         return vmcs_config.cpu_based_2nd_exec_ctrl &
890                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
891 }
892
893 static inline bool cpu_has_vmx_apic_register_virt(void)
894 {
895         return vmcs_config.cpu_based_2nd_exec_ctrl &
896                 SECONDARY_EXEC_APIC_REGISTER_VIRT;
897 }
898
899 static inline bool cpu_has_vmx_virtual_intr_delivery(void)
900 {
901         return vmcs_config.cpu_based_2nd_exec_ctrl &
902                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
903 }
904
905 static inline bool cpu_has_vmx_posted_intr(void)
906 {
907         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
908 }
909
910 static inline bool cpu_has_vmx_apicv(void)
911 {
912         return cpu_has_vmx_apic_register_virt() &&
913                 cpu_has_vmx_virtual_intr_delivery() &&
914                 cpu_has_vmx_posted_intr();
915 }
916
917 static inline bool cpu_has_vmx_flexpriority(void)
918 {
919         return cpu_has_vmx_tpr_shadow() &&
920                 cpu_has_vmx_virtualize_apic_accesses();
921 }
922
923 static inline bool cpu_has_vmx_ept_execute_only(void)
924 {
925         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
926 }
927
928 static inline bool cpu_has_vmx_eptp_uncacheable(void)
929 {
930         return vmx_capability.ept & VMX_EPTP_UC_BIT;
931 }
932
933 static inline bool cpu_has_vmx_eptp_writeback(void)
934 {
935         return vmx_capability.ept & VMX_EPTP_WB_BIT;
936 }
937
938 static inline bool cpu_has_vmx_ept_2m_page(void)
939 {
940         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
941 }
942
943 static inline bool cpu_has_vmx_ept_1g_page(void)
944 {
945         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
946 }
947
948 static inline bool cpu_has_vmx_ept_4levels(void)
949 {
950         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
951 }
952
953 static inline bool cpu_has_vmx_ept_ad_bits(void)
954 {
955         return vmx_capability.ept & VMX_EPT_AD_BIT;
956 }
957
958 static inline bool cpu_has_vmx_invept_context(void)
959 {
960         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
961 }
962
963 static inline bool cpu_has_vmx_invept_global(void)
964 {
965         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
966 }
967
968 static inline bool cpu_has_vmx_invvpid_single(void)
969 {
970         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
971 }
972
973 static inline bool cpu_has_vmx_invvpid_global(void)
974 {
975         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
976 }
977
978 static inline bool cpu_has_vmx_ept(void)
979 {
980         return vmcs_config.cpu_based_2nd_exec_ctrl &
981                 SECONDARY_EXEC_ENABLE_EPT;
982 }
983
984 static inline bool cpu_has_vmx_unrestricted_guest(void)
985 {
986         return vmcs_config.cpu_based_2nd_exec_ctrl &
987                 SECONDARY_EXEC_UNRESTRICTED_GUEST;
988 }
989
990 static inline bool cpu_has_vmx_ple(void)
991 {
992         return vmcs_config.cpu_based_2nd_exec_ctrl &
993                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
994 }
995
996 static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
997 {
998         return flexpriority_enabled && irqchip_in_kernel(kvm);
999 }
1000
1001 static inline bool cpu_has_vmx_vpid(void)
1002 {
1003         return vmcs_config.cpu_based_2nd_exec_ctrl &
1004                 SECONDARY_EXEC_ENABLE_VPID;
1005 }
1006
1007 static inline bool cpu_has_vmx_rdtscp(void)
1008 {
1009         return vmcs_config.cpu_based_2nd_exec_ctrl &
1010                 SECONDARY_EXEC_RDTSCP;
1011 }
1012
1013 static inline bool cpu_has_vmx_invpcid(void)
1014 {
1015         return vmcs_config.cpu_based_2nd_exec_ctrl &
1016                 SECONDARY_EXEC_ENABLE_INVPCID;
1017 }
1018
1019 static inline bool cpu_has_virtual_nmis(void)
1020 {
1021         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1022 }
1023
1024 static inline bool cpu_has_vmx_wbinvd_exit(void)
1025 {
1026         return vmcs_config.cpu_based_2nd_exec_ctrl &
1027                 SECONDARY_EXEC_WBINVD_EXITING;
1028 }
1029
1030 static inline bool cpu_has_vmx_shadow_vmcs(void)
1031 {
1032         u64 vmx_msr;
1033         rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1034         /* check if the cpu supports writing r/o exit information fields */
1035         if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1036                 return false;
1037
1038         return vmcs_config.cpu_based_2nd_exec_ctrl &
1039                 SECONDARY_EXEC_SHADOW_VMCS;
1040 }
1041
1042 static inline bool report_flexpriority(void)
1043 {
1044         return flexpriority_enabled;
1045 }
1046
1047 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1048 {
1049         return vmcs12->cpu_based_vm_exec_control & bit;
1050 }
1051
1052 static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1053 {
1054         return (vmcs12->cpu_based_vm_exec_control &
1055                         CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1056                 (vmcs12->secondary_vm_exec_control & bit);
1057 }
1058
1059 static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1060 {
1061         return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1062 }
1063
1064 static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1065 {
1066         return vmcs12->pin_based_vm_exec_control &
1067                 PIN_BASED_VMX_PREEMPTION_TIMER;
1068 }
1069
1070 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1071 {
1072         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1073 }
1074
1075 static inline bool is_exception(u32 intr_info)
1076 {
1077         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1078                 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
1079 }
1080
1081 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1082                               u32 exit_intr_info,
1083                               unsigned long exit_qualification);
1084 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1085                         struct vmcs12 *vmcs12,
1086                         u32 reason, unsigned long qualification);
1087
1088 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1089 {
1090         int i;
1091
1092         for (i = 0; i < vmx->nmsrs; ++i)
1093                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1094                         return i;
1095         return -1;
1096 }
1097
1098 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
1099 {
1100     struct {
1101         u64 vpid : 16;
1102         u64 rsvd : 48;
1103         u64 gva;
1104     } operand = { vpid, 0, gva };
1105
1106     asm volatile (__ex(ASM_VMX_INVVPID)
1107                   /* CF==1 or ZF==1 --> rc = -1 */
1108                   "; ja 1f ; ud2 ; 1:"
1109                   : : "a"(&operand), "c"(ext) : "cc", "memory");
1110 }
1111
1112 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
1113 {
1114         struct {
1115                 u64 eptp, gpa;
1116         } operand = {eptp, gpa};
1117
1118         asm volatile (__ex(ASM_VMX_INVEPT)
1119                         /* CF==1 or ZF==1 --> rc = -1 */
1120                         "; ja 1f ; ud2 ; 1:\n"
1121                         : : "a" (&operand), "c" (ext) : "cc", "memory");
1122 }
1123
1124 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1125 {
1126         int i;
1127
1128         i = __find_msr_index(vmx, msr);
1129         if (i >= 0)
1130                 return &vmx->guest_msrs[i];
1131         return NULL;
1132 }
1133
1134 static void vmcs_clear(struct vmcs *vmcs)
1135 {
1136         u64 phys_addr = __pa(vmcs);
1137         u8 error;
1138
1139         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
1140                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1141                       : "cc", "memory");
1142         if (error)
1143                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
1144                        vmcs, phys_addr);
1145 }
1146
1147 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1148 {
1149         vmcs_clear(loaded_vmcs->vmcs);
1150         loaded_vmcs->cpu = -1;
1151         loaded_vmcs->launched = 0;
1152 }
1153
1154 static void vmcs_load(struct vmcs *vmcs)
1155 {
1156         u64 phys_addr = __pa(vmcs);
1157         u8 error;
1158
1159         asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
1160                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1161                         : "cc", "memory");
1162         if (error)
1163                 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
1164                        vmcs, phys_addr);
1165 }
1166
#ifdef CONFIG_KEXEC
/*
 * One bit per CPU: set while crash_vmclear_local_loaded_vmcss() is allowed
 * to VMCLEAR that CPU's loaded VMCSs.  No CPU is enabled by default.
 */
static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;

/* Allow the crash-time VMCLEAR on @cpu. */
static inline void crash_enable_local_vmclear(int cpu)
{
	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

/* Forbid the crash-time VMCLEAR on @cpu. */
static inline void crash_disable_local_vmclear(int cpu)
{
	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

/* Non-zero when the crash-time VMCLEAR is currently allowed on @cpu. */
static inline int crash_local_vmclear_enabled(int cpu)
{
	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

/*
 * VMCLEAR every VMCS on the executing CPU's loaded_vmcss_on_cpu list,
 * provided that the VMCLEAR is enabled for this CPU.
 */
static void crash_vmclear_local_loaded_vmcss(void)
{
	int this_cpu = raw_smp_processor_id();
	struct loaded_vmcs *entry;

	if (crash_local_vmclear_enabled(this_cpu)) {
		list_for_each_entry(entry, &per_cpu(loaded_vmcss_on_cpu, this_cpu),
				    loaded_vmcss_on_cpu_link)
			vmcs_clear(entry->vmcs);
	}
}
#else
/* Without CONFIG_KEXEC the enable/disable hooks are no-ops. */
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC */
1206
1207 static void __loaded_vmcs_clear(void *arg)
1208 {
1209         struct loaded_vmcs *loaded_vmcs = arg;
1210         int cpu = raw_smp_processor_id();
1211
1212         if (loaded_vmcs->cpu != cpu)
1213                 return; /* vcpu migration can race with cpu offline */
1214         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1215                 per_cpu(current_vmcs, cpu) = NULL;
1216         crash_disable_local_vmclear(cpu);
1217         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1218
1219         /*
1220          * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
1221          * is before setting loaded_vmcs->vcpu to -1 which is done in
1222          * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
1223          * then adds the vmcs into percpu list before it is deleted.
1224          */
1225         smp_wmb();
1226
1227         loaded_vmcs_init(loaded_vmcs);
1228         crash_enable_local_vmclear(cpu);
1229 }
1230
1231 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1232 {
1233         int cpu = loaded_vmcs->cpu;
1234
1235         if (cpu != -1)
1236                 smp_call_function_single(cpu,
1237                          __loaded_vmcs_clear, loaded_vmcs, 1);
1238 }
1239
1240 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
1241 {
1242         if (vmx->vpid == 0)
1243                 return;
1244
1245         if (cpu_has_vmx_invvpid_single())
1246                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
1247 }
1248
1249 static inline void vpid_sync_vcpu_global(void)
1250 {
1251         if (cpu_has_vmx_invvpid_global())
1252                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1253 }
1254
/* Prefer a single-context VPID flush; fall back to the global one. */
static inline void vpid_sync_context(struct vcpu_vmx *vmx)
{
	if (!cpu_has_vmx_invvpid_single()) {
		vpid_sync_vcpu_global();
		return;
	}

	vpid_sync_vcpu_single(vmx);
}
1262
1263 static inline void ept_sync_global(void)
1264 {
1265         if (cpu_has_vmx_invept_global())
1266                 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1267 }
1268
1269 static inline void ept_sync_context(u64 eptp)
1270 {
1271         if (enable_ept) {
1272                 if (cpu_has_vmx_invept_context())
1273                         __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1274                 else
1275                         ept_sync_global();
1276         }
1277 }
1278
1279 static __always_inline unsigned long vmcs_readl(unsigned long field)
1280 {
1281         unsigned long value;
1282
1283         asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
1284                       : "=a"(value) : "d"(field) : "cc");
1285         return value;
1286 }
1287
1288 static __always_inline u16 vmcs_read16(unsigned long field)
1289 {
1290         return vmcs_readl(field);
1291 }
1292
1293 static __always_inline u32 vmcs_read32(unsigned long field)
1294 {
1295         return vmcs_readl(field);
1296 }
1297
1298 static __always_inline u64 vmcs_read64(unsigned long field)
1299 {
1300 #ifdef CONFIG_X86_64
1301         return vmcs_readl(field);
1302 #else
1303         return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
1304 #endif
1305 }
1306
1307 static noinline void vmwrite_error(unsigned long field, unsigned long value)
1308 {
1309         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
1310                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1311         dump_stack();
1312 }
1313
1314 static void vmcs_writel(unsigned long field, unsigned long value)
1315 {
1316         u8 error;
1317
1318         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
1319                        : "=q"(error) : "a"(value), "d"(field) : "cc");
1320         if (unlikely(error))
1321                 vmwrite_error(field, value);
1322 }
1323
1324 static void vmcs_write16(unsigned long field, u16 value)
1325 {
1326         vmcs_writel(field, value);
1327 }
1328
1329 static void vmcs_write32(unsigned long field, u32 value)
1330 {
1331         vmcs_writel(field, value);
1332 }
1333
1334 static void vmcs_write64(unsigned long field, u64 value)
1335 {
1336         vmcs_writel(field, value);
1337 #ifndef CONFIG_X86_64
1338         asm volatile ("");
1339         vmcs_writel(field+1, value >> 32);
1340 #endif
1341 }
1342
1343 static void vmcs_clear_bits(unsigned long field, u32 mask)
1344 {
1345         vmcs_writel(field, vmcs_readl(field) & ~mask);
1346 }
1347
1348 static void vmcs_set_bits(unsigned long field, u32 mask)
1349 {
1350         vmcs_writel(field, vmcs_readl(field) | mask);
1351 }
1352
1353 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
1354 {
1355         vmcs_write32(VM_ENTRY_CONTROLS, val);
1356         vmx->vm_entry_controls_shadow = val;
1357 }
1358
1359 static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
1360 {
1361         if (vmx->vm_entry_controls_shadow != val)
1362                 vm_entry_controls_init(vmx, val);
1363 }
1364
1365 static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
1366 {
1367         return vmx->vm_entry_controls_shadow;
1368 }
1369
1370
1371 static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1372 {
1373         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
1374 }
1375
1376 static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1377 {
1378         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
1379 }
1380
1381 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
1382 {
1383         vmcs_write32(VM_EXIT_CONTROLS, val);
1384         vmx->vm_exit_controls_shadow = val;
1385 }
1386
1387 static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
1388 {
1389         if (vmx->vm_exit_controls_shadow != val)
1390                 vm_exit_controls_init(vmx, val);
1391 }
1392
1393 static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
1394 {
1395         return vmx->vm_exit_controls_shadow;
1396 }
1397
1398
1399 static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1400 {
1401         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
1402 }
1403
1404 static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1405 {
1406         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
1407 }
1408
1409 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1410 {
1411         vmx->segment_cache.bitmask = 0;
1412 }
1413
1414 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1415                                        unsigned field)
1416 {
1417         bool ret;
1418         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1419
1420         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
1421                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
1422                 vmx->segment_cache.bitmask = 0;
1423         }
1424         ret = vmx->segment_cache.bitmask & mask;
1425         vmx->segment_cache.bitmask |= mask;
1426         return ret;
1427 }
1428
1429 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1430 {
1431         u16 *p = &vmx->segment_cache.seg[seg].selector;
1432
1433         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1434                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1435         return *p;
1436 }
1437
1438 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1439 {
1440         ulong *p = &vmx->segment_cache.seg[seg].base;
1441
1442         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1443                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1444         return *p;
1445 }
1446
1447 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1448 {
1449         u32 *p = &vmx->segment_cache.seg[seg].limit;
1450
1451         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1452                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1453         return *p;
1454 }
1455
1456 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1457 {
1458         u32 *p = &vmx->segment_cache.seg[seg].ar;
1459
1460         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1461                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1462         return *p;
1463 }
1464
1465 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1466 {
1467         u32 eb;
1468
1469         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1470              (1u << NM_VECTOR) | (1u << DB_VECTOR);
1471         if ((vcpu->guest_debug &
1472              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1473             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1474                 eb |= 1u << BP_VECTOR;
1475         if (to_vmx(vcpu)->rmode.vm86_active)
1476                 eb = ~0;
1477         if (enable_ept)
1478                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1479         if (vcpu->fpu_active)
1480                 eb &= ~(1u << NM_VECTOR);
1481
1482         /* When we are running a nested L2 guest and L1 specified for it a
1483          * certain exception bitmap, we must trap the same exceptions and pass
1484          * them to L1. When running L2, we will only handle the exceptions
1485          * specified above if L1 did not want them.
1486          */
1487         if (is_guest_mode(vcpu))
1488                 eb |= get_vmcs12(vcpu)->exception_bitmap;
1489
1490         vmcs_write32(EXCEPTION_BITMAP, eb);
1491 }
1492
/* Clear one bit pair: @entry in the VM-entry controls, @exit in the VM-exit controls. */
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}
1499
1500 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1501 {
1502         unsigned i;
1503         struct msr_autoload *m = &vmx->msr_autoload;
1504
1505         switch (msr) {
1506         case MSR_EFER:
1507                 if (cpu_has_load_ia32_efer) {
1508                         clear_atomic_switch_msr_special(vmx,
1509                                         VM_ENTRY_LOAD_IA32_EFER,
1510                                         VM_EXIT_LOAD_IA32_EFER);
1511                         return;
1512                 }
1513                 break;
1514         case MSR_CORE_PERF_GLOBAL_CTRL:
1515                 if (cpu_has_load_perf_global_ctrl) {
1516                         clear_atomic_switch_msr_special(vmx,
1517                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1518                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1519                         return;
1520                 }
1521                 break;
1522         }
1523
1524         for (i = 0; i < m->nr; ++i)
1525                 if (m->guest[i].index == msr)
1526                         break;
1527
1528         if (i == m->nr)
1529                 return;
1530         --m->nr;
1531         m->guest[i] = m->guest[m->nr];
1532         m->host[i] = m->host[m->nr];
1533         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1534         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1535 }
1536
1537 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1538                 unsigned long entry, unsigned long exit,
1539                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1540                 u64 guest_val, u64 host_val)
1541 {
1542         vmcs_write64(guest_val_vmcs, guest_val);
1543         vmcs_write64(host_val_vmcs, host_val);
1544         vm_entry_controls_setbit(vmx, entry);
1545         vm_exit_controls_setbit(vmx, exit);
1546 }
1547
1548 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1549                                   u64 guest_val, u64 host_val)
1550 {
1551         unsigned i;
1552         struct msr_autoload *m = &vmx->msr_autoload;
1553
1554         switch (msr) {
1555         case MSR_EFER:
1556                 if (cpu_has_load_ia32_efer) {
1557                         add_atomic_switch_msr_special(vmx,
1558                                         VM_ENTRY_LOAD_IA32_EFER,
1559                                         VM_EXIT_LOAD_IA32_EFER,
1560                                         GUEST_IA32_EFER,
1561                                         HOST_IA32_EFER,
1562                                         guest_val, host_val);
1563                         return;
1564                 }
1565                 break;
1566         case MSR_CORE_PERF_GLOBAL_CTRL:
1567                 if (cpu_has_load_perf_global_ctrl) {
1568                         add_atomic_switch_msr_special(vmx,
1569                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1570                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1571                                         GUEST_IA32_PERF_GLOBAL_CTRL,
1572                                         HOST_IA32_PERF_GLOBAL_CTRL,
1573                                         guest_val, host_val);
1574                         return;
1575                 }
1576                 break;
1577         }
1578
1579         for (i = 0; i < m->nr; ++i)
1580                 if (m->guest[i].index == msr)
1581                         break;
1582
1583         if (i == NR_AUTOLOAD_MSRS) {
1584                 printk_once(KERN_WARNING "Not enough msr switch entries. "
1585                                 "Can't add msr %x\n", msr);
1586                 return;
1587         } else if (i == m->nr) {
1588                 ++m->nr;
1589                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1590                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1591         }
1592
1593         m->guest[i].index = msr;
1594         m->guest[i].value = guest_val;
1595         m->host[i].index = msr;
1596         m->host[i].value = host_val;
1597 }
1598
1599 static void reload_tss(void)
1600 {
1601         /*
1602          * VT restores TR but not its size.  Useless.
1603          */
1604         struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1605         struct desc_struct *descs;
1606
1607         descs = (void *)gdt->address;
1608         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
1609         load_TR_desc();
1610 }
1611
1612 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1613 {
1614         u64 guest_efer;
1615         u64 ignore_bits;
1616
1617         guest_efer = vmx->vcpu.arch.efer;
1618
1619         /*
1620          * NX is emulated; LMA and LME handled by hardware; SCE meaningless
1621          * outside long mode
1622          */
1623         ignore_bits = EFER_NX | EFER_SCE;
1624 #ifdef CONFIG_X86_64
1625         ignore_bits |= EFER_LMA | EFER_LME;
1626         /* SCE is meaningful only in long mode on Intel */
1627         if (guest_efer & EFER_LMA)
1628                 ignore_bits &= ~(u64)EFER_SCE;
1629 #endif
1630         guest_efer &= ~ignore_bits;
1631         guest_efer |= host_efer & ignore_bits;
1632         vmx->guest_msrs[efer_offset].data = guest_efer;
1633         vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
1634
1635         clear_atomic_switch_msr(vmx, MSR_EFER);
1636         /* On ept, can't emulate nx, and must switch nx atomically */
1637         if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
1638                 guest_efer = vmx->vcpu.arch.efer;
1639                 if (!(guest_efer & EFER_LMA))
1640                         guest_efer &= ~EFER_LME;
1641                 add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
1642                 return false;
1643         }
1644
1645         return true;
1646 }
1647
1648 static unsigned long segment_base(u16 selector)
1649 {
1650         struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1651         struct desc_struct *d;
1652         unsigned long table_base;
1653         unsigned long v;
1654
1655         if (!(selector & ~3))
1656                 return 0;
1657
1658         table_base = gdt->address;
1659
1660         if (selector & 4) {           /* from ldt */
1661                 u16 ldt_selector = kvm_read_ldt();
1662
1663                 if (!(ldt_selector & ~3))
1664                         return 0;
1665
1666                 table_base = segment_base(ldt_selector);
1667         }
1668         d = (struct desc_struct *)(table_base + (selector & ~7));
1669         v = get_desc_base(d);
1670 #ifdef CONFIG_X86_64
1671        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
1672                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
1673 #endif
1674         return v;
1675 }
1676
1677 static inline unsigned long kvm_read_tr_base(void)
1678 {
1679         u16 tr;
1680         asm("str %0" : "=g"(tr));
1681         return segment_base(tr);
1682 }
1683
1684 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
1685 {
1686         struct vcpu_vmx *vmx = to_vmx(vcpu);
1687         int i;
1688
1689         if (vmx->host_state.loaded)
1690                 return;
1691
1692         vmx->host_state.loaded = 1;
1693         /*
1694          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1695          * allow segment selectors with cpl > 0 or ti == 1.
1696          */
1697         vmx->host_state.ldt_sel = kvm_read_ldt();
1698         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
1699         savesegment(fs, vmx->host_state.fs_sel);
1700         if (!(vmx->host_state.fs_sel & 7)) {
1701                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
1702                 vmx->host_state.fs_reload_needed = 0;
1703         } else {
1704                 vmcs_write16(HOST_FS_SELECTOR, 0);
1705                 vmx->host_state.fs_reload_needed = 1;
1706         }
1707         savesegment(gs, vmx->host_state.gs_sel);
1708         if (!(vmx->host_state.gs_sel & 7))
1709                 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
1710         else {
1711                 vmcs_write16(HOST_GS_SELECTOR, 0);
1712                 vmx->host_state.gs_ldt_reload_needed = 1;
1713         }
1714
1715 #ifdef CONFIG_X86_64
1716         savesegment(ds, vmx->host_state.ds_sel);
1717         savesegment(es, vmx->host_state.es_sel);
1718 #endif
1719
1720 #ifdef CONFIG_X86_64
1721         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1722         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1723 #else
1724         vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
1725         vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
1726 #endif
1727
1728 #ifdef CONFIG_X86_64
1729         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1730         if (is_long_mode(&vmx->vcpu))
1731                 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1732 #endif
1733         if (boot_cpu_has(X86_FEATURE_MPX))
1734                 rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
1735         for (i = 0; i < vmx->save_nmsrs; ++i)
1736                 kvm_set_shared_msr(vmx->guest_msrs[i].index,
1737                                    vmx->guest_msrs[i].data,
1738                                    vmx->guest_msrs[i].mask);
1739 }
1740
/*
 * Undo vmx_save_host_state(): restore the host segment registers, TSS, GDT
 * and MSRs recorded in vmx->host_state.
 *
 * Does nothing when host_state.loaded is clear.  Otherwise it clears the
 * flag and bumps the host_state_reload statistic.  vmx_load_host_state()
 * wraps this call in preempt_disable()/preempt_enable().
 */
static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
	if (!vmx->host_state.loaded)
		return;

	++vmx->vcpu.stat.host_state_reload;
	vmx->host_state.loaded = 0;
#ifdef CONFIG_X86_64
	/* Capture the guest's MSR_KERNEL_GS_BASE before the host value returns. */
	if (is_long_mode(&vmx->vcpu))
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (vmx->host_state.gs_ldt_reload_needed) {
		kvm_load_ldt(vmx->host_state.ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(vmx->host_state.gs_sel);
#else
		loadsegment(gs, vmx->host_state.gs_sel);
#endif
	}
	if (vmx->host_state.fs_reload_needed)
		loadsegment(fs, vmx->host_state.fs_sel);
#ifdef CONFIG_X86_64
	/* ds/es are reloaded only when at least one saved selector is non-zero. */
	if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
		loadsegment(ds, vmx->host_state.ds_sel);
		loadsegment(es, vmx->host_state.es_sel);
	}
#endif
	reload_tss();
#ifdef CONFIG_X86_64
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	/* The saved BNDCFGS value is written back only when it is non-zero. */
	if (vmx->host_state.msr_host_bndcfgs)
		wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
	/*
	 * If the FPU is not active (through the host task or
	 * the guest vcpu), then restore the cr0.TS bit.
	 */
	if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
		stts();
	load_gdt(&__get_cpu_var(host_gdt));
}
1782
/*
 * Wrapper around __vmx_load_host_state() that runs it with preemption
 * disabled.
 */
static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
	preempt_disable();
	__vmx_load_host_state(vmx);
	preempt_enable();
}
1789
/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 *
 * Makes vmx->loaded_vmcs the current VMCS on @cpu.  When the VMCS was last
 * used on a different cpu, it is also added to @cpu's loaded_vmcss_on_cpu
 * list, a TLB flush is requested and the per-cpu host fields (TR base,
 * GDTR base, SYSENTER_ESP) are rewritten.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));

	/*
	 * Without vmm_exclusive, VMXON is executed on every load (and
	 * vmx_vcpu_put() executes VMXOFF).  With it, a VMCS last used on a
	 * different cpu is cleared first.
	 */
	if (!vmm_exclusive)
		kvm_cpu_vmxon(phys_addr);
	else if (vmx->loaded_vmcs->cpu != cpu)
		loaded_vmcs_clear(vmx->loaded_vmcs);

	/* Skip vmcs_load() when this VMCS is already current on @cpu. */
	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);
	}

	if (vmx->loaded_vmcs->cpu != cpu) {
		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
		unsigned long sysenter_esp;

		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		/* The per-cpu list is updated with local interrupts disabled. */
		local_irq_disable();
		crash_disable_local_vmclear(cpu);

		/*
		 * Read loaded_vmcs->cpu should be before fetching
		 * loaded_vmcs->loaded_vmcss_on_cpu_link.
		 * See the comments in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		crash_enable_local_vmclear(cpu);
		local_irq_enable();

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.
		 */
		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
		vmcs_writel(HOST_GDTR_BASE, gdt->address);   /* 22.2.4 */

		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
		vmx->loaded_vmcs->cpu = cpu;
	}
}
1841
1842 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1843 {
1844         __vmx_load_host_state(to_vmx(vcpu));
1845         if (!vmm_exclusive) {
1846                 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1847                 vcpu->cpu = -1;
1848                 kvm_cpu_vmxoff();
1849         }
1850 }
1851
1852 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1853 {
1854         ulong cr0;
1855
1856         if (vcpu->fpu_active)
1857                 return;
1858         vcpu->fpu_active = 1;
1859         cr0 = vmcs_readl(GUEST_CR0);
1860         cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
1861         cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
1862         vmcs_writel(GUEST_CR0, cr0);
1863         update_exception_bitmap(vcpu);
1864         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1865         if (is_guest_mode(vcpu))
1866                 vcpu->arch.cr0_guest_owned_bits &=
1867                         ~get_vmcs12(vcpu)->cr0_guest_host_mask;
1868         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1869 }
1870
1871 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1872
1873 /*
1874  * Return the cr0 value that a nested guest would read. This is a combination
1875  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
1876  * its hypervisor (cr0_read_shadow).
1877  */
1878 static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
1879 {
1880         return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
1881                 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
1882 }
1883 static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
1884 {
1885         return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
1886                 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
1887 }
1888
1889 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
1890 {
1891         /* Note that there is no vcpu->fpu_active = 0 here. The caller must
1892          * set this *before* calling this function.
1893          */
1894         vmx_decache_cr0_guest_bits(vcpu);
1895         vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
1896         update_exception_bitmap(vcpu);
1897         vcpu->arch.cr0_guest_owned_bits = 0;
1898         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1899         if (is_guest_mode(vcpu)) {
1900                 /*
1901                  * L1's specified read shadow might not contain the TS bit,
1902                  * so now that we turned on shadowing of this bit, we need to
1903                  * set this bit of the shadow. Like in nested_vmx_run we need
1904                  * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
1905                  * up-to-date here because we just decached cr0.TS (and we'll
1906                  * only update vmcs12->guest_cr0 on nested exit).
1907                  */
1908                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1909                 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
1910                         (vcpu->arch.cr0 & X86_CR0_TS);
1911                 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
1912         } else
1913                 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1914 }
1915
1916 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1917 {
1918         unsigned long rflags, save_rflags;
1919
1920         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
1921                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1922                 rflags = vmcs_readl(GUEST_RFLAGS);
1923                 if (to_vmx(vcpu)->rmode.vm86_active) {
1924                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1925                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1926                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1927                 }
1928                 to_vmx(vcpu)->rflags = rflags;
1929         }
1930         return to_vmx(vcpu)->rflags;
1931 }
1932
1933 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1934 {
1935         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1936         to_vmx(vcpu)->rflags = rflags;
1937         if (to_vmx(vcpu)->rmode.vm86_active) {
1938                 to_vmx(vcpu)->rmode.save_rflags = rflags;
1939                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1940         }
1941         vmcs_writel(GUEST_RFLAGS, rflags);
1942 }
1943
1944 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1945 {
1946         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1947         int ret = 0;
1948
1949         if (interruptibility & GUEST_INTR_STATE_STI)
1950                 ret |= KVM_X86_SHADOW_INT_STI;
1951         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1952                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1953
1954         return ret;
1955 }
1956
1957 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1958 {
1959         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1960         u32 interruptibility = interruptibility_old;
1961
1962         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1963
1964         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1965                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1966         else if (mask & KVM_X86_SHADOW_INT_STI)
1967                 interruptibility |= GUEST_INTR_STATE_STI;
1968
1969         if ((interruptibility != interruptibility_old))
1970                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1971 }
1972
1973 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1974 {
1975         unsigned long rip;
1976
1977         rip = kvm_rip_read(vcpu);
1978         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1979         kvm_rip_write(vcpu, rip);
1980
1981         /* skipping an emulated instruction also counts */
1982         vmx_set_interrupt_shadow(vcpu, 0);
1983 }
1984
1985 /*
1986  * KVM wants to inject page-faults which it got to the guest. This function
1987  * checks whether in a nested guest, we need to inject them to L1 or L2.
1988  */
1989 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
1990 {
1991         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1992
1993         if (!(vmcs12->exception_bitmap & (1u << nr)))
1994                 return 0;
1995
1996         nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
1997                           vmcs_read32(VM_EXIT_INTR_INFO),
1998                           vmcs_readl(EXIT_QUALIFICATION));
1999         return 1;
2000 }
2001
2002 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
2003                                 bool has_error_code, u32 error_code,
2004                                 bool reinject)
2005 {
2006         struct vcpu_vmx *vmx = to_vmx(vcpu);
2007         u32 intr_info = nr | INTR_INFO_VALID_MASK;
2008
2009         if (!reinject && is_guest_mode(vcpu) &&
2010             nested_vmx_check_exception(vcpu, nr))
2011                 return;
2012
2013         if (has_error_code) {
2014                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2015                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2016         }
2017
2018         if (vmx->rmode.vm86_active) {
2019                 int inc_eip = 0;
2020                 if (kvm_exception_is_soft(nr))
2021                         inc_eip = vcpu->arch.event_exit_inst_len;
2022                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
2023                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2024                 return;
2025         }
2026
2027         if (kvm_exception_is_soft(nr)) {
2028                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2029                              vmx->vcpu.arch.event_exit_inst_len);
2030                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2031         } else
2032                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2033
2034         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2035 }
2036
/* Report whether RDTSCP is supported; forwards cpu_has_vmx_rdtscp(). */
static bool vmx_rdtscp_supported(void)
{
	return cpu_has_vmx_rdtscp();
}
2041
2042 static bool vmx_invpcid_supported(void)
2043 {
2044         return cpu_has_vmx_invpcid() && enable_ept;
2045 }
2046
2047 /*
2048  * Swap MSR entry in host/guest MSR entry array.
2049  */
2050 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2051 {
2052         struct shared_msr_entry tmp;
2053
2054         tmp = vmx->guest_msrs[to];
2055         vmx->guest_msrs[to] = vmx->guest_msrs[from];
2056         vmx->guest_msrs[from] = tmp;
2057 }
2058
2059 static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2060 {
2061         unsigned long *msr_bitmap;
2062
2063         if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
2064                 if (is_long_mode(vcpu))
2065                         msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2066                 else
2067                         msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
2068         } else {
2069                 if (is_long_mode(vcpu))
2070                         msr_bitmap = vmx_msr_bitmap_longmode;
2071                 else
2072                         msr_bitmap = vmx_msr_bitmap_legacy;
2073         }
2074
2075         vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
2076 }
2077
2078 /*
2079  * Set up the vmcs to automatically save and restore system
2080  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
2081  * mode, as fiddling with msrs is very expensive.
2082  */
2083 static void setup_msrs(struct vcpu_vmx *vmx)
2084 {
2085         int save_nmsrs, index;
2086
2087         save_nmsrs = 0;
2088 #ifdef CONFIG_X86_64
2089         if (is_long_mode(&vmx->vcpu)) {
2090                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2091                 if (index >= 0)
2092                         move_msr_up(vmx, index, save_nmsrs++);
2093                 index = __find_msr_index(vmx, MSR_LSTAR);
2094                 if (index >= 0)
2095                         move_msr_up(vmx, index, save_nmsrs++);
2096                 index = __find_msr_index(vmx, MSR_CSTAR);
2097                 if (index >= 0)
2098                         move_msr_up(vmx, index, save_nmsrs++);
2099                 index = __find_msr_index(vmx, MSR_TSC_AUX);
2100                 if (index >= 0 && vmx->rdtscp_enabled)
2101                         move_msr_up(vmx, index, save_nmsrs++);
2102                 /*
2103                  * MSR_STAR is only needed on long mode guests, and only
2104                  * if efer.sce is enabled.
2105                  */
2106                 index = __find_msr_index(vmx, MSR_STAR);
2107                 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
2108                         move_msr_up(vmx, index, save_nmsrs++);
2109         }
2110 #endif
2111         index = __find_msr_index(vmx, MSR_EFER);
2112         if (index >= 0 && update_transition_efer(vmx, index))
2113                 move_msr_up(vmx, index, save_nmsrs++);
2114
2115         vmx->save_nmsrs = save_nmsrs;
2116
2117         if (cpu_has_vmx_msr_bitmap())
2118                 vmx_set_msr_bitmap(&vmx->vcpu);
2119 }
2120
2121 /*
2122  * reads and returns guest's timestamp counter "register"
2123  * guest_tsc = host_tsc + tsc_offset    -- 21.3
2124  */
2125 static u64 guest_read_tsc(void)
2126 {
2127         u64 host_tsc, tsc_offset;
2128
2129         rdtscll(host_tsc);
2130         tsc_offset = vmcs_read64(TSC_OFFSET);
2131         return host_tsc + tsc_offset;
2132 }
2133
2134 /*
2135  * Like guest_read_tsc, but always returns L1's notion of the timestamp
2136  * counter, even if a nested guest (L2) is currently running.
2137  */
2138 u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2139 {
2140         u64 tsc_offset;
2141
2142         tsc_offset = is_guest_mode(vcpu) ?
2143                 to_vmx(vcpu)->nested.vmcs01_tsc_offset :
2144                 vmcs_read64(TSC_OFFSET);
2145         return host_tsc + tsc_offset;
2146 }
2147
2148 /*
2149  * Engage any workarounds for mis-matched TSC rates.  Currently limited to
2150  * software catchup for faster rates on slower CPUs.
2151  */
2152 static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2153 {
2154         if (!scale)
2155                 return;
2156
2157         if (user_tsc_khz > tsc_khz) {
2158                 vcpu->arch.tsc_catchup = 1;
2159                 vcpu->arch.tsc_always_catchup = 1;
2160         } else
2161                 WARN(1, "user requested TSC rate below hardware speed\n");
2162 }
2163
/* Return the TSC_OFFSET field of the current VMCS. */
static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
{
	return vmcs_read64(TSC_OFFSET);
}
2168
2169 /*
2170  * writes 'offset' into guest's timestamp counter offset register
2171  */
2172 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2173 {
2174         if (is_guest_mode(vcpu)) {
2175                 /*
2176                  * We're here if L1 chose not to trap WRMSR to TSC. According
2177                  * to the spec, this should set L1's TSC; The offset that L1
2178                  * set for L2 remains unchanged, and still needs to be added
2179                  * to the newly set TSC to get L2's TSC.
2180                  */
2181                 struct vmcs12 *vmcs12;
2182                 to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
2183                 /* recalculate vmcs02.TSC_OFFSET: */
2184                 vmcs12 = get_vmcs12(vcpu);
2185                 vmcs_write64(TSC_OFFSET, offset +
2186                         (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2187                          vmcs12->tsc_offset : 0));
2188         } else {
2189                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2190                                            vmcs_read64(TSC_OFFSET), offset);
2191                 vmcs_write64(TSC_OFFSET, offset);
2192         }
2193 }
2194
2195 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
2196 {
2197         u64 offset = vmcs_read64(TSC_OFFSET);
2198
2199         vmcs_write64(TSC_OFFSET, offset + adjustment);
2200         if (is_guest_mode(vcpu)) {
2201                 /* Even when running L2, the adjustment needs to apply to L1 */
2202                 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
2203         } else
2204                 trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
2205                                            offset + adjustment);
2206 }
2207
/*
 * Return @target_tsc minus the value native_read_tsc() returns right now.
 * The subtraction is done in u64.
 */
static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
	return target_tsc - native_read_tsc();
}
2212
2213 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
2214 {
2215         struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
2216         return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
2217 }
2218
2219 /*
2220  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2221  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2222  * all guests if the "nested" module option is off, and can also be disabled
2223  * for a single guest by disabling its VMX cpuid bit.
2224  */
2225 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2226 {
2227         return nested && guest_cpuid_has_vmx(vcpu);
2228 }
2229
/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 * TODO: allow these variables to be modified (downgraded) by module options
 * or other means.
 */
static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
static u32 nested_vmx_true_procbased_ctls_low;
static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
static u32 nested_vmx_true_exit_ctls_low;
static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
static u32 nested_vmx_true_entry_ctls_low;
static u32 nested_vmx_misc_low, nested_vmx_misc_high;
static u32 nested_vmx_ept_caps;
static __init void nested_vmx_setup_ctls_msrs(void)
{
	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs12, will turn these bits on - and
	 * nested_vmx_exit_handled() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */

	/* pin-based controls */
	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
	      nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
	nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
	/* These bits are offered regardless of what the hardware MSR reported. */
	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;

	/* exit controls */
	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
		nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
	nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	nested_vmx_exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
	nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

	if (vmx_mpx_supported())
		nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;

	/* We support free control of debug control saving. */
	nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
		~VM_EXIT_SAVE_DEBUG_CONTROLS;

	/* entry controls */
	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
		nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
	nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
	nested_vmx_entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT;
	nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
				       VM_ENTRY_LOAD_IA32_EFER);
	if (vmx_mpx_supported())
		nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;

	/* We support free control of debug control loading. */
	nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
		~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* cpu-based controls */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
		nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
	nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	nested_vmx_procbased_ctls_high &=
		CPU_BASED_VIRTUAL_INTR_PENDING |
		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
		CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
		CPU_BASED_PAUSE_EXITING |
		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

	/* secondary cpu-based controls */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
		nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
	nested_vmx_secondary_ctls_low = 0;
	nested_vmx_secondary_ctls_high &=
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
		SECONDARY_EXEC_UNRESTRICTED_GUEST |
		SECONDARY_EXEC_WBINVD_EXITING;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
		nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
			 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
			 VMX_EPT_INVEPT_BIT;
		nested_vmx_ept_caps &= vmx_capability.ept;
		/*
		 * For nested guests, we don't do anything specific
		 * for single context invalidation. Hence, only advertise
		 * support for global context invalidation.
		 */
		nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
	} else
		nested_vmx_ept_caps = 0;

	/* miscellaneous data */
	rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
	nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
	nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT;
	nested_vmx_misc_high = 0;
}
2380
2381 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
2382 {
2383         /*
2384          * Bits 0 in high must be 0, and bits 1 in low must be 1.
2385          */
2386         return ((control & high) | low) == control;
2387 }
2388
2389 static inline u64 vmx_control_msr(u32 low, u32 high)
2390 {
2391         return low | ((u64)high << 32);
2392 }
2393
/*
 * Produce the value of VMX capability MSR @msr_index in *@pdata.
 *
 * The control MSRs are assembled from the nested_vmx_*_ctls_{low,high}
 * variables filled in by nested_vmx_setup_ctls_msrs(); the remaining ones
 * are constants chosen here.
 *
 * Returns 0 on success, non-0 otherwise (i.e. 1 for an MSR index that is
 * not handled by this function, in which case *@pdata is left untouched).
 */
static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		/*
		 * This MSR reports some information about VMX support. We
		 * should return information about the VMX we emulate for the
		 * guest, and the VMCS structure we give it - not about the
		 * VMX support of the underlying hardware.
		 */
		*pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
			   ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
			   (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
		break;
	/* The TRUE and non-TRUE pin-based MSRs report the same value. */
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
					nested_vmx_pinbased_ctls_high);
		break;
	/*
	 * For the procbased, exit and entry controls the TRUE variant differs
	 * from the plain one only in its low ("must be 1") half.
	 */
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
					nested_vmx_procbased_ctls_high);
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
					nested_vmx_procbased_ctls_high);
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
					nested_vmx_exit_ctls_high);
		break;
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
					nested_vmx_exit_ctls_high);
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
					nested_vmx_entry_ctls_high);
		break;
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
					nested_vmx_entry_ctls_high);
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(nested_vmx_misc_low,
					 nested_vmx_misc_high);
		break;
	/*
	 * These MSRs specify bits which the guest must keep fixed (on or off)
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = VMXON_CR0_ALWAYSON;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = -1ULL;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = VMXON_CR4_ALWAYSON;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = -1ULL;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
					nested_vmx_secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		/* Currently, no nested vpid support */
		*pdata = nested_vmx_ept_caps;
		break;
	default:
		return 1;
	}

	return 0;
}
2478
2479 /*
2480  * Reads an msr value (of 'msr_index') into 'pdata'.
2481  * Returns 0 on success, non-0 otherwise.
2482  * Assumes vcpu_load() was already called.
2483  */
2484 static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2485 {
2486         u64 data;
2487         struct shared_msr_entry *msr;
2488
2489         if (!pdata) {
2490                 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
2491                 return -EINVAL;
2492         }
2493
2494         switch (msr_index) {
2495 #ifdef CONFIG_X86_64
2496         case MSR_FS_BASE:
2497                 data = vmcs_readl(GUEST_FS_BASE);
2498                 break;
2499         case MSR_GS_BASE:
2500                 data = vmcs_readl(GUEST_GS_BASE);
2501                 break;
2502         case MSR_KERNEL_GS_BASE:
2503                 vmx_load_host_state(to_vmx(vcpu));
2504                 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
2505                 break;
2506 #endif
2507         case MSR_EFER:
2508                 return kvm_get_msr_common(vcpu, msr_index, pdata);
2509         case MSR_IA32_TSC:
2510                 data = guest_read_tsc();
2511                 break;
2512         case MSR_IA32_SYSENTER_CS:
2513                 data = vmcs_read32(GUEST_SYSENTER_CS);
2514                 break;
2515         case MSR_IA32_SYSENTER_EIP:
2516                 data = vmcs_readl(GUEST_SYSENTER_EIP);
2517                 break;
2518         case MSR_IA32_SYSENTER_ESP:
2519                 data = vmcs_readl(GUEST_SYSENTER_ESP);
2520                 break;
2521         case MSR_IA32_BNDCFGS:
2522                 if (!vmx_mpx_supported())
2523                         return 1;
2524                 data = vmcs_read64(GUEST_BNDCFGS);
2525                 break;
2526         case MSR_IA32_FEATURE_CONTROL:
2527                 if (!nested_vmx_allowed(vcpu))
2528                         return 1;
2529                 data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2530                 break;
2531         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2532                 if (!nested_vmx_allowed(vcpu))
2533                         return 1;
2534                 return vmx_get_vmx_msr(vcpu, msr_index, pdata);
2535         case MSR_TSC_AUX:
2536                 if (!to_vmx(vcpu)->rdtscp_enabled)
2537                         return 1;
2538                 /* Otherwise falls through */
2539         default:
2540                 msr = find_msr_entry(to_vmx(vcpu), msr_index);
2541                 if (msr) {
2542                         data = msr->data;
2543                         break;
2544                 }
2545                 return kvm_get_msr_common(vcpu, msr_index, pdata);
2546         }
2547
2548         *pdata = data;
2549         return 0;
2550 }
2551
2552 static void vmx_leave_nested(struct kvm_vcpu *vcpu);
2553
2554 /*
2555  * Writes msr value into into the appropriate "register".
2556  * Returns 0 on success, non-0 otherwise.
2557  * Assumes vcpu_load() was already called.
2558  */
2559 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2560 {
2561         struct vcpu_vmx *vmx = to_vmx(vcpu);
2562         struct shared_msr_entry *msr;
2563         int ret = 0;
2564         u32 msr_index = msr_info->index;
2565         u64 data = msr_info->data;
2566
2567         switch (msr_index) {
2568         case MSR_EFER:
2569                 ret = kvm_set_msr_common(vcpu, msr_info);
2570                 break;
2571 #ifdef CONFIG_X86_64
2572         case MSR_FS_BASE:
2573                 vmx_segment_cache_clear(vmx);
2574                 vmcs_writel(GUEST_FS_BASE, data);
2575                 break;
2576         case MSR_GS_BASE:
2577                 vmx_segment_cache_clear(vmx);
2578                 vmcs_writel(GUEST_GS_BASE, data);
2579                 break;
2580         case MSR_KERNEL_GS_BASE:
2581                 vmx_load_host_state(vmx);
2582                 vmx->msr_guest_kernel_gs_base = data;
2583                 break;
2584 #endif
2585         case MSR_IA32_SYSENTER_CS:
2586                 vmcs_write32(GUEST_SYSENTER_CS, data);
2587                 break;
2588         case MSR_IA32_SYSENTER_EIP:
2589                 vmcs_writel(GUEST_SYSENTER_EIP, data);
2590                 break;
2591         case MSR_IA32_SYSENTER_ESP:
2592                 vmcs_writel(GUEST_SYSENTER_ESP, data);
2593                 break;
2594         case MSR_IA32_BNDCFGS:
2595                 if (!vmx_mpx_supported())
2596                         return 1;
2597                 vmcs_write64(GUEST_BNDCFGS, data);
2598                 break;
2599         case MSR_IA32_TSC:
2600                 kvm_write_tsc(vcpu, msr_info);
2601                 break;
2602         case MSR_IA32_CR_PAT:
2603                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2604                         vmcs_write64(GUEST_IA32_PAT, data);
2605                         vcpu->arch.pat = data;
2606                         break;
2607                 }
2608                 ret = kvm_set_msr_common(vcpu, msr_info);
2609                 break;
2610         case MSR_IA32_TSC_ADJUST:
2611                 ret = kvm_set_msr_common(vcpu, msr_info);
2612                 break;
2613         case MSR_IA32_FEATURE_CONTROL:
2614                 if (!nested_vmx_allowed(vcpu) ||
2615                     (to_vmx(vcpu)->nested.msr_ia32_feature_control &
2616                      FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
2617                         return 1;
2618                 vmx->nested.msr_ia32_feature_control = data;
2619                 if (msr_info->host_initiated && data == 0)
2620                         vmx_leave_nested(vcpu);
2621                 break;
2622         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2623                 return 1; /* they are read-only */
2624         case MSR_TSC_AUX:
2625                 if (!vmx->rdtscp_enabled)
2626                         return 1;
2627                 /* Check reserved bit, higher 32 bits should be zero */
2628                 if ((data >> 32) != 0)
2629                         return 1;
2630                 /* Otherwise falls through */
2631         default:
2632                 msr = find_msr_entry(vmx, msr_index);
2633                 if (msr) {
2634                         msr->data = data;
2635                         if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
2636                                 preempt_disable();
2637                                 kvm_set_shared_msr(msr->index, msr->data,
2638                                                    msr->mask);
2639                                 preempt_enable();
2640                         }
2641                         break;
2642                 }
2643                 ret = kvm_set_msr_common(vcpu, msr_info);
2644         }
2645
2646         return ret;
2647 }
2648
2649 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2650 {
2651         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
2652         switch (reg) {
2653         case VCPU_REGS_RSP:
2654                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2655                 break;
2656         case VCPU_REGS_RIP:
2657                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2658                 break;
2659         case VCPU_EXREG_PDPTR:
2660                 if (enable_ept)
2661                         ept_save_pdptrs(vcpu);
2662                 break;
2663         default:
2664                 break;
2665         }
2666 }
2667
2668 static __init int cpu_has_kvm_support(void)
2669 {
2670         return cpu_has_vmx();
2671 }
2672
2673 static __init int vmx_disabled_by_bios(void)
2674 {
2675         u64 msr;
2676
2677         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
2678         if (msr & FEATURE_CONTROL_LOCKED) {
2679                 /* launched w/ TXT and VMX disabled */
2680                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2681                         && tboot_enabled())
2682                         return 1;
2683                 /* launched w/o TXT and VMX only enabled w/ TXT */
2684                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2685                         && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2686                         && !tboot_enabled()) {
2687                         printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
2688                                 "activate TXT before enabling KVM\n");
2689                         return 1;
2690                 }
2691                 /* launched w/o TXT and VMX disabled */
2692                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2693                         && !tboot_enabled())
2694                         return 1;
2695         }
2696
2697         return 0;
2698 }
2699
/*
 * Execute the VMXON instruction (ASM_VMX_VMXON_RAX) on the current CPU.
 *
 * @addr: value handed to the instruction through memory; hardware_enable()
 *        passes the physical address of this CPU's vmxarea.
 *
 * Nothing is returned and the asm has no output operands, so the outcome
 * of the instruction is not examined here.
 */
2700 static void kvm_cpu_vmxon(u64 addr)
2701 {
             /*
              * "a"(&addr) places the address of the local copy of 'addr' in
              * the A register; "m"(addr) lists that copy as a memory input.
              * Memory and the condition codes are declared clobbered.
              */
2702         asm volatile (ASM_VMX_VMXON_RAX
2703                         : : "a"(&addr), "m"(addr)
2704                         : "memory", "cc");
2705 }
2706
/*
 * Per-CPU enable path: set CR4.VMXE on the current CPU and, when
 * vmm_exclusive is set, execute VMXON.
 *
 * @garbage: unused.
 *
 * Returns -EBUSY if CR4.VMXE is already set on entry, 0 otherwise.
 */
2707 static int hardware_enable(void *garbage)
2708 {
2709         int cpu = raw_smp_processor_id();
2710         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2711         u64 old, test_bits;
2712
             /* CR4.VMXE already set: refuse rather than touch VMX state. */
2713         if (read_cr4() & X86_CR4_VMXE)
2714                 return -EBUSY;
2715
2716         INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2717
2718         /*
2719          * Now we can enable the vmclear operation in kdump
2720          * since the loaded_vmcss_on_cpu list on this cpu
2721          * has been initialized.
2722          *
2723          * Though the cpu is not in VMX operation now, there
2724          * is no problem to enable the vmclear operation
2725          * for the loaded_vmcss_on_cpu list is empty!
2726          */
2727         crash_enable_local_vmclear(cpu);
2728
2729         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2730
             /*
              * Bits that must all be set in IA32_FEATURE_CONTROL: the lock
              * bit and VMXON-outside-SMX, plus VMXON-inside-SMX when
              * tboot_enabled() is true.
              */
2731         test_bits = FEATURE_CONTROL_LOCKED;
2732         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
2733         if (tboot_enabled())
2734                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
2735
             /*
              * NOTE(review): this write is attempted even if 'old' already
              * has the lock bit set; presumably vmx_disabled_by_bios() keeps
              * that combination from reaching here -- confirm.
              */
2736         if ((old & test_bits) != test_bits) {
2737                 /* enable and lock */
2738                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
2739         }
2740         write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
2741
             /* VMXON is executed here only when vmm_exclusive is set. */
2742         if (vmm_exclusive) {
2743                 kvm_cpu_vmxon(phys_addr);
2744                 ept_sync_global();
2745         }
2746
             /* Record this CPU's GDT descriptor in the per-cpu host_gdt. */
2747         native_store_gdt(&__get_cpu_var(host_gdt));
2748
2749         return 0;
2750 }
2751
2752 static void vmclear_local_loaded_vmcss(void)
2753 {
2754         int cpu = raw_smp_processor_id();
2755         struct loaded_vmcs *v, *n;
2756
2757         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2758                                  loaded_vmcss_on_cpu_link)
2759                 __loaded_vmcs_clear(v);
2760 }
2761
2762
2763 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
2764  * tricks.
2765  */
2766 static void kvm_cpu_vmxoff(void)
2767 {
             /*
              * __ex() is this file's shorthand for
              * __kvm_handle_fault_on_reboot(); the asm takes no operands and
              * declares only the condition codes clobbered.
              */
2768         asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
2769 }
2770
2771 static void hardware_disable(void *garbage)
2772 {
2773         if (vmm_exclusive) {
2774                 vmclear_local_loaded_vmcss();
2775                 kvm_cpu_vmxoff();
2776         }
2777         write_cr4(read_cr4() & ~X86_CR4_VMXE);
2778 }
2779
2780 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2781                                       u32 msr, u32 *result)
2782 {
2783         u32 vmx_msr_low, vmx_msr_high;
2784         u32 ctl = ctl_min | ctl_opt;
2785
2786         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2787
2788         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2789         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2790
2791         /* Ensure minimum (required) set of control bits are supported. */
2792         if (ctl_min & ~ctl)
2793                 return -EIO;
2794
2795         *result = ctl;
2796         return 0;
2797 }
2798
2799 static __init bool allow_1_setting(u32 msr, u32 ctl)
2800 {
2801         u32 vmx_msr_low, vmx_msr_high;
2802
2803         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2804         return vmx_msr_high & ctl;
2805 }
2806
2807 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2808 {
2809         u32 vmx_msr_low, vmx_msr_high;
2810         u32 min, opt, min2, opt2;
2811         u32 _pin_based_exec_control = 0;
2812         u32 _cpu_based_exec_control = 0;
2813         u32 _cpu_based_2nd_exec_control = 0;
2814         u32 _vmexit_control = 0;
2815         u32 _vmentry_control = 0;
2816
2817         min = CPU_BASED_HLT_EXITING |
2818 #ifdef CONFIG_X86_64
2819               CPU_BASED_CR8_LOAD_EXITING |
2820               CPU_BASED_CR8_STORE_EXITING |
2821 #endif
2822               CPU_BASED_CR3_LOAD_EXITING |
2823               CPU_BASED_CR3_STORE_EXITING |
2824               CPU_BASED_USE_IO_BITMAPS |
2825               CPU_BASED_MOV_DR_EXITING |
2826               CPU_BASED_USE_TSC_OFFSETING |
2827               CPU_BASED_MWAIT_EXITING |
2828               CPU_BASED_MONITOR_EXITING |
2829               CPU_BASED_INVLPG_EXITING |
2830               CPU_BASED_RDPMC_EXITING;
2831
2832         opt = CPU_BASED_TPR_SHADOW |
2833               CPU_BASED_USE_MSR_BITMAPS |
2834               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2835         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2836                                 &_cpu_based_exec_control) < 0)
2837                 return -EIO;
2838 #ifdef CONFIG_X86_64
2839         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2840                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2841                                            ~CPU_BASED_CR8_STORE_EXITING;
2842 #endif
2843         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2844                 min2 = 0;
2845                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2846                         SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2847                         SECONDARY_EXEC_WBINVD_EXITING |
2848                         SECONDARY_EXEC_ENABLE_VPID |
2849                         SECONDARY_EXEC_ENABLE_EPT |
2850                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
2851                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2852                         SECONDARY_EXEC_RDTSCP |
2853                         SECONDARY_EXEC_ENABLE_INVPCID |
2854                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
2855                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2856                         SECONDARY_EXEC_SHADOW_VMCS;
2857                 if (adjust_vmx_controls(min2, opt2,
2858                                         MSR_IA32_VMX_PROCBASED_CTLS2,
2859                                         &_cpu_based_2nd_exec_control) < 0)
2860                         return -EIO;
2861         }
2862 #ifndef CONFIG_X86_64
2863         if (!(_cpu_based_2nd_exec_control &
2864                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2865                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2866 #endif
2867
2868         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2869                 _cpu_based_2nd_exec_control &= ~(
2870                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2871                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2872                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2873
2874         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2875                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
2876                    enabled */
2877                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2878                                              CPU_BASED_CR3_STORE_EXITING |
2879                                              CPU_BASED_INVLPG_EXITING);
2880                 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
2881                       vmx_capability.ept, vmx_capability.vpid);
2882         }
2883
2884         min = VM_EXIT_SAVE_DEBUG_CONTROLS;
2885 #ifdef CONFIG_X86_64
2886         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2887 #endif
2888         opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
2889                 VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
2890         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2891                                 &_vmexit_control) < 0)
2892                 return -EIO;
2893
2894         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2895         opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
2896         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2897                                 &_pin_based_exec_control) < 0)
2898                 return -EIO;
2899
2900         if (!(_cpu_based_2nd_exec_control &
2901                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
2902                 !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
2903                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2904
2905         min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
2906         opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
2907         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2908                                 &_vmentry_control) < 0)
2909                 return -EIO;
2910
2911         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2912
2913         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2914         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2915                 return -EIO;
2916
2917 #ifdef CONFIG_X86_64
2918         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2919         if (vmx_msr_high & (1u<<16))
2920                 return -EIO;
2921 #endif
2922
2923         /* Require Write-Back (WB) memory type for VMCS accesses. */
2924         if (((vmx_msr_high >> 18) & 15) != 6)
2925                 return -EIO;
2926
2927         vmcs_conf->size = vmx_msr_high & 0x1fff;
2928         vmcs_conf->order = get_order(vmcs_config.size);
2929         vmcs_conf->revision_id = vmx_msr_low;
2930
2931         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2932         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2933         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2934         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2935         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2936
2937         cpu_has_load_ia32_efer =
2938                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
2939                                 VM_ENTRY_LOAD_IA32_EFER)
2940                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
2941                                    VM_EXIT_LOAD_IA32_EFER);
2942
2943         cpu_has_load_perf_global_ctrl =
2944                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
2945                                 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
2946                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
2947                                    VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
2948
2949         /*
2950          * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
2951          * but due to arrata below it can't be used. Workaround is to use
2952          * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2953          *
2954          * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
2955          *
2956          * AAK155             (model 26)
2957          * AAP115             (model 30)
2958          * AAT100             (model 37)
2959          * BC86,AAY89,BD102   (model 44)
2960          * BA97               (model 46)
2961          *
2962          */
2963         if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
2964                 switch (boot_cpu_data.x86_model) {
2965                 case 26:
2966                 case 30:
2967                 case 37:
2968                 case 44:
2969                 case 46:
2970                         cpu_has_load_perf_global_ctrl = false;
2971                         printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2972                                         "does not work properly. Using workaround\n");
2973                         break;
2974                 default:
2975                         break;
2976                 }
2977         }
2978
2979         return 0;
2980 }
2981
2982 static struct vmcs *alloc_vmcs_cpu(int cpu)
2983 {
2984         int node = cpu_to_node(cpu);
2985         struct page *pages;
2986         struct vmcs *vmcs;
2987
2988         pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
2989         if (!pages)
2990                 return NULL;
2991         vmcs = page_address(pages);
2992         memset(vmcs, 0, vmcs_config.size);
2993         vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
2994         return vmcs;
2995 }
2996
/* Allocate a VMCS region for the CPU this code is currently running on. */
static struct vmcs *alloc_vmcs(void)
{
        int this_cpu = raw_smp_processor_id();

        return alloc_vmcs_cpu(this_cpu);
}
3001
3002 static void free_vmcs(struct vmcs *vmcs)
3003 {
3004         free_pages((unsigned long)vmcs, vmcs_config.order);
3005 }
3006
3007 /*
3008  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3009  */
3010 static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3011 {
3012         if (!loaded_vmcs->vmcs)
3013                 return;
3014         loaded_vmcs_clear(loaded_vmcs);
3015         free_vmcs(loaded_vmcs->vmcs);
3016         loaded_vmcs->vmcs = NULL;
3017 }
3018
3019 static void free_kvm_area(void)
3020 {
3021         int cpu;
3022
3023         for_each_possible_cpu(cpu) {
3024                 free_vmcs(per_cpu(vmxarea, cpu));
3025                 per_cpu(vmxarea, cpu) = NULL;
3026         }
3027 }
3028
3029 static void init_vmcs_shadow_fields(void)
3030 {
3031         int i, j;
3032
3033         /* No checks for read only fields yet */
3034
3035         for (i = j = 0; i < max_shadow_read_write_fields; i++) {
3036                 switch (shadow_read_write_fields[i]) {
3037                 case GUEST_BNDCFGS:
3038                         if (!vmx_mpx_supported())
3039                                 continue;
3040                         break;
3041                 default:
3042                         break;
3043                 }
3044
3045                 if (j < i)
3046                         shadow_read_write_fields[j] =
3047                                 shadow_read_write_fields[i];
3048                 j++;
3049         }
3050         max_shadow_read_write_fields = j;
3051
3052         /* shadowed fields guest access without vmexit */
3053         for (i = 0; i < max_shadow_read_write_fields; i++) {
3054                 clear_bit(shadow_read_write_fields[i],
3055                           vmx_vmwrite_bitmap);
3056                 clear_bit(shadow_read_write_fields[i],
3057                           vmx_vmread_bitmap);
3058         }
3059         for (i = 0; i < max_shadow_read_only_fields; i++)
3060                 clear_bit(shadow_read_only_fields[i],
3061                           vmx_vmread_bitmap);
3062 }
3063
3064 static __init int alloc_kvm_area(void)
3065 {
3066         int cpu;
3067
3068         for_each_possible_cpu(cpu) {
3069                 struct vmcs *vmcs;
3070
3071                 vmcs = alloc_vmcs_cpu(cpu);
3072                 if (!vmcs) {
3073                         free_kvm_area();
3074                         return -ENOMEM;
3075                 }
3076
3077                 per_cpu(vmxarea, cpu) = vmcs;
3078         }
3079         return 0;
3080 }
3081
/*
 * One-time setup: compute the global vmcs_config, switch off every
 * optional feature flag the CPU cannot back, adjust the kvm_x86_ops hooks
 * accordingly, and allocate the per-cpu VMCS areas.
 *
 * Returns -EIO if setup_vmcs_config() fails, otherwise the result of
 * alloc_kvm_area().
 */
3082 static __init int hardware_setup(void)
3083 {
3084         if (setup_vmcs_config(&vmcs_config) < 0)
3085                 return -EIO;
3086
3087         if (boot_cpu_has(X86_FEATURE_NX))
3088                 kvm_enable_efer_bits(EFER_NX);
3089
3090         if (!cpu_has_vmx_vpid())
3091                 enable_vpid = 0;
3092         if (!cpu_has_vmx_shadow_vmcs())
3093                 enable_shadow_vmcs = 0;
3094         if (enable_shadow_vmcs)
3095                 init_vmcs_shadow_fields();
3096
             /* Without 4-level EPT, everything that builds on EPT goes too. */
3097         if (!cpu_has_vmx_ept() ||
3098             !cpu_has_vmx_ept_4levels()) {
3099                 enable_ept = 0;
3100                 enable_unrestricted_guest = 0;
3101                 enable_ept_ad_bits = 0;
3102         }
3103
3104         if (!cpu_has_vmx_ept_ad_bits())
3105                 enable_ept_ad_bits = 0;
3106
3107         if (!cpu_has_vmx_unrestricted_guest())
3108                 enable_unrestricted_guest = 0;
3109
3110         if (!cpu_has_vmx_flexpriority())
3111                 flexpriority_enabled = 0;
3112
3113         if (!cpu_has_vmx_tpr_shadow())
3114                 kvm_x86_ops->update_cr8_intercept = NULL;
3115
             /* Reads enable_ept after the EPT checks above may have cleared it. */
3116         if (enable_ept && !cpu_has_vmx_ept_2m_page())
3117                 kvm_disable_largepages();
3118
3119         if (!cpu_has_vmx_ple())
3120                 ple_gap = 0;
3121
3122         if (!cpu_has_vmx_apicv())
3123                 enable_apicv = 0;
3124
             /*
              * With APICv the CR8-intercept hook is removed; without it the
              * APICv-specific hooks are removed and sync_pir_to_irr is
              * pointed at the dummy implementation.
              */
3125         if (enable_apicv)
3126                 kvm_x86_ops->update_cr8_intercept = NULL;
3127         else {
3128                 kvm_x86_ops->hwapic_irr_update = NULL;
3129                 kvm_x86_ops->deliver_posted_interrupt = NULL;
3130                 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
3131         }
3132
3133         if (nested)
3134                 nested_vmx_setup_ctls_msrs();
3135
3136         return alloc_kvm_area();
3137 }
3138
3139 static __exit void hardware_unsetup(void)
3140 {
3141         free_kvm_area();
3142 }
3143
3144 static bool emulation_required(struct kvm_vcpu *vcpu)
3145 {
3146         return emulate_invalid_guest_state && !guest_state_valid(vcpu);
3147 }
3148
3149 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3150                 struct kvm_segment *save)
3151 {
3152         if (!emulate_invalid_guest_state) {
3153                 /*
3154                  * CS and SS RPL should be equal during guest entry according
3155                  * to VMX spec, but in reality it is not always so. Since vcpu
3156                  * is in the middle of the transition from real mode to
3157                  * protected mode it is safe to assume that RPL 0 is a good
3158                  * default value.
3159                  */
3160                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3161                         save->selector &= ~SELECTOR_RPL_MASK;
3162                 save->dpl = save->selector & SELECTOR_RPL_MASK;
3163                 save->s = 1;
3164         }
3165         vmx_set_segment(vcpu, save, seg);
3166 }
3167
/*
 * Undo the state set up by enter_rmode(): clear vm86_active, restore TR
 * from the saved copy, merge RFLAGS and CR4.VME back from their saved or
 * shadow values, and reinstall the six data/code segments through
 * fix_pmode_seg().
 */
3168 static void enter_pmode(struct kvm_vcpu *vcpu)
3169 {
3170         unsigned long flags;
3171         struct vcpu_vmx *vmx = to_vmx(vcpu);
3172
3173         /*
3174          * Update real mode segment cache. It may be not up-to-date if segment
3175          * register was written while vcpu was in a guest mode.
3176          */
3177         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3178         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3179         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3180         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3181         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3182         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3183
3184         vmx->rmode.vm86_active = 0;
3185
3186         vmx_segment_cache_clear(vmx);
3187
             /* TR is restored from the copy in rmode.segs. */
3188         vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3189
             /*
              * Keep the bits in RMODE_GUEST_OWNED_EFLAGS_BITS from the
              * current RFLAGS; take all other bits from the saved value.
              */
3190         flags = vmcs_readl(GUEST_RFLAGS);
3191         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3192         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3193         vmcs_writel(GUEST_RFLAGS, flags);
3194
             /* GUEST_CR4.VME is made to match the bit in CR4_READ_SHADOW. */
3195         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3196                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3197
3198         update_exception_bitmap(vcpu);
3199
3200         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3201         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3202         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3203         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3204         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3205         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3206 }
3207
3208 static void fix_rmode_seg(int seg, struct kvm_segment *save)
3209 {
3210         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3211         struct kvm_segment var = *save;
3212
3213         var.dpl = 0x3;
3214         if (seg == VCPU_SREG_CS)
3215                 var.type = 0x3;
3216
3217         if (!emulate_invalid_guest_state) {
3218                 var.selector = var.base >> 4;
3219                 var.base = var.base & 0xffff0;
3220                 var.limit = 0xffff;
3221                 var.g = 0;
3222                 var.db = 0;
3223                 var.present = 1;
3224                 var.s = 1;
3225                 var.l = 0;
3226                 var.unusable = 0;
3227                 var.type = 0x3;
3228                 var.avl = 0;
3229                 if (save->base & 0xf)
3230                         printk_once(KERN_WARNING "kvm: segment base is not "
3231                                         "paragraph aligned when entering "
3232                                         "protected mode (seg=%d)", seg);
3233         }
3234
3235         vmcs_write16(sf->selector, var.selector);
3236         vmcs_write32(sf->base, var.base);
3237         vmcs_write32(sf->limit, var.limit);
3238         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3239 }
3240
/*
 * Switch the vcpu to the vm86-based real-mode setup: snapshot TR and the
 * six data/code segments into vmx->rmode.segs, mark vm86_active, point TR
 * at the TSS at kvm->arch.tss_addr, set IOPL and VM in RFLAGS (saving the
 * previous value), set CR4.VME, rewrite the segments with fix_rmode_seg(),
 * and reset the MMU context.
 */
3241 static void enter_rmode(struct kvm_vcpu *vcpu)
3242 {
3243         unsigned long flags;
3244         struct vcpu_vmx *vmx = to_vmx(vcpu);
3245
             /* Save the current segment state; enter_pmode() restores from it. */
3246         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3247         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3248         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3249         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3250         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3251         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3252         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3253
3254         vmx->rmode.vm86_active = 1;
3255
3256         /*
3257          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
3258          * vcpu. Warn the user that an update is overdue.
3259          */
3260         if (!vcpu->kvm->arch.tss_addr)
3261                 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
3262                              "called before entering vcpu\n");
3263
3264         vmx_segment_cache_clear(vmx);
3265
             /* TR is pointed at the TSS at tss_addr, RMODE_TSS_SIZE bytes long. */
3266         vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
3267         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3268         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3269
             /* The unmodified RFLAGS is kept in save_rflags for enter_pmode(). */
3270         flags = vmcs_readl(GUEST_RFLAGS);
3271         vmx->rmode.save_rflags = flags;
3272
3273         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3274
3275         vmcs_writel(GUEST_RFLAGS, flags);
3276         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3277         update_exception_bitmap(vcpu);
3278
3279         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3280         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3281         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3282         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3283         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3284         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3285
3286         kvm_mmu_reset_context(vcpu);
3287 }
3288
3289 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3290 {
3291         struct vcpu_vmx *vmx = to_vmx(vcpu);
3292         struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
3293
3294         if (!msr)
3295                 return;
3296
3297         /*
3298          * Force kernel_gs_base reloading before EFER changes, as control
3299          * of this msr depends on is_long_mode().
3300          */
3301         vmx_load_host_state(to_vmx(vcpu));
3302         vcpu->arch.efer = efer;
3303         if (efer & EFER_LMA) {
3304                 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3305                 msr->data = efer;
3306         } else {
3307                 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3308
3309                 msr->data = efer & ~EFER_LME;
3310         }
3311         setup_msrs(vmx);
3312 }
3313
3314 #ifdef CONFIG_X86_64
3315
3316 static void enter_lmode(struct kvm_vcpu *vcpu)
3317 {
3318         u32 guest_tr_ar;
3319
3320         vmx_segment_cache_clear(to_vmx(vcpu));
3321
3322         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3323         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
3324                 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3325                                      __func__);
3326                 vmcs_write32(GUEST_TR_AR_BYTES,
3327                              (guest_tr_ar & ~AR_TYPE_MASK)
3328                              | AR_TYPE_BUSY_64_TSS);
3329         }
3330         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3331 }
3332
3333 static void exit_lmode(struct kvm_vcpu *vcpu)
3334 {
3335         vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3336         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3337 }
3338
3339 #endif
3340
3341 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
3342 {
3343         vpid_sync_context(to_vmx(vcpu));
3344         if (enable_ept) {
3345                 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3346                         return;
3347                 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
3348         }
3349 }
3350
3351 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
3352 {
3353         ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
3354
3355         vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
3356         vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
3357 }
3358
3359 static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
3360 {
3361         if (enable_ept && is_paging(vcpu))
3362                 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3363         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3364 }
3365
3366 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
3367 {
3368         ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
3369
3370         vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
3371         vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
3372 }
3373
3374 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
3375 {
3376         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3377
3378         if (!test_bit(VCPU_EXREG_PDPTR,
3379                       (unsigned long *)&vcpu->arch.regs_dirty))
3380                 return;
3381
3382         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3383                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3384                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3385                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3386                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3387         }
3388 }
3389
3390 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3391 {
3392         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3393
3394         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3395                 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3396                 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3397                 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3398                 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3399         }
3400
3401         __set_bit(VCPU_EXREG_PDPTR,