2 * Kernel-based Virtual Machine driver for Linux
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * Avi Kivity <avi@qumranet.com>
12 * Yaniv Kamay <yaniv@qumranet.com>
14 * This work is licensed under the terms of the GNU GPL, version 2. See
15 * the COPYING file in the top-level directory.
23 #include <linux/kvm_host.h>
24 #include <linux/module.h>
25 #include <linux/kernel.h>
27 #include <linux/highmem.h>
28 #include <linux/sched.h>
29 #include <linux/moduleparam.h>
30 #include <linux/mod_devicetable.h>
31 #include <linux/ftrace_event.h>
32 #include <linux/slab.h>
33 #include <linux/tboot.h>
34 #include <linux/hrtimer.h>
35 #include "kvm_cache_regs.h"
41 #include <asm/virtext.h>
45 #include <asm/perf_event.h>
46 #include <asm/debugreg.h>
47 #include <asm/kexec.h>
/*
 * __ex(x) passes the instruction string x to __kvm_handle_fault_on_reboot().
 * __ex_clear(x, reg) passes x to ____kvm_handle_fault_on_reboot() together
 * with a second string built as "xor <reg> , <reg>".
 */
51 #define __ex(x) __kvm_handle_fault_on_reboot(x)
52 #define __ex_clear(x, reg) \
53 ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
55 MODULE_AUTHOR("Qumranet");
56 MODULE_LICENSE("GPL");
58 static const struct x86_cpu_id vmx_cpu_id[] = {
59 X86_FEATURE_MATCH(X86_FEATURE_VMX),
62 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
/*
 * Boolean module parameters. Every flag below is initialised to 1/true and
 * registered with module_param()/module_param_named(); the last macro
 * argument is the permission mask of the parameter.
 */
64 static bool __read_mostly enable_vpid = 1;
/* NOTE(review): literal 0444 here, while every sibling below spells the mask as S_IRUGO. */
65 module_param_named(vpid, enable_vpid, bool, 0444);
/* Exposed under the parameter name "flexpriority". */
67 static bool __read_mostly flexpriority_enabled = 1;
68 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
/* Exposed under the parameter name "ept". */
70 static bool __read_mostly enable_ept = 1;
71 module_param_named(ept, enable_ept, bool, S_IRUGO);
/* Exposed under the parameter name "unrestricted_guest". */
73 static bool __read_mostly enable_unrestricted_guest = 1;
74 module_param_named(unrestricted_guest,
75 enable_unrestricted_guest, bool, S_IRUGO);
/* Exposed under the parameter name "eptad". */
77 static bool __read_mostly enable_ept_ad_bits = 1;
78 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
/* The remaining flags use module_param(), so the parameter name equals the variable name. */
80 static bool __read_mostly emulate_invalid_guest_state = true;
81 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
83 static bool __read_mostly vmm_exclusive = 1;
84 module_param(vmm_exclusive, bool, S_IRUGO);
86 static bool __read_mostly fasteoi = 1;
87 module_param(fasteoi, bool, S_IRUGO);
89 static bool __read_mostly enable_apicv = 1;
90 module_param(enable_apicv, bool, S_IRUGO);
/* NOTE(review): parameter name and variable name are identical, so module_param() would suffice. */
92 static bool __read_mostly enable_shadow_vmcs = 1;
93 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
95 * If nested=1, nested virtualization is supported, i.e., a guest may use
96 * VMX and act as a hypervisor for its own guests. If nested=0, guests may
97 * not use VMX instructions.
/* Unlike the other boolean parameters in this file, "nested" is initialised to 0. */
99 static bool __read_mostly nested = 0;
100 module_param(nested, bool, S_IRUGO);
/* CR0 bit set made of NW and CD. */
102 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
/* CR0 bit set made of WP and NE. */
103 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
/* The unrestricted-guest set above, extended with PG and PE. */
104 #define KVM_VM_CR0_ALWAYS_ON \
105 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
/* CR4 bit set made of PVI, DE, PCE, OSFXSR and OSXMMEXCPT. */
106 #define KVM_CR4_GUEST_OWNED_BITS \
107 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
108 | X86_CR4_OSXMMEXCPT)
/* CR4 bit sets: PAE|VMXE for the PMODE variant; the RMODE variant adds VME. */
110 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
111 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
/* Every EFLAGS bit except IOPL and VM. */
113 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
115 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
118 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
119 * ple_gap: upper bound on the amount of time between two successive
120 * executions of PAUSE in a loop. Also indicates whether PLE is enabled.
121 * According to tests, this time is usually smaller than 128 cycles.
122 * ple_window: upper bound on the amount of time a guest is allowed to execute
123 * in a PAUSE loop. Tests indicate that most spinlocks are held for
124 * less than 2^12 cycles.
125 * Time is measured based on a counter that runs at the same rate as the TSC,
126 * refer to SDM volume 3b section 21.6.13 & 22.1.3.
/* Default values for the two Pause-Loop Exiting tunables. */
128 #define KVM_VMX_DEFAULT_PLE_GAP 128
129 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096
/* Both tunables are int module parameters initialised from the defaults above. */
130 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
131 module_param(ple_gap, int, S_IRUGO);
133 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
134 module_param(ple_window, int, S_IRUGO);
136 extern const ulong vmx_return;
138 #define NR_AUTOLOAD_MSRS 8
139 #define VMCS02_POOL_SIZE 1
148 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
149 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
150 * loaded on this CPU (so we can clear them if the CPU goes down).
156 struct list_head loaded_vmcss_on_cpu_link;
159 struct shared_msr_entry {
166 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
167 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
168 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
169 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
170 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
171 * More than one of these structures may exist, if L1 runs multiple L2 guests.
172 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
173 * underlying hardware which will be used to run L2.
174 * This structure is packed to ensure that its layout is identical across
175 * machines (necessary for live migration).
176 * If there are changes in this struct, VMCS12_REVISION must be changed.
178 typedef u64 natural_width;
179 struct __packed vmcs12 {
180 /* According to the Intel spec, a VMCS region must start with the
181 * following two fields. Then follow implementation-specific data.
186 u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
187 u32 padding[7]; /* room for future expansion */
192 u64 vm_exit_msr_store_addr;
193 u64 vm_exit_msr_load_addr;
194 u64 vm_entry_msr_load_addr;
196 u64 virtual_apic_page_addr;
197 u64 apic_access_addr;
199 u64 guest_physical_address;
200 u64 vmcs_link_pointer;
201 u64 guest_ia32_debugctl;
204 u64 guest_ia32_perf_global_ctrl;
212 u64 host_ia32_perf_global_ctrl;
213 u64 padding64[8]; /* room for future expansion */
215 * To allow migration of L1 (complete with its L2 guests) between
216 * machines of different natural widths (32 or 64 bit), we cannot have
217 * unsigned long fields with no explicit size. We use u64 (aliased
218 * natural_width) instead. Luckily, x86 is little-endian.
220 natural_width cr0_guest_host_mask;
221 natural_width cr4_guest_host_mask;
222 natural_width cr0_read_shadow;
223 natural_width cr4_read_shadow;
224 natural_width cr3_target_value0;
225 natural_width cr3_target_value1;
226 natural_width cr3_target_value2;
227 natural_width cr3_target_value3;
228 natural_width exit_qualification;
229 natural_width guest_linear_address;
230 natural_width guest_cr0;
231 natural_width guest_cr3;
232 natural_width guest_cr4;
233 natural_width guest_es_base;
234 natural_width guest_cs_base;
235 natural_width guest_ss_base;
236 natural_width guest_ds_base;
237 natural_width guest_fs_base;
238 natural_width guest_gs_base;
239 natural_width guest_ldtr_base;
240 natural_width guest_tr_base;
241 natural_width guest_gdtr_base;
242 natural_width guest_idtr_base;
243 natural_width guest_dr7;
244 natural_width guest_rsp;
245 natural_width guest_rip;
246 natural_width guest_rflags;
247 natural_width guest_pending_dbg_exceptions;
248 natural_width guest_sysenter_esp;
249 natural_width guest_sysenter_eip;
250 natural_width host_cr0;
251 natural_width host_cr3;
252 natural_width host_cr4;
253 natural_width host_fs_base;
254 natural_width host_gs_base;
255 natural_width host_tr_base;
256 natural_width host_gdtr_base;
257 natural_width host_idtr_base;
258 natural_width host_ia32_sysenter_esp;
259 natural_width host_ia32_sysenter_eip;
260 natural_width host_rsp;
261 natural_width host_rip;
262 natural_width paddingl[8]; /* room for future expansion */
263 u32 pin_based_vm_exec_control;
264 u32 cpu_based_vm_exec_control;
265 u32 exception_bitmap;
266 u32 page_fault_error_code_mask;
267 u32 page_fault_error_code_match;
268 u32 cr3_target_count;
269 u32 vm_exit_controls;
270 u32 vm_exit_msr_store_count;
271 u32 vm_exit_msr_load_count;
272 u32 vm_entry_controls;
273 u32 vm_entry_msr_load_count;
274 u32 vm_entry_intr_info_field;
275 u32 vm_entry_exception_error_code;
276 u32 vm_entry_instruction_len;
278 u32 secondary_vm_exec_control;
279 u32 vm_instruction_error;
281 u32 vm_exit_intr_info;
282 u32 vm_exit_intr_error_code;
283 u32 idt_vectoring_info_field;
284 u32 idt_vectoring_error_code;
285 u32 vm_exit_instruction_len;
286 u32 vmx_instruction_info;
293 u32 guest_ldtr_limit;
295 u32 guest_gdtr_limit;
296 u32 guest_idtr_limit;
297 u32 guest_es_ar_bytes;
298 u32 guest_cs_ar_bytes;
299 u32 guest_ss_ar_bytes;
300 u32 guest_ds_ar_bytes;
301 u32 guest_fs_ar_bytes;
302 u32 guest_gs_ar_bytes;
303 u32 guest_ldtr_ar_bytes;
304 u32 guest_tr_ar_bytes;
305 u32 guest_interruptibility_info;
306 u32 guest_activity_state;
307 u32 guest_sysenter_cs;
308 u32 host_ia32_sysenter_cs;
309 u32 vmx_preemption_timer_value;
310 u32 padding32[7]; /* room for future expansion */
311 u16 virtual_processor_id;
312 u16 guest_es_selector;
313 u16 guest_cs_selector;
314 u16 guest_ss_selector;
315 u16 guest_ds_selector;
316 u16 guest_fs_selector;
317 u16 guest_gs_selector;
318 u16 guest_ldtr_selector;
319 u16 guest_tr_selector;
320 u16 host_es_selector;
321 u16 host_cs_selector;
322 u16 host_ss_selector;
323 u16 host_ds_selector;
324 u16 host_fs_selector;
325 u16 host_gs_selector;
326 u16 host_tr_selector;
330 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
331 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
332 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
334 #define VMCS12_REVISION 0x11e57ed0
337 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
338 * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by
339 * the current implementation, 4K are reserved to avoid future complications.
341 #define VMCS12_SIZE 0x1000
343 /* Used to remember the last vmcs02 used for some recently used vmcs12s */
345 struct list_head list;
347 struct loaded_vmcs vmcs02;
351 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
352 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
355 /* Has the level1 guest done vmxon? */
359 /* The guest-physical address of the current VMCS L1 keeps for L2 */
361 /* The host-usable pointer to the above */
362 struct page *current_vmcs12_page;
363 struct vmcs12 *current_vmcs12;
364 struct vmcs *current_shadow_vmcs;
366 * Indicates whether the shadow vmcs must be updated with the
367 * data held by vmcs12
369 bool sync_shadow_vmcs;
371 /* vmcs02_list cache of VMCSs recently used to run L2 guests */
372 struct list_head vmcs02_pool;
374 u64 vmcs01_tsc_offset;
375 /* L2 must run next, and mustn't decide to exit to L1. */
376 bool nested_run_pending;
378 * Guest pages referred to in vmcs02 with host-physical pointers, so
379 * we must keep them pinned while L2 runs.
381 struct page *apic_access_page;
382 u64 msr_ia32_feature_control;
384 struct hrtimer preemption_timer;
385 bool preemption_timer_expired;
387 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
391 #define POSTED_INTR_ON 0
392 /* Posted-Interrupt Descriptor */
394 u32 pir[8]; /* Posted interrupt requested */
395 u32 control; /* bit 0 of control is outstanding notification bit */
/*
 * Calls test_and_set_bit() on bit POSTED_INTR_ON of pi_desc->control and
 * returns its result.
 * NOTE(review): control is declared u32 but is accessed through an
 * unsigned long * cast - confirm the bit operation's access width is safe here.
 */
399 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
401 return test_and_set_bit(POSTED_INTR_ON,
402 (unsigned long *)&pi_desc->control);
/*
 * Calls test_and_clear_bit() on bit POSTED_INTR_ON of pi_desc->control and
 * returns its result.
 * NOTE(review): same u32-through-unsigned-long* cast as pi_test_and_set_on().
 */
405 static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
407 return test_and_clear_bit(POSTED_INTR_ON,
408 (unsigned long *)&pi_desc->control);
/*
 * Calls test_and_set_bit() on bit number `vector` of the pi_desc->pir array
 * (declared as u32 pir[8], accessed through an unsigned long * cast) and
 * returns its result.
 * NOTE(review): returns int, whereas the two pi_test_and_*_on() helpers return bool.
 */
411 static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
413 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
417 struct kvm_vcpu vcpu;
418 unsigned long host_rsp;
420 bool nmi_known_unmasked;
422 u32 idt_vectoring_info;
424 struct shared_msr_entry *guest_msrs;
427 unsigned long host_idt_base;
429 u64 msr_host_kernel_gs_base;
430 u64 msr_guest_kernel_gs_base;
432 u32 vm_entry_controls_shadow;
433 u32 vm_exit_controls_shadow;
435 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
436 * non-nested (L1) guest, it always points to vmcs01. For a nested
437 * guest (L2), it points to a different VMCS.
439 struct loaded_vmcs vmcs01;
440 struct loaded_vmcs *loaded_vmcs;
441 bool __launched; /* temporary, used in vmx_vcpu_run */
442 struct msr_autoload {
444 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
445 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
449 u16 fs_sel, gs_sel, ldt_sel;
453 int gs_ldt_reload_needed;
454 int fs_reload_needed;
455 u64 msr_host_bndcfgs;
460 struct kvm_segment segs[8];
463 u32 bitmask; /* 4 bits per segment (1 bit per field) */
464 struct kvm_save_segment {
472 bool emulation_required;
474 /* Support for vnmi-less CPUs */
475 int soft_vnmi_blocked;
477 s64 vnmi_blocked_time;
482 /* Posted interrupt descriptor */
483 struct pi_desc pi_desc;
485 /* Support for a guest hypervisor (nested VMX) */
486 struct nested_vmx nested;
489 enum segment_cache_field {
/* Converts a kvm_vcpu pointer into the struct vcpu_vmx that embeds it as member `vcpu`. */
498 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
500 return container_of(vcpu, struct vcpu_vmx, vcpu);
/*
 * Designated-initializer helpers for tables indexed by field number:
 *   VMCS12_OFFSET(x) - byte offset of member x within struct vmcs12.
 *   FIELD(n, name)   - entry [n] holds the offset of `name`.
 *   FIELD64(n, name) - as FIELD, plus an entry [n##_HIGH] holding that
 *                      offset + 4.
 */
503 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
504 #define FIELD(number, name) [number] = VMCS12_OFFSET(name)
505 #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
506 [number##_HIGH] = VMCS12_OFFSET(name)+4
509 static unsigned long shadow_read_only_fields[] = {
511 * We do NOT shadow fields that are modified when L0
512 * traps and emulates any vmx instruction (e.g. VMPTRLD,
513 * VMXON...) executed by L1.
514 * For example, VM_INSTRUCTION_ERROR is read
515 * by L1 if a vmx instruction fails (part of the error path).
516 * Note the code assumes this logic. If for some reason
517 * we start shadowing these fields then we need to
518 * force a shadow sync when L0 emulates vmx instructions
519 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
520 * by nested_vmx_failValid)
524 VM_EXIT_INSTRUCTION_LEN,
525 IDT_VECTORING_INFO_FIELD,
526 IDT_VECTORING_ERROR_CODE,
527 VM_EXIT_INTR_ERROR_CODE,
529 GUEST_LINEAR_ADDRESS,
530 GUEST_PHYSICAL_ADDRESS
532 static int max_shadow_read_only_fields =
533 ARRAY_SIZE(shadow_read_only_fields);
/*
 * List of field encodings named "shadow read/write"; its element count is
 * captured in max_shadow_read_write_fields below via ARRAY_SIZE().
 */
535 static unsigned long shadow_read_write_fields[] = {
541 GUEST_INTERRUPTIBILITY_INFO,
554 CPU_BASED_VM_EXEC_CONTROL,
555 VM_ENTRY_EXCEPTION_ERROR_CODE,
556 VM_ENTRY_INTR_INFO_FIELD,
557 VM_ENTRY_INSTRUCTION_LEN,
/*
 * NOTE(review): VM_ENTRY_EXCEPTION_ERROR_CODE is already listed a few
 * entries above; this second occurrence is a duplicate and is also counted
 * by ARRAY_SIZE(). Consider removing it.
 */
558 VM_ENTRY_EXCEPTION_ERROR_CODE,
564 static int max_shadow_read_write_fields =
565 ARRAY_SIZE(shadow_read_write_fields);
567 static const unsigned short vmcs_field_to_offset_table[] = {
568 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
569 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
570 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
571 FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
572 FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
573 FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
574 FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
575 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
576 FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
577 FIELD(HOST_ES_SELECTOR, host_es_selector),
578 FIELD(HOST_CS_SELECTOR, host_cs_selector),
579 FIELD(HOST_SS_SELECTOR, host_ss_selector),
580 FIELD(HOST_DS_SELECTOR, host_ds_selector),
581 FIELD(HOST_FS_SELECTOR, host_fs_selector),
582 FIELD(HOST_GS_SELECTOR, host_gs_selector),
583 FIELD(HOST_TR_SELECTOR, host_tr_selector),
584 FIELD64(IO_BITMAP_A, io_bitmap_a),
585 FIELD64(IO_BITMAP_B, io_bitmap_b),
586 FIELD64(MSR_BITMAP, msr_bitmap),
587 FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
588 FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
589 FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
590 FIELD64(TSC_OFFSET, tsc_offset),
591 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
592 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
593 FIELD64(EPT_POINTER, ept_pointer),
594 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
595 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
596 FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
597 FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
598 FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
599 FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
600 FIELD64(GUEST_PDPTR0, guest_pdptr0),
601 FIELD64(GUEST_PDPTR1, guest_pdptr1),
602 FIELD64(GUEST_PDPTR2, guest_pdptr2),
603 FIELD64(GUEST_PDPTR3, guest_pdptr3),
604 FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
605 FIELD64(HOST_IA32_PAT, host_ia32_pat),
606 FIELD64(HOST_IA32_EFER, host_ia32_efer),
607 FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
608 FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
609 FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
610 FIELD(EXCEPTION_BITMAP, exception_bitmap),
611 FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
612 FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
613 FIELD(CR3_TARGET_COUNT, cr3_target_count),
614 FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
615 FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
616 FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
617 FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
618 FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
619 FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
620 FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
621 FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
622 FIELD(TPR_THRESHOLD, tpr_threshold),
623 FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
624 FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
625 FIELD(VM_EXIT_REASON, vm_exit_reason),
626 FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
627 FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
628 FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
629 FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
630 FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
631 FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
632 FIELD(GUEST_ES_LIMIT, guest_es_limit),
633 FIELD(GUEST_CS_LIMIT, guest_cs_limit),
634 FIELD(GUEST_SS_LIMIT, guest_ss_limit),
635 FIELD(GUEST_DS_LIMIT, guest_ds_limit),
636 FIELD(GUEST_FS_LIMIT, guest_fs_limit),
637 FIELD(GUEST_GS_LIMIT, guest_gs_limit),
638 FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
639 FIELD(GUEST_TR_LIMIT, guest_tr_limit),
640 FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
641 FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
642 FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
643 FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
644 FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
645 FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
646 FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
647 FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
648 FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
649 FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
650 FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
651 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
652 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
653 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
654 FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
655 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
656 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
657 FIELD(CR0_READ_SHADOW, cr0_read_shadow),
658 FIELD(CR4_READ_SHADOW, cr4_read_shadow),
659 FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
660 FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
661 FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
662 FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
663 FIELD(EXIT_QUALIFICATION, exit_qualification),
664 FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
665 FIELD(GUEST_CR0, guest_cr0),
666 FIELD(GUEST_CR3, guest_cr3),
667 FIELD(GUEST_CR4, guest_cr4),
668 FIELD(GUEST_ES_BASE, guest_es_base),
669 FIELD(GUEST_CS_BASE, guest_cs_base),
670 FIELD(GUEST_SS_BASE, guest_ss_base),
671 FIELD(GUEST_DS_BASE, guest_ds_base),
672 FIELD(GUEST_FS_BASE, guest_fs_base),
673 FIELD(GUEST_GS_BASE, guest_gs_base),
674 FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
675 FIELD(GUEST_TR_BASE, guest_tr_base),
676 FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
677 FIELD(GUEST_IDTR_BASE, guest_idtr_base),
678 FIELD(GUEST_DR7, guest_dr7),
679 FIELD(GUEST_RSP, guest_rsp),
680 FIELD(GUEST_RIP, guest_rip),
681 FIELD(GUEST_RFLAGS, guest_rflags),
682 FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
683 FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
684 FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
685 FIELD(HOST_CR0, host_cr0),
686 FIELD(HOST_CR3, host_cr3),
687 FIELD(HOST_CR4, host_cr4),
688 FIELD(HOST_FS_BASE, host_fs_base),
689 FIELD(HOST_GS_BASE, host_gs_base),
690 FIELD(HOST_TR_BASE, host_tr_base),
691 FIELD(HOST_GDTR_BASE, host_gdtr_base),
692 FIELD(HOST_IDTR_BASE, host_idtr_base),
693 FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
694 FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
695 FIELD(HOST_RSP, host_rsp),
696 FIELD(HOST_RIP, host_rip),
698 static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
700 static inline short vmcs_field_to_offset(unsigned long field)
702 if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
704 return vmcs_field_to_offset_table[field];
/* Returns the nested.current_vmcs12 pointer stored in this vcpu's vcpu_vmx. */
707 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
709 return to_vmx(vcpu)->nested.current_vmcs12;
712 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
714 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
715 if (is_error_page(page))
/* Releases `page` through kvm_release_page_dirty(). */
721 static void nested_release_page(struct page *page)
723 kvm_release_page_dirty(page);
/* Releases `page` through kvm_release_page_clean(). */
726 static void nested_release_page_clean(struct page *page)
728 kvm_release_page_clean(page);
/*
 * Forward declarations of static functions; none of their definitions
 * appears before this point in the file.
 */
731 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
732 static u64 construct_eptp(unsigned long root_hpa);
733 static void kvm_cpu_vmxon(u64 addr);
734 static void kvm_cpu_vmxoff(void);
735 static bool vmx_mpx_supported(void);
736 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
737 static void vmx_set_segment(struct kvm_vcpu *vcpu,
738 struct kvm_segment *var, int seg);
739 static void vmx_get_segment(struct kvm_vcpu *vcpu,
740 struct kvm_segment *var, int seg);
741 static bool guest_state_valid(struct kvm_vcpu *vcpu);
742 static u32 vmx_segment_access_rights(struct kvm_segment *var);
743 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
744 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
745 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
747 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
748 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
750 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
751 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
753 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
754 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
756 static unsigned long *vmx_io_bitmap_a;
757 static unsigned long *vmx_io_bitmap_b;
758 static unsigned long *vmx_msr_bitmap_legacy;
759 static unsigned long *vmx_msr_bitmap_longmode;
760 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
761 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
762 static unsigned long *vmx_vmread_bitmap;
763 static unsigned long *vmx_vmwrite_bitmap;
765 static bool cpu_has_load_ia32_efer;
766 static bool cpu_has_load_perf_global_ctrl;
768 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
769 static DEFINE_SPINLOCK(vmx_vpid_lock);
771 static struct vmcs_config {
775 u32 pin_based_exec_ctrl;
776 u32 cpu_based_exec_ctrl;
777 u32 cpu_based_2nd_exec_ctrl;
782 static struct vmx_capability {
787 #define VMX_SEGMENT_FIELD(seg) \
788 [VCPU_SREG_##seg] = { \
789 .selector = GUEST_##seg##_SELECTOR, \
790 .base = GUEST_##seg##_BASE, \
791 .limit = GUEST_##seg##_LIMIT, \
792 .ar_bytes = GUEST_##seg##_AR_BYTES, \
795 static const struct kvm_vmx_segment_field {
800 } kvm_vmx_segment_fields[] = {
801 VMX_SEGMENT_FIELD(CS),
802 VMX_SEGMENT_FIELD(DS),
803 VMX_SEGMENT_FIELD(ES),
804 VMX_SEGMENT_FIELD(FS),
805 VMX_SEGMENT_FIELD(GS),
806 VMX_SEGMENT_FIELD(SS),
807 VMX_SEGMENT_FIELD(TR),
808 VMX_SEGMENT_FIELD(LDTR),
811 static u64 host_efer;
813 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
816 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
817 * away by decrementing the array size.
819 static const u32 vmx_msr_index[] = {
821 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
823 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
/*
 * True when intr_info, masked to its type, vector and valid bits, equals
 * a valid INTR_TYPE_HARD_EXCEPTION with vector PF_VECTOR.
 */
826 static inline bool is_page_fault(u32 intr_info)
828 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
829 INTR_INFO_VALID_MASK)) ==
830 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
/*
 * True when intr_info, masked to its type, vector and valid bits, equals
 * a valid INTR_TYPE_HARD_EXCEPTION with vector NM_VECTOR.
 */
833 static inline bool is_no_device(u32 intr_info)
835 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
836 INTR_INFO_VALID_MASK)) ==
837 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
/*
 * True when intr_info, masked to its type, vector and valid bits, equals
 * a valid INTR_TYPE_HARD_EXCEPTION with vector UD_VECTOR.
 */
840 static inline bool is_invalid_opcode(u32 intr_info)
842 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
843 INTR_INFO_VALID_MASK)) ==
844 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
/*
 * True when the type and valid bits of intr_info describe a valid
 * INTR_TYPE_EXT_INTR event. The vector bits are not part of the mask, so
 * any vector matches.
 */
847 static inline bool is_external_interrupt(u32 intr_info)
849 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
850 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
/*
 * True when intr_info, masked to its type, vector and valid bits, equals
 * a valid INTR_TYPE_HARD_EXCEPTION with vector MC_VECTOR.
 */
853 static inline bool is_machine_check(u32 intr_info)
855 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
856 INTR_INFO_VALID_MASK)) ==
857 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
/* True when CPU_BASED_USE_MSR_BITMAPS is set in vmcs_config.cpu_based_exec_ctrl. */
860 static inline bool cpu_has_vmx_msr_bitmap(void)
862 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
/* True when CPU_BASED_TPR_SHADOW is set in vmcs_config.cpu_based_exec_ctrl. */
865 static inline bool cpu_has_vmx_tpr_shadow(void)
867 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
/* True when cpu_has_vmx_tpr_shadow() holds and irqchip_in_kernel(kvm) is true for this VM. */
870 static inline bool vm_need_tpr_shadow(struct kvm *kvm)
872 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
/* True when CPU_BASED_ACTIVATE_SECONDARY_CONTROLS is set in vmcs_config.cpu_based_exec_ctrl. */
875 static inline bool cpu_has_secondary_exec_ctrls(void)
877 return vmcs_config.cpu_based_exec_ctrl &
878 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/* True when SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
881 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
883 return vmcs_config.cpu_based_2nd_exec_ctrl &
884 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
/* True when SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
887 static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
889 return vmcs_config.cpu_based_2nd_exec_ctrl &
890 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
/* True when SECONDARY_EXEC_APIC_REGISTER_VIRT is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
893 static inline bool cpu_has_vmx_apic_register_virt(void)
895 return vmcs_config.cpu_based_2nd_exec_ctrl &
896 SECONDARY_EXEC_APIC_REGISTER_VIRT;
/* True when SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
899 static inline bool cpu_has_vmx_virtual_intr_delivery(void)
901 return vmcs_config.cpu_based_2nd_exec_ctrl &
902 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
/* True when PIN_BASED_POSTED_INTR is set in vmcs_config.pin_based_exec_ctrl. */
905 static inline bool cpu_has_vmx_posted_intr(void)
907 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
/*
 * True only when all three of cpu_has_vmx_apic_register_virt(),
 * cpu_has_vmx_virtual_intr_delivery() and cpu_has_vmx_posted_intr() hold.
 */
910 static inline bool cpu_has_vmx_apicv(void)
912 return cpu_has_vmx_apic_register_virt() &&
913 cpu_has_vmx_virtual_intr_delivery() &&
914 cpu_has_vmx_posted_intr();
/* True when both cpu_has_vmx_tpr_shadow() and cpu_has_vmx_virtualize_apic_accesses() hold. */
917 static inline bool cpu_has_vmx_flexpriority(void)
919 return cpu_has_vmx_tpr_shadow() &&
920 cpu_has_vmx_virtualize_apic_accesses();
/* True when VMX_EPT_EXECUTE_ONLY_BIT is set in vmx_capability.ept. */
923 static inline bool cpu_has_vmx_ept_execute_only(void)
925 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
/* True when VMX_EPTP_UC_BIT is set in vmx_capability.ept. */
928 static inline bool cpu_has_vmx_eptp_uncacheable(void)
930 return vmx_capability.ept & VMX_EPTP_UC_BIT;
/* True when VMX_EPTP_WB_BIT is set in vmx_capability.ept. */
933 static inline bool cpu_has_vmx_eptp_writeback(void)
935 return vmx_capability.ept & VMX_EPTP_WB_BIT;
/* True when VMX_EPT_2MB_PAGE_BIT is set in vmx_capability.ept. */
938 static inline bool cpu_has_vmx_ept_2m_page(void)
940 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
/* True when VMX_EPT_1GB_PAGE_BIT is set in vmx_capability.ept. */
943 static inline bool cpu_has_vmx_ept_1g_page(void)
945 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
/* True when VMX_EPT_PAGE_WALK_4_BIT is set in vmx_capability.ept. */
948 static inline bool cpu_has_vmx_ept_4levels(void)
950 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
/* True when VMX_EPT_AD_BIT is set in vmx_capability.ept. */
953 static inline bool cpu_has_vmx_ept_ad_bits(void)
955 return vmx_capability.ept & VMX_EPT_AD_BIT;
/* True when VMX_EPT_EXTENT_CONTEXT_BIT is set in vmx_capability.ept. */
958 static inline bool cpu_has_vmx_invept_context(void)
960 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
/* True when VMX_EPT_EXTENT_GLOBAL_BIT is set in vmx_capability.ept. */
963 static inline bool cpu_has_vmx_invept_global(void)
965 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
/* True when VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT is set in vmx_capability.vpid. */
968 static inline bool cpu_has_vmx_invvpid_single(void)
970 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
/* True when VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT is set in vmx_capability.vpid. */
973 static inline bool cpu_has_vmx_invvpid_global(void)
975 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
/* True when SECONDARY_EXEC_ENABLE_EPT is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
978 static inline bool cpu_has_vmx_ept(void)
980 return vmcs_config.cpu_based_2nd_exec_ctrl &
981 SECONDARY_EXEC_ENABLE_EPT;
/* True when SECONDARY_EXEC_UNRESTRICTED_GUEST is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
984 static inline bool cpu_has_vmx_unrestricted_guest(void)
986 return vmcs_config.cpu_based_2nd_exec_ctrl &
987 SECONDARY_EXEC_UNRESTRICTED_GUEST;
/* True when SECONDARY_EXEC_PAUSE_LOOP_EXITING is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
990 static inline bool cpu_has_vmx_ple(void)
992 return vmcs_config.cpu_based_2nd_exec_ctrl &
993 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
/* True when the flexpriority_enabled flag is set and irqchip_in_kernel(kvm) is true for this VM. */
996 static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
998 return flexpriority_enabled && irqchip_in_kernel(kvm);
/* True when SECONDARY_EXEC_ENABLE_VPID is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
1001 static inline bool cpu_has_vmx_vpid(void)
1003 return vmcs_config.cpu_based_2nd_exec_ctrl &
1004 SECONDARY_EXEC_ENABLE_VPID;
/* True when SECONDARY_EXEC_RDTSCP is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
1007 static inline bool cpu_has_vmx_rdtscp(void)
1009 return vmcs_config.cpu_based_2nd_exec_ctrl &
1010 SECONDARY_EXEC_RDTSCP;
/* True when SECONDARY_EXEC_ENABLE_INVPCID is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
1013 static inline bool cpu_has_vmx_invpcid(void)
1015 return vmcs_config.cpu_based_2nd_exec_ctrl &
1016 SECONDARY_EXEC_ENABLE_INVPCID;
/* True when PIN_BASED_VIRTUAL_NMIS is set in vmcs_config.pin_based_exec_ctrl. */
1019 static inline bool cpu_has_virtual_nmis(void)
1021 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
/* True when SECONDARY_EXEC_WBINVD_EXITING is set in vmcs_config.cpu_based_2nd_exec_ctrl. */
1024 static inline bool cpu_has_vmx_wbinvd_exit(void)
1026 return vmcs_config.cpu_based_2nd_exec_ctrl &
1027 SECONDARY_EXEC_WBINVD_EXITING;
1030 static inline bool cpu_has_vmx_shadow_vmcs(void)
1033 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1034 /* check if the cpu supports writing r/o exit information fields */
1035 if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1038 return vmcs_config.cpu_based_2nd_exec_ctrl &
1039 SECONDARY_EXEC_SHADOW_VMCS;
/* Returns the current value of the flexpriority_enabled flag. */
1042 static inline bool report_flexpriority(void)
1044 return flexpriority_enabled;
/* True when any bit of `bit` is set in vmcs12->cpu_based_vm_exec_control. */
1047 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1049 return vmcs12->cpu_based_vm_exec_control & bit;
/*
 * True when vmcs12->cpu_based_vm_exec_control has
 * CPU_BASED_ACTIVATE_SECONDARY_CONTROLS set and, in addition,
 * vmcs12->secondary_vm_exec_control has any bit of `bit` set.
 */
1052 static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1054 return (vmcs12->cpu_based_vm_exec_control &
1055 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1056 (vmcs12->secondary_vm_exec_control & bit);
/* True when PIN_BASED_VIRTUAL_NMIS is set in vmcs12->pin_based_vm_exec_control. */
1059 static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1061 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
/* True when PIN_BASED_VMX_PREEMPTION_TIMER is set in vmcs12->pin_based_vm_exec_control. */
1064 static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1066 return vmcs12->pin_based_vm_exec_control &
1067 PIN_BASED_VMX_PREEMPTION_TIMER;
/*
 * Returns nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT).
 * NOTE(review): declared int although the value comes from a bool helper
 * and the sibling nested_cpu_has*() predicates return bool.
 */
1070 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1072 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
/*
 * True when the type and valid bits of intr_info describe a valid
 * INTR_TYPE_HARD_EXCEPTION event. The vector bits are not part of the
 * mask, so any vector matches.
 */
1075 static inline bool is_exception(u32 intr_info)
1077 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1078 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
1081 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1083 unsigned long exit_qualification);
1084 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1085 struct vmcs12 *vmcs12,
1086 u32 reason, unsigned long qualification);
1088 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1092 for (i = 0; i < vmx->nmsrs; ++i)
1093 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1098 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
1104 } operand = { vpid, 0, gva };
1106 asm volatile (__ex(ASM_VMX_INVVPID)
1107 /* CF==1 or ZF==1 --> rc = -1 */
1108 "; ja 1f ; ud2 ; 1:"
1109 : : "a"(&operand), "c"(ext) : "cc", "memory");
1112 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
1116 } operand = {eptp, gpa};
1118 asm volatile (__ex(ASM_VMX_INVEPT)
1119 /* CF==1 or ZF==1 --> rc = -1 */
1120 "; ja 1f ; ud2 ; 1:\n"
1121 : : "a" (&operand), "c" (ext) : "cc", "memory");
1124 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1128 i = __find_msr_index(vmx, msr);
1130 return &vmx->guest_msrs[i];
1134 static void vmcs_clear(struct vmcs *vmcs)
1136 u64 phys_addr = __pa(vmcs);
1139 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
1140 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1143 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
1147 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1149 vmcs_clear(loaded_vmcs->vmcs);
1150 loaded_vmcs->cpu = -1;
1151 loaded_vmcs->launched = 0;
1154 static void vmcs_load(struct vmcs *vmcs)
1156 u64 phys_addr = __pa(vmcs);
1159 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
1160 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1163 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
1169 * This bitmap is used to indicate whether the vmclear
1170 * operation is enabled on all cpus. All disabled by
1173 static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1175 static inline void crash_enable_local_vmclear(int cpu)
1177 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1180 static inline void crash_disable_local_vmclear(int cpu)
1182 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1185 static inline int crash_local_vmclear_enabled(int cpu)
1187 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1190 static void crash_vmclear_local_loaded_vmcss(void)
1192 int cpu = raw_smp_processor_id();
1193 struct loaded_vmcs *v;
1195 if (!crash_local_vmclear_enabled(cpu))
1198 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1199 loaded_vmcss_on_cpu_link)
1200 vmcs_clear(v->vmcs);
/* No-op variant (presumably the build without CONFIG_KEXEC). */
static inline void crash_enable_local_vmclear(int cpu)
{
}
/* No-op variant (presumably the build without CONFIG_KEXEC). */
static inline void crash_disable_local_vmclear(int cpu)
{
}
1205 #endif /* CONFIG_KEXEC */
1207 static void __loaded_vmcs_clear(void *arg)
1209 struct loaded_vmcs *loaded_vmcs = arg;
1210 int cpu = raw_smp_processor_id();
1212 if (loaded_vmcs->cpu != cpu)
1213 return; /* vcpu migration can race with cpu offline */
1214 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1215 per_cpu(current_vmcs, cpu) = NULL;
1216 crash_disable_local_vmclear(cpu);
1217 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1220 * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
1221 * is before setting loaded_vmcs->vcpu to -1 which is done in
1222 * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
1223 * then adds the vmcs into percpu list before it is deleted.
1227 loaded_vmcs_init(loaded_vmcs);
1228 crash_enable_local_vmclear(cpu);
1231 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1233 int cpu = loaded_vmcs->cpu;
1236 smp_call_function_single(cpu,
1237 __loaded_vmcs_clear, loaded_vmcs, 1);
1240 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
1245 if (cpu_has_vmx_invvpid_single())
1246 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
1249 static inline void vpid_sync_vcpu_global(void)
1251 if (cpu_has_vmx_invvpid_global())
1252 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
/* Prefer a single-context vpid flush; fall back to the global one. */
static inline void vpid_sync_context(struct vcpu_vmx *vmx)
{
	if (!cpu_has_vmx_invvpid_single()) {
		vpid_sync_vcpu_global();
		return;
	}

	vpid_sync_vcpu_single(vmx);
}
1263 static inline void ept_sync_global(void)
1265 if (cpu_has_vmx_invept_global())
1266 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1269 static inline void ept_sync_context(u64 eptp)
1272 if (cpu_has_vmx_invept_context())
1273 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1279 static __always_inline unsigned long vmcs_readl(unsigned long field)
1281 unsigned long value;
1283 asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
1284 : "=a"(value) : "d"(field) : "cc");
1288 static __always_inline u16 vmcs_read16(unsigned long field)
1290 return vmcs_readl(field);
1293 static __always_inline u32 vmcs_read32(unsigned long field)
1295 return vmcs_readl(field);
1298 static __always_inline u64 vmcs_read64(unsigned long field)
1300 #ifdef CONFIG_X86_64
1301 return vmcs_readl(field);
1303 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
1307 static noinline void vmwrite_error(unsigned long field, unsigned long value)
1309 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
1310 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1314 static void vmcs_writel(unsigned long field, unsigned long value)
1318 asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
1319 : "=q"(error) : "a"(value), "d"(field) : "cc");
1320 if (unlikely(error))
1321 vmwrite_error(field, value);
1324 static void vmcs_write16(unsigned long field, u16 value)
1326 vmcs_writel(field, value);
1329 static void vmcs_write32(unsigned long field, u32 value)
1331 vmcs_writel(field, value);
1334 static void vmcs_write64(unsigned long field, u64 value)
1336 vmcs_writel(field, value);
1337 #ifndef CONFIG_X86_64
1339 vmcs_writel(field+1, value >> 32);
1343 static void vmcs_clear_bits(unsigned long field, u32 mask)
1345 vmcs_writel(field, vmcs_readl(field) & ~mask);
1348 static void vmcs_set_bits(unsigned long field, u32 mask)
1350 vmcs_writel(field, vmcs_readl(field) | mask);
1353 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
1355 vmcs_write32(VM_ENTRY_CONTROLS, val);
1356 vmx->vm_entry_controls_shadow = val;
1359 static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
1361 if (vmx->vm_entry_controls_shadow != val)
1362 vm_entry_controls_init(vmx, val);
1365 static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
1367 return vmx->vm_entry_controls_shadow;
1371 static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1373 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
1376 static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1378 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
1381 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
1383 vmcs_write32(VM_EXIT_CONTROLS, val);
1384 vmx->vm_exit_controls_shadow = val;
1387 static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
1389 if (vmx->vm_exit_controls_shadow != val)
1390 vm_exit_controls_init(vmx, val);
1393 static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
1395 return vmx->vm_exit_controls_shadow;
1399 static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1401 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
1404 static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1406 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
1409 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1411 vmx->segment_cache.bitmask = 0;
1414 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1418 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1420 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
1421 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
1422 vmx->segment_cache.bitmask = 0;
1424 ret = vmx->segment_cache.bitmask & mask;
1425 vmx->segment_cache.bitmask |= mask;
1429 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1431 u16 *p = &vmx->segment_cache.seg[seg].selector;
1433 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1434 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1438 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1440 ulong *p = &vmx->segment_cache.seg[seg].base;
1442 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1443 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1447 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1449 u32 *p = &vmx->segment_cache.seg[seg].limit;
1451 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1452 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1456 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1458 u32 *p = &vmx->segment_cache.seg[seg].ar;
1460 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1461 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1465 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1469 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1470 (1u << NM_VECTOR) | (1u << DB_VECTOR);
1471 if ((vcpu->guest_debug &
1472 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1473 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1474 eb |= 1u << BP_VECTOR;
1475 if (to_vmx(vcpu)->rmode.vm86_active)
1478 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1479 if (vcpu->fpu_active)
1480 eb &= ~(1u << NM_VECTOR);
1482 /* When we are running a nested L2 guest and L1 specified for it a
1483 * certain exception bitmap, we must trap the same exceptions and pass
1484 * them to L1. When running L2, we will only handle the exceptions
1485 * specified above if L1 did not want them.
1487 if (is_guest_mode(vcpu))
1488 eb |= get_vmcs12(vcpu)->exception_bitmap;
1490 vmcs_write32(EXCEPTION_BITMAP, eb);
/* Drop the given VM-entry and VM-exit control bits for a dedicated-field MSR. */
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}
1500 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1503 struct msr_autoload *m = &vmx->msr_autoload;
1507 if (cpu_has_load_ia32_efer) {
1508 clear_atomic_switch_msr_special(vmx,
1509 VM_ENTRY_LOAD_IA32_EFER,
1510 VM_EXIT_LOAD_IA32_EFER);
1514 case MSR_CORE_PERF_GLOBAL_CTRL:
1515 if (cpu_has_load_perf_global_ctrl) {
1516 clear_atomic_switch_msr_special(vmx,
1517 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1518 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1524 for (i = 0; i < m->nr; ++i)
1525 if (m->guest[i].index == msr)
1531 m->guest[i] = m->guest[m->nr];
1532 m->host[i] = m->host[m->nr];
1533 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1534 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1537 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1538 unsigned long entry, unsigned long exit,
1539 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1540 u64 guest_val, u64 host_val)
1542 vmcs_write64(guest_val_vmcs, guest_val);
1543 vmcs_write64(host_val_vmcs, host_val);
1544 vm_entry_controls_setbit(vmx, entry);
1545 vm_exit_controls_setbit(vmx, exit);
1548 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1549 u64 guest_val, u64 host_val)
1552 struct msr_autoload *m = &vmx->msr_autoload;
1556 if (cpu_has_load_ia32_efer) {
1557 add_atomic_switch_msr_special(vmx,
1558 VM_ENTRY_LOAD_IA32_EFER,
1559 VM_EXIT_LOAD_IA32_EFER,
1562 guest_val, host_val);
1566 case MSR_CORE_PERF_GLOBAL_CTRL:
1567 if (cpu_has_load_perf_global_ctrl) {
1568 add_atomic_switch_msr_special(vmx,
1569 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1570 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1571 GUEST_IA32_PERF_GLOBAL_CTRL,
1572 HOST_IA32_PERF_GLOBAL_CTRL,
1573 guest_val, host_val);
1579 for (i = 0; i < m->nr; ++i)
1580 if (m->guest[i].index == msr)
1583 if (i == NR_AUTOLOAD_MSRS) {
1584 printk_once(KERN_WARNING "Not enough msr switch entries. "
1585 "Can't add msr %x\n", msr);
1587 } else if (i == m->nr) {
1589 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1590 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1593 m->guest[i].index = msr;
1594 m->guest[i].value = guest_val;
1595 m->host[i].index = msr;
1596 m->host[i].value = host_val;
1599 static void reload_tss(void)
1602 * VT restores TR but not its size. Useless.
1604 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1605 struct desc_struct *descs;
1607 descs = (void *)gdt->address;
1608 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
1612 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1617 guest_efer = vmx->vcpu.arch.efer;
1620 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
1623 ignore_bits = EFER_NX | EFER_SCE;
1624 #ifdef CONFIG_X86_64
1625 ignore_bits |= EFER_LMA | EFER_LME;
1626 /* SCE is meaningful only in long mode on Intel */
1627 if (guest_efer & EFER_LMA)
1628 ignore_bits &= ~(u64)EFER_SCE;
1630 guest_efer &= ~ignore_bits;
1631 guest_efer |= host_efer & ignore_bits;
1632 vmx->guest_msrs[efer_offset].data = guest_efer;
1633 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
1635 clear_atomic_switch_msr(vmx, MSR_EFER);
1636 /* On ept, can't emulate nx, and must switch nx atomically */
1637 if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
1638 guest_efer = vmx->vcpu.arch.efer;
1639 if (!(guest_efer & EFER_LMA))
1640 guest_efer &= ~EFER_LME;
1641 add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
1648 static unsigned long segment_base(u16 selector)
1650 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1651 struct desc_struct *d;
1652 unsigned long table_base;
1655 if (!(selector & ~3))
1658 table_base = gdt->address;
1660 if (selector & 4) { /* from ldt */
1661 u16 ldt_selector = kvm_read_ldt();
1663 if (!(ldt_selector & ~3))
1666 table_base = segment_base(ldt_selector);
1668 d = (struct desc_struct *)(table_base + (selector & ~7));
1669 v = get_desc_base(d);
1670 #ifdef CONFIG_X86_64
1671 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
1672 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
1677 static inline unsigned long kvm_read_tr_base(void)
1680 asm("str %0" : "=g"(tr));
1681 return segment_base(tr);
1684 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
1686 struct vcpu_vmx *vmx = to_vmx(vcpu);
1689 if (vmx->host_state.loaded)
1692 vmx->host_state.loaded = 1;
1694 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1695 * allow segment selectors with cpl > 0 or ti == 1.
1697 vmx->host_state.ldt_sel = kvm_read_ldt();
1698 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
1699 savesegment(fs, vmx->host_state.fs_sel);
1700 if (!(vmx->host_state.fs_sel & 7)) {
1701 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
1702 vmx->host_state.fs_reload_needed = 0;
1704 vmcs_write16(HOST_FS_SELECTOR, 0);
1705 vmx->host_state.fs_reload_needed = 1;
1707 savesegment(gs, vmx->host_state.gs_sel);
1708 if (!(vmx->host_state.gs_sel & 7))
1709 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
1711 vmcs_write16(HOST_GS_SELECTOR, 0);
1712 vmx->host_state.gs_ldt_reload_needed = 1;
1715 #ifdef CONFIG_X86_64
1716 savesegment(ds, vmx->host_state.ds_sel);
1717 savesegment(es, vmx->host_state.es_sel);
1720 #ifdef CONFIG_X86_64
1721 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1722 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1724 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
1725 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
1728 #ifdef CONFIG_X86_64
1729 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1730 if (is_long_mode(&vmx->vcpu))
1731 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1733 if (boot_cpu_has(X86_FEATURE_MPX))
1734 rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
1735 for (i = 0; i < vmx->save_nmsrs; ++i)
1736 kvm_set_shared_msr(vmx->guest_msrs[i].index,
1737 vmx->guest_msrs[i].data,
1738 vmx->guest_msrs[i].mask);
1741 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
1743 if (!vmx->host_state.loaded)
1746 ++vmx->vcpu.stat.host_state_reload;
1747 vmx->host_state.loaded = 0;
1748 #ifdef CONFIG_X86_64
1749 if (is_long_mode(&vmx->vcpu))
1750 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1752 if (vmx->host_state.gs_ldt_reload_needed) {
1753 kvm_load_ldt(vmx->host_state.ldt_sel);
1754 #ifdef CONFIG_X86_64
1755 load_gs_index(vmx->host_state.gs_sel);
1757 loadsegment(gs, vmx->host_state.gs_sel);
1760 if (vmx->host_state.fs_reload_needed)
1761 loadsegment(fs, vmx->host_state.fs_sel);
1762 #ifdef CONFIG_X86_64
1763 if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
1764 loadsegment(ds, vmx->host_state.ds_sel);
1765 loadsegment(es, vmx->host_state.es_sel);
1769 #ifdef CONFIG_X86_64
1770 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1772 if (vmx->host_state.msr_host_bndcfgs)
1773 wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
1775 * If the FPU is not active (through the host task or
1776 * the guest vcpu), then restore the cr0.TS bit.
1778 if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
1780 load_gdt(&__get_cpu_var(host_gdt));
/* Preemption-safe wrapper around __vmx_load_host_state(). */
static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
	preempt_disable();
	__vmx_load_host_state(vmx);
	preempt_enable();
}
1791 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1792 * vcpu mutex is already taken.
1794 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1796 struct vcpu_vmx *vmx = to_vmx(vcpu);
1797 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1800 kvm_cpu_vmxon(phys_addr);
1801 else if (vmx->loaded_vmcs->cpu != cpu)
1802 loaded_vmcs_clear(vmx->loaded_vmcs);
1804 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
1805 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1806 vmcs_load(vmx->loaded_vmcs->vmcs);
1809 if (vmx->loaded_vmcs->cpu != cpu) {
1810 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1811 unsigned long sysenter_esp;
1813 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1814 local_irq_disable();
1815 crash_disable_local_vmclear(cpu);
1818 * Read loaded_vmcs->cpu should be before fetching
1819 * loaded_vmcs->loaded_vmcss_on_cpu_link.
1820 * See the comments in __loaded_vmcs_clear().
1824 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1825 &per_cpu(loaded_vmcss_on_cpu, cpu));
1826 crash_enable_local_vmclear(cpu);
1830 * Linux uses per-cpu TSS and GDT, so set these when switching
1833 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
1834 vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
1836 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1837 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1838 vmx->loaded_vmcs->cpu = cpu;
1842 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1844 __vmx_load_host_state(to_vmx(vcpu));
1845 if (!vmm_exclusive) {
1846 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1852 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1856 if (vcpu->fpu_active)
1858 vcpu->fpu_active = 1;
1859 cr0 = vmcs_readl(GUEST_CR0);
1860 cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
1861 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
1862 vmcs_writel(GUEST_CR0, cr0);
1863 update_exception_bitmap(vcpu);
1864 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1865 if (is_guest_mode(vcpu))
1866 vcpu->arch.cr0_guest_owned_bits &=
1867 ~get_vmcs12(vcpu)->cr0_guest_host_mask;
1868 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1871 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1874 * Return the cr0 value that a nested guest would read. This is a combination
1875 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
1876 * its hypervisor (cr0_read_shadow).
1878 static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
1880 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
1881 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
1883 static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
1885 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
1886 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
1889 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
1891 /* Note that there is no vcpu->fpu_active = 0 here. The caller must
1892 * set this *before* calling this function.
1894 vmx_decache_cr0_guest_bits(vcpu);
1895 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
1896 update_exception_bitmap(vcpu);
1897 vcpu->arch.cr0_guest_owned_bits = 0;
1898 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1899 if (is_guest_mode(vcpu)) {
1901 * L1's specified read shadow might not contain the TS bit,
1902 * so now that we turned on shadowing of this bit, we need to
1903 * set this bit of the shadow. Like in nested_vmx_run we need
1904 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
1905 * up-to-date here because we just decached cr0.TS (and we'll
1906 * only update vmcs12->guest_cr0 on nested exit).
1908 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1909 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
1910 (vcpu->arch.cr0 & X86_CR0_TS);
1911 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
1913 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1916 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1918 unsigned long rflags, save_rflags;
1920 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
1921 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1922 rflags = vmcs_readl(GUEST_RFLAGS);
1923 if (to_vmx(vcpu)->rmode.vm86_active) {
1924 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1925 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1926 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1928 to_vmx(vcpu)->rflags = rflags;
1930 return to_vmx(vcpu)->rflags;
1933 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1935 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1936 to_vmx(vcpu)->rflags = rflags;
1937 if (to_vmx(vcpu)->rmode.vm86_active) {
1938 to_vmx(vcpu)->rmode.save_rflags = rflags;
1939 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1941 vmcs_writel(GUEST_RFLAGS, rflags);
1944 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1946 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1949 if (interruptibility & GUEST_INTR_STATE_STI)
1950 ret |= KVM_X86_SHADOW_INT_STI;
1951 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1952 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1957 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1959 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1960 u32 interruptibility = interruptibility_old;
1962 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1964 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1965 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1966 else if (mask & KVM_X86_SHADOW_INT_STI)
1967 interruptibility |= GUEST_INTR_STATE_STI;
1969 if ((interruptibility != interruptibility_old))
1970 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1973 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1977 rip = kvm_rip_read(vcpu);
1978 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1979 kvm_rip_write(vcpu, rip);
1981 /* skipping an emulated instruction also counts */
1982 vmx_set_interrupt_shadow(vcpu, 0);
1986 * KVM wants to inject page-faults which it got to the guest. This function
1987 * checks whether in a nested guest, we need to inject them to L1 or L2.
1989 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
1991 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1993 if (!(vmcs12->exception_bitmap & (1u << nr)))
1996 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
1997 vmcs_read32(VM_EXIT_INTR_INFO),
1998 vmcs_readl(EXIT_QUALIFICATION));
2002 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
2003 bool has_error_code, u32 error_code,
2006 struct vcpu_vmx *vmx = to_vmx(vcpu);
2007 u32 intr_info = nr | INTR_INFO_VALID_MASK;
2009 if (!reinject && is_guest_mode(vcpu) &&
2010 nested_vmx_check_exception(vcpu, nr))
2013 if (has_error_code) {
2014 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2015 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2018 if (vmx->rmode.vm86_active) {
2020 if (kvm_exception_is_soft(nr))
2021 inc_eip = vcpu->arch.event_exit_inst_len;
2022 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
2023 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2027 if (kvm_exception_is_soft(nr)) {
2028 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2029 vmx->vcpu.arch.event_exit_inst_len);
2030 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2032 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2034 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2037 static bool vmx_rdtscp_supported(void)
2039 return cpu_has_vmx_rdtscp();
2042 static bool vmx_invpcid_supported(void)
2044 return cpu_has_vmx_invpcid() && enable_ept;
2048 * Swap MSR entry in host/guest MSR entry array.
2050 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2052 struct shared_msr_entry tmp;
2054 tmp = vmx->guest_msrs[to];
2055 vmx->guest_msrs[to] = vmx->guest_msrs[from];
2056 vmx->guest_msrs[from] = tmp;
2059 static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2061 unsigned long *msr_bitmap;
2063 if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
2064 if (is_long_mode(vcpu))
2065 msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2067 msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
2069 if (is_long_mode(vcpu))
2070 msr_bitmap = vmx_msr_bitmap_longmode;
2072 msr_bitmap = vmx_msr_bitmap_legacy;
2075 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
2079 * Set up the vmcs to automatically save and restore system
2080 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
2081 * mode, as fiddling with msrs is very expensive.
2083 static void setup_msrs(struct vcpu_vmx *vmx)
2085 int save_nmsrs, index;
2088 #ifdef CONFIG_X86_64
2089 if (is_long_mode(&vmx->vcpu)) {
2090 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2092 move_msr_up(vmx, index, save_nmsrs++);
2093 index = __find_msr_index(vmx, MSR_LSTAR);
2095 move_msr_up(vmx, index, save_nmsrs++);
2096 index = __find_msr_index(vmx, MSR_CSTAR);
2098 move_msr_up(vmx, index, save_nmsrs++);
2099 index = __find_msr_index(vmx, MSR_TSC_AUX);
2100 if (index >= 0 && vmx->rdtscp_enabled)
2101 move_msr_up(vmx, index, save_nmsrs++);
2103 * MSR_STAR is only needed on long mode guests, and only
2104 * if efer.sce is enabled.
2106 index = __find_msr_index(vmx, MSR_STAR);
2107 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
2108 move_msr_up(vmx, index, save_nmsrs++);
2111 index = __find_msr_index(vmx, MSR_EFER);
2112 if (index >= 0 && update_transition_efer(vmx, index))
2113 move_msr_up(vmx, index, save_nmsrs++);
2115 vmx->save_nmsrs = save_nmsrs;
2117 if (cpu_has_vmx_msr_bitmap())
2118 vmx_set_msr_bitmap(&vmx->vcpu);
2122 * reads and returns guest's timestamp counter "register"
2123 * guest_tsc = host_tsc + tsc_offset -- 21.3
2125 static u64 guest_read_tsc(void)
2127 u64 host_tsc, tsc_offset;
2130 tsc_offset = vmcs_read64(TSC_OFFSET);
2131 return host_tsc + tsc_offset;
2135 * Like guest_read_tsc, but always returns L1's notion of the timestamp
2136 * counter, even if a nested guest (L2) is currently running.
2138 u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2142 tsc_offset = is_guest_mode(vcpu) ?
2143 to_vmx(vcpu)->nested.vmcs01_tsc_offset :
2144 vmcs_read64(TSC_OFFSET);
2145 return host_tsc + tsc_offset;
2149 * Engage any workarounds for mis-matched TSC rates. Currently limited to
2150 * software catchup for faster rates on slower CPUs.
2152 static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2157 if (user_tsc_khz > tsc_khz) {
2158 vcpu->arch.tsc_catchup = 1;
2159 vcpu->arch.tsc_always_catchup = 1;
2161 WARN(1, "user requested TSC rate below hardware speed\n");
2164 static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
2166 return vmcs_read64(TSC_OFFSET);
2170 * writes 'offset' into guest's timestamp counter offset register
2172 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2174 if (is_guest_mode(vcpu)) {
2176 * We're here if L1 chose not to trap WRMSR to TSC. According
2177 * to the spec, this should set L1's TSC; The offset that L1
2178 * set for L2 remains unchanged, and still needs to be added
2179 * to the newly set TSC to get L2's TSC.
2181 struct vmcs12 *vmcs12;
2182 to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
2183 /* recalculate vmcs02.TSC_OFFSET: */
2184 vmcs12 = get_vmcs12(vcpu);
2185 vmcs_write64(TSC_OFFSET, offset +
2186 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2187 vmcs12->tsc_offset : 0));
2189 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2190 vmcs_read64(TSC_OFFSET), offset);
2191 vmcs_write64(TSC_OFFSET, offset);
2195 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
2197 u64 offset = vmcs_read64(TSC_OFFSET);
2199 vmcs_write64(TSC_OFFSET, offset + adjustment);
2200 if (is_guest_mode(vcpu)) {
2201 /* Even when running L2, the adjustment needs to apply to L1 */
2202 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
2204 trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
2205 offset + adjustment);
2208 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2210 return target_tsc - native_read_tsc();
2213 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
2215 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
2216 return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
2220 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2221 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2222 * all guests if the "nested" module option is off, and can also be disabled
2223 * for a single guest by disabling its VMX cpuid bit.
2225 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2227 return nested && guest_cpuid_has_vmx(vcpu);
2231 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
2232 * returned for the various VMX controls MSRs when nested VMX is enabled.
2233 * The same values should also be used to verify that vmcs12 control fields are
2234 * valid during nested entry from L1 to L2.
2235 * Each of these control msrs has a low and high 32-bit half: A low bit is on
2236 * if the corresponding bit in the (32-bit) control field *must* be on, and a
2237 * bit in the high half is on if the corresponding bit in the control field
2238 * may be on. See also vmx_control_verify().
2239 * TODO: allow these variables to be modified (downgraded) by module options
2242 static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
2243 static u32 nested_vmx_true_procbased_ctls_low;
2244 static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
2245 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2246 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2247 static u32 nested_vmx_true_exit_ctls_low;
2248 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2249 static u32 nested_vmx_true_entry_ctls_low;
2250 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2251 static u32 nested_vmx_ept_caps;
2252 static __init void nested_vmx_setup_ctls_msrs(void)
2255 * Note that as a general rule, the high half of the MSRs (bits in
2256 * the control fields which may be 1) should be initialized by the
2257 * intersection of the underlying hardware's MSR (i.e., features which
2258 * can be supported) and the list of features we want to expose -
2259 * because they are known to be properly supported in our code.
2260 * Also, usually, the low half of the MSRs (bits which must be 1) can
2261 * be set to 0, meaning that L1 may turn off any of these bits. The
2262 * reason is that if one of these bits is necessary, it will appear
2263 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
2264 * fields of vmcs01 and vmcs02, will turn these bits off - and
2265 * nested_vmx_exit_handled() will not pass related exits to L1.
2266 * These rules have exceptions below.
2269 /* pin-based controls */
2270 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2271 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
2272 nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2273 nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
2274 PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
2275 nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2276 PIN_BASED_VMX_PREEMPTION_TIMER;
2279 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2280 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
2281 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2283 nested_vmx_exit_ctls_high &=
2284 #ifdef CONFIG_X86_64
2285 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2287 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2288 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2289 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2290 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2292 if (vmx_mpx_supported())
2293 nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2295 /* We support free control of debug control saving. */
2296 nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
2297 ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2299 /* entry controls */
2300 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2301 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
2302 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2303 nested_vmx_entry_ctls_high &=
2304 #ifdef CONFIG_X86_64
2305 VM_ENTRY_IA32E_MODE |
2307 VM_ENTRY_LOAD_IA32_PAT;
2308 nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
2309 VM_ENTRY_LOAD_IA32_EFER);
2310 if (vmx_mpx_supported())
2311 nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2313 /* We support free control of debug control loading. */
2314 nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
2315 ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2317 /* cpu-based controls */
2318 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2319 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
2320 nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2321 nested_vmx_procbased_ctls_high &=
2322 CPU_BASED_VIRTUAL_INTR_PENDING |
2323 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2324 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
2325 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
2326 CPU_BASED_CR3_STORE_EXITING |
2327 #ifdef CONFIG_X86_64
2328 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
2330 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2331 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
2332 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
2333 CPU_BASED_PAUSE_EXITING |
2334 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2336 * We can allow some features even when not supported by the
2337 * hardware. For example, L1 can specify an MSR bitmap - and we
2338 * can use it to avoid exits to L1 - even when L0 runs L2
2339 * without MSR bitmaps.
2341 nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2342 CPU_BASED_USE_MSR_BITMAPS;
2344 /* We support free control of CR3 access interception. */
2345 nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
2346 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2348 /* secondary cpu-based controls */
2349 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2350 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
2351 nested_vmx_secondary_ctls_low = 0;
2352 nested_vmx_secondary_ctls_high &=
2353 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2354 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2355 SECONDARY_EXEC_WBINVD_EXITING;
2358 /* nested EPT: emulate EPT also to L1 */
2359 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
2360 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2361 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
2363 nested_vmx_ept_caps &= vmx_capability.ept;
2365 * For nested guests, we don't do anything specific
2366 * for single context invalidation. Hence, only advertise
2367 * support for global context invalidation.
2369 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
2371 nested_vmx_ept_caps = 0;
2373 /* miscellaneous data */
2374 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
2375 nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
2376 nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2377 VMX_MISC_ACTIVITY_HLT;
2378 nested_vmx_misc_high = 0;
/*
 * Validate a VMX control word against an allowed-settings pair.
 *
 * @control: control value to check
 * @low:     bits that must be 1 in @control
 * @high:    bits that are allowed to be 1 in @control
 *
 * Returns true when every bit set in @low is also set in @control and every
 * bit set in @control is set in @high or in @low; false otherwise.
 */
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	/*
	 * Bits 0 in high must be 0, and bits 1 in low must be 1.
	 */
	return ((control & high) | low) == control;
}
/*
 * Pack two 32-bit halves into one 64-bit MSR value: @low occupies
 * bits 31:0 and @high occupies bits 63:32.
 */
static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}
2394 /* Returns 0 on success, non-0 otherwise. */
2395 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2397 switch (msr_index) {
2398 case MSR_IA32_VMX_BASIC:
2400 * This MSR reports some information about VMX support. We
2401 * should return information about the VMX we emulate for the
2402 * guest, and the VMCS structure we give it - not about the
2403 * VMX support of the underlying hardware.
2405 *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
2406 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
2407 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
2409 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2410 case MSR_IA32_VMX_PINBASED_CTLS:
2411 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
2412 nested_vmx_pinbased_ctls_high);
2414 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2415 *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
2416 nested_vmx_procbased_ctls_high);
2418 case MSR_IA32_VMX_PROCBASED_CTLS:
2419 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
2420 nested_vmx_procbased_ctls_high);
2422 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2423 *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
2424 nested_vmx_exit_ctls_high);
2426 case MSR_IA32_VMX_EXIT_CTLS:
2427 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
2428 nested_vmx_exit_ctls_high);
2430 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2431 *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
2432 nested_vmx_entry_ctls_high);
2434 case MSR_IA32_VMX_ENTRY_CTLS:
2435 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
2436 nested_vmx_entry_ctls_high);
2438 case MSR_IA32_VMX_MISC:
2439 *pdata = vmx_control_msr(nested_vmx_misc_low,
2440 nested_vmx_misc_high);
2443 * These MSRs specify bits which the guest must keep fixed (on or off)
2444 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
2445 * We picked the standard core2 setting.
2447 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
2448 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE
2449 case MSR_IA32_VMX_CR0_FIXED0:
2450 *pdata = VMXON_CR0_ALWAYSON;
2452 case MSR_IA32_VMX_CR0_FIXED1:
2455 case MSR_IA32_VMX_CR4_FIXED0:
2456 *pdata = VMXON_CR4_ALWAYSON;
2458 case MSR_IA32_VMX_CR4_FIXED1:
2461 case MSR_IA32_VMX_VMCS_ENUM:
2462 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2464 case MSR_IA32_VMX_PROCBASED_CTLS2:
2465 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
2466 nested_vmx_secondary_ctls_high);
2468 case MSR_IA32_VMX_EPT_VPID_CAP:
2469 /* Currently, no nested vpid support */
2470 *pdata = nested_vmx_ept_caps;
2480 * Reads an msr value (of 'msr_index') into 'pdata'.
2481 * Returns 0 on success, non-0 otherwise.
2482 * Assumes vcpu_load() was already called.
2484 static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2487 struct shared_msr_entry *msr;
2490 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
2494 switch (msr_index) {
2495 #ifdef CONFIG_X86_64
2497 data = vmcs_readl(GUEST_FS_BASE);
2500 data = vmcs_readl(GUEST_GS_BASE);
2502 case MSR_KERNEL_GS_BASE:
2503 vmx_load_host_state(to_vmx(vcpu));
2504 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
2508 return kvm_get_msr_common(vcpu, msr_index, pdata);
2510 data = guest_read_tsc();
2512 case MSR_IA32_SYSENTER_CS:
2513 data = vmcs_read32(GUEST_SYSENTER_CS);
2515 case MSR_IA32_SYSENTER_EIP:
2516 data = vmcs_readl(GUEST_SYSENTER_EIP);
2518 case MSR_IA32_SYSENTER_ESP:
2519 data = vmcs_readl(GUEST_SYSENTER_ESP);
2521 case MSR_IA32_BNDCFGS:
2522 if (!vmx_mpx_supported())
2524 data = vmcs_read64(GUEST_BNDCFGS);
2526 case MSR_IA32_FEATURE_CONTROL:
2527 if (!nested_vmx_allowed(vcpu))
2529 data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2531 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2532 if (!nested_vmx_allowed(vcpu))
2534 return vmx_get_vmx_msr(vcpu, msr_index, pdata);
2536 if (!to_vmx(vcpu)->rdtscp_enabled)
2538 /* Otherwise falls through */
2540 msr = find_msr_entry(to_vmx(vcpu), msr_index);
2545 return kvm_get_msr_common(vcpu, msr_index, pdata);
2552 static void vmx_leave_nested(struct kvm_vcpu *vcpu);
2555 * Writes msr value into the appropriate "register".
2556 * Returns 0 on success, non-0 otherwise.
2557 * Assumes vcpu_load() was already called.
2559 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2561 struct vcpu_vmx *vmx = to_vmx(vcpu);
2562 struct shared_msr_entry *msr;
2564 u32 msr_index = msr_info->index;
2565 u64 data = msr_info->data;
2567 switch (msr_index) {
2569 ret = kvm_set_msr_common(vcpu, msr_info);
2571 #ifdef CONFIG_X86_64
2573 vmx_segment_cache_clear(vmx);
2574 vmcs_writel(GUEST_FS_BASE, data);
2577 vmx_segment_cache_clear(vmx);
2578 vmcs_writel(GUEST_GS_BASE, data);
2580 case MSR_KERNEL_GS_BASE:
2581 vmx_load_host_state(vmx);
2582 vmx->msr_guest_kernel_gs_base = data;
2585 case MSR_IA32_SYSENTER_CS:
2586 vmcs_write32(GUEST_SYSENTER_CS, data);
2588 case MSR_IA32_SYSENTER_EIP:
2589 vmcs_writel(GUEST_SYSENTER_EIP, data);
2591 case MSR_IA32_SYSENTER_ESP:
2592 vmcs_writel(GUEST_SYSENTER_ESP, data);
2594 case MSR_IA32_BNDCFGS:
2595 if (!vmx_mpx_supported())
2597 vmcs_write64(GUEST_BNDCFGS, data);
2600 kvm_write_tsc(vcpu, msr_info);
2602 case MSR_IA32_CR_PAT:
2603 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2604 vmcs_write64(GUEST_IA32_PAT, data);
2605 vcpu->arch.pat = data;
2608 ret = kvm_set_msr_common(vcpu, msr_info);
2610 case MSR_IA32_TSC_ADJUST:
2611 ret = kvm_set_msr_common(vcpu, msr_info);
2613 case MSR_IA32_FEATURE_CONTROL:
2614 if (!nested_vmx_allowed(vcpu) ||
2615 (to_vmx(vcpu)->nested.msr_ia32_feature_control &
2616 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
2618 vmx->nested.msr_ia32_feature_control = data;
2619 if (msr_info->host_initiated && data == 0)
2620 vmx_leave_nested(vcpu);
2622 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2623 return 1; /* they are read-only */
2625 if (!vmx->rdtscp_enabled)
2627 /* Check reserved bit, higher 32 bits should be zero */
2628 if ((data >> 32) != 0)
2630 /* Otherwise falls through */
2632 msr = find_msr_entry(vmx, msr_index);
2635 if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
2637 kvm_set_shared_msr(msr->index, msr->data,
2643 ret = kvm_set_msr_common(vcpu, msr_info);
2649 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2651 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
2654 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2657 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2659 case VCPU_EXREG_PDPTR:
2661 ept_save_pdptrs(vcpu);
2668 static __init int cpu_has_kvm_support(void)
2670 return cpu_has_vmx();
2673 static __init int vmx_disabled_by_bios(void)
2677 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
2678 if (msr & FEATURE_CONTROL_LOCKED) {
2679 /* launched w/ TXT and VMX disabled */
2680 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2683 /* launched w/o TXT and VMX only enabled w/ TXT */
2684 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2685 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2686 && !tboot_enabled()) {
2687 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
2688 "activate TXT before enabling KVM\n");
2691 /* launched w/o TXT and VMX disabled */
2692 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2693 && !tboot_enabled())
2700 static void kvm_cpu_vmxon(u64 addr)
2702 asm volatile (ASM_VMX_VMXON_RAX
2703 : : "a"(&addr), "m"(addr)
2707 static int hardware_enable(void *garbage)
2709 int cpu = raw_smp_processor_id();
2710 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2713 if (read_cr4() & X86_CR4_VMXE)
2716 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2719 * Now we can enable the vmclear operation in kdump
2720 * since the loaded_vmcss_on_cpu list on this cpu
2721 * has been initialized.
2723 * Though the cpu is not in VMX operation now, there
2724 * is no problem to enable the vmclear operation
2725 * for the loaded_vmcss_on_cpu list is empty!
2727 crash_enable_local_vmclear(cpu);
2729 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2731 test_bits = FEATURE_CONTROL_LOCKED;
2732 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
2733 if (tboot_enabled())
2734 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
2736 if ((old & test_bits) != test_bits) {
2737 /* enable and lock */
2738 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
2740 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
2742 if (vmm_exclusive) {
2743 kvm_cpu_vmxon(phys_addr);
2747 native_store_gdt(&__get_cpu_var(host_gdt));
2752 static void vmclear_local_loaded_vmcss(void)
2754 int cpu = raw_smp_processor_id();
2755 struct loaded_vmcs *v, *n;
2757 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2758 loaded_vmcss_on_cpu_link)
2759 __loaded_vmcs_clear(v);
2763 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
2766 static void kvm_cpu_vmxoff(void)
2768 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
2771 static void hardware_disable(void *garbage)
2773 if (vmm_exclusive) {
2774 vmclear_local_loaded_vmcss();
2777 write_cr4(read_cr4() & ~X86_CR4_VMXE);
2780 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2781 u32 msr, u32 *result)
2783 u32 vmx_msr_low, vmx_msr_high;
2784 u32 ctl = ctl_min | ctl_opt;
2786 rdmsr(msr, vmx_msr_low, vmx_msr_high);
2788 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2789 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
2791 /* Ensure minimum (required) set of control bits are supported. */
2799 static __init bool allow_1_setting(u32 msr, u32 ctl)
2801 u32 vmx_msr_low, vmx_msr_high;
2803 rdmsr(msr, vmx_msr_low, vmx_msr_high);
2804 return vmx_msr_high & ctl;
2807 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2809 u32 vmx_msr_low, vmx_msr_high;
2810 u32 min, opt, min2, opt2;
2811 u32 _pin_based_exec_control = 0;
2812 u32 _cpu_based_exec_control = 0;
2813 u32 _cpu_based_2nd_exec_control = 0;
2814 u32 _vmexit_control = 0;
2815 u32 _vmentry_control = 0;
2817 min = CPU_BASED_HLT_EXITING |
2818 #ifdef CONFIG_X86_64
2819 CPU_BASED_CR8_LOAD_EXITING |
2820 CPU_BASED_CR8_STORE_EXITING |
2822 CPU_BASED_CR3_LOAD_EXITING |
2823 CPU_BASED_CR3_STORE_EXITING |
2824 CPU_BASED_USE_IO_BITMAPS |
2825 CPU_BASED_MOV_DR_EXITING |
2826 CPU_BASED_USE_TSC_OFFSETING |
2827 CPU_BASED_MWAIT_EXITING |
2828 CPU_BASED_MONITOR_EXITING |
2829 CPU_BASED_INVLPG_EXITING |
2830 CPU_BASED_RDPMC_EXITING;
2832 opt = CPU_BASED_TPR_SHADOW |
2833 CPU_BASED_USE_MSR_BITMAPS |
2834 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2835 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2836 &_cpu_based_exec_control) < 0)
2838 #ifdef CONFIG_X86_64
2839 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2840 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2841 ~CPU_BASED_CR8_STORE_EXITING;
2843 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2845 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2846 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2847 SECONDARY_EXEC_WBINVD_EXITING |
2848 SECONDARY_EXEC_ENABLE_VPID |
2849 SECONDARY_EXEC_ENABLE_EPT |
2850 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2851 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2852 SECONDARY_EXEC_RDTSCP |
2853 SECONDARY_EXEC_ENABLE_INVPCID |
2854 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2855 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2856 SECONDARY_EXEC_SHADOW_VMCS;
2857 if (adjust_vmx_controls(min2, opt2,
2858 MSR_IA32_VMX_PROCBASED_CTLS2,
2859 &_cpu_based_2nd_exec_control) < 0)
2862 #ifndef CONFIG_X86_64
2863 if (!(_cpu_based_2nd_exec_control &
2864 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2865 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2868 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2869 _cpu_based_2nd_exec_control &= ~(
2870 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2871 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2872 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2874 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2875 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
2877 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2878 CPU_BASED_CR3_STORE_EXITING |
2879 CPU_BASED_INVLPG_EXITING);
2880 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
2881 vmx_capability.ept, vmx_capability.vpid);
2884 min = VM_EXIT_SAVE_DEBUG_CONTROLS;
2885 #ifdef CONFIG_X86_64
2886 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2888 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
2889 VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
2890 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2891 &_vmexit_control) < 0)
2894 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2895 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
2896 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2897 &_pin_based_exec_control) < 0)
2900 if (!(_cpu_based_2nd_exec_control &
2901 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
2902 !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
2903 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2905 min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
2906 opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
2907 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2908 &_vmentry_control) < 0)
2911 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2913 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2914 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2917 #ifdef CONFIG_X86_64
2918 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2919 if (vmx_msr_high & (1u<<16))
2923 /* Require Write-Back (WB) memory type for VMCS accesses. */
2924 if (((vmx_msr_high >> 18) & 15) != 6)
2927 vmcs_conf->size = vmx_msr_high & 0x1fff;
2928 vmcs_conf->order = get_order(vmcs_config.size);
2929 vmcs_conf->revision_id = vmx_msr_low;
2931 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2932 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2933 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2934 vmcs_conf->vmexit_ctrl = _vmexit_control;
2935 vmcs_conf->vmentry_ctrl = _vmentry_control;
2937 cpu_has_load_ia32_efer =
2938 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
2939 VM_ENTRY_LOAD_IA32_EFER)
2940 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
2941 VM_EXIT_LOAD_IA32_EFER);
2943 cpu_has_load_perf_global_ctrl =
2944 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
2945 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
2946 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
2947 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
2950 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
2951 * but due to errata below it can't be used. Workaround is to use
2952 * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2954 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
2959 * BC86,AAY89,BD102 (model 44)
2963 if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
2964 switch (boot_cpu_data.x86_model) {
2970 cpu_has_load_perf_global_ctrl = false;
2971 printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2972 "does not work properly. Using workaround\n");
2982 static struct vmcs *alloc_vmcs_cpu(int cpu)
2984 int node = cpu_to_node(cpu);
2988 pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
2991 vmcs = page_address(pages);
2992 memset(vmcs, 0, vmcs_config.size);
2993 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
/* Allocate a VMCS through alloc_vmcs_cpu() for the CPU this code runs on. */
static struct vmcs *alloc_vmcs(void)
{
	return alloc_vmcs_cpu(raw_smp_processor_id());
}
3002 static void free_vmcs(struct vmcs *vmcs)
3004 free_pages((unsigned long)vmcs, vmcs_config.order);
3008 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3010 static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3012 if (!loaded_vmcs->vmcs)
3014 loaded_vmcs_clear(loaded_vmcs);
3015 free_vmcs(loaded_vmcs->vmcs);
3016 loaded_vmcs->vmcs = NULL;
3019 static void free_kvm_area(void)
3023 for_each_possible_cpu(cpu) {
3024 free_vmcs(per_cpu(vmxarea, cpu));
3025 per_cpu(vmxarea, cpu) = NULL;
3029 static void init_vmcs_shadow_fields(void)
3033 /* No checks for read only fields yet */
3035 for (i = j = 0; i < max_shadow_read_write_fields; i++) {
3036 switch (shadow_read_write_fields[i]) {
3038 if (!vmx_mpx_supported())
3046 shadow_read_write_fields[j] =
3047 shadow_read_write_fields[i];
3050 max_shadow_read_write_fields = j;
3052 /* shadowed fields guest access without vmexit */
3053 for (i = 0; i < max_shadow_read_write_fields; i++) {
3054 clear_bit(shadow_read_write_fields[i],
3055 vmx_vmwrite_bitmap);
3056 clear_bit(shadow_read_write_fields[i],
3059 for (i = 0; i < max_shadow_read_only_fields; i++)
3060 clear_bit(shadow_read_only_fields[i],
3064 static __init int alloc_kvm_area(void)
3068 for_each_possible_cpu(cpu) {
3071 vmcs = alloc_vmcs_cpu(cpu);
3077 per_cpu(vmxarea, cpu) = vmcs;
3082 static __init int hardware_setup(void)
3084 if (setup_vmcs_config(&vmcs_config) < 0)
3087 if (boot_cpu_has(X86_FEATURE_NX))
3088 kvm_enable_efer_bits(EFER_NX);
3090 if (!cpu_has_vmx_vpid())
3092 if (!cpu_has_vmx_shadow_vmcs())
3093 enable_shadow_vmcs = 0;
3094 if (enable_shadow_vmcs)
3095 init_vmcs_shadow_fields();
3097 if (!cpu_has_vmx_ept() ||
3098 !cpu_has_vmx_ept_4levels()) {
3100 enable_unrestricted_guest = 0;
3101 enable_ept_ad_bits = 0;
3104 if (!cpu_has_vmx_ept_ad_bits())
3105 enable_ept_ad_bits = 0;
3107 if (!cpu_has_vmx_unrestricted_guest())
3108 enable_unrestricted_guest = 0;
3110 if (!cpu_has_vmx_flexpriority())
3111 flexpriority_enabled = 0;
3113 if (!cpu_has_vmx_tpr_shadow())
3114 kvm_x86_ops->update_cr8_intercept = NULL;
3116 if (enable_ept && !cpu_has_vmx_ept_2m_page())
3117 kvm_disable_largepages();
3119 if (!cpu_has_vmx_ple())
3122 if (!cpu_has_vmx_apicv())
3126 kvm_x86_ops->update_cr8_intercept = NULL;
3128 kvm_x86_ops->hwapic_irr_update = NULL;
3129 kvm_x86_ops->deliver_posted_interrupt = NULL;
3130 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
3134 nested_vmx_setup_ctls_msrs();
3136 return alloc_kvm_area();
3139 static __exit void hardware_unsetup(void)
3144 static bool emulation_required(struct kvm_vcpu *vcpu)
3146 return emulate_invalid_guest_state && !guest_state_valid(vcpu);
3149 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3150 struct kvm_segment *save)
3152 if (!emulate_invalid_guest_state) {
3154 * CS and SS RPL should be equal during guest entry according
3155 * to VMX spec, but in reality it is not always so. Since vcpu
3156 * is in the middle of the transition from real mode to
3157 * protected mode it is safe to assume that RPL 0 is a good
3160 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3161 save->selector &= ~SELECTOR_RPL_MASK;
3162 save->dpl = save->selector & SELECTOR_RPL_MASK;
3165 vmx_set_segment(vcpu, save, seg);
3168 static void enter_pmode(struct kvm_vcpu *vcpu)
3170 unsigned long flags;
3171 struct vcpu_vmx *vmx = to_vmx(vcpu);
3174 * Update real mode segment cache. It may be not up-to-date if sement
3175 * register was written while vcpu was in a guest mode.
3177 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3178 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3179 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3180 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3181 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3182 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3184 vmx->rmode.vm86_active = 0;
3186 vmx_segment_cache_clear(vmx);
3188 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3190 flags = vmcs_readl(GUEST_RFLAGS);
3191 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3192 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3193 vmcs_writel(GUEST_RFLAGS, flags);
3195 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3196 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3198 update_exception_bitmap(vcpu);
3200 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3201 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3202 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3203 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3204 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3205 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3208 static void fix_rmode_seg(int seg, struct kvm_segment *save)
3210 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3211 struct kvm_segment var = *save;
3214 if (seg == VCPU_SREG_CS)
3217 if (!emulate_invalid_guest_state) {
3218 var.selector = var.base >> 4;
3219 var.base = var.base & 0xffff0;
3229 if (save->base & 0xf)
3230 printk_once(KERN_WARNING "kvm: segment base is not "
3231 "paragraph aligned when entering "
3232 "protected mode (seg=%d)", seg);
3235 vmcs_write16(sf->selector, var.selector);
3236 vmcs_write32(sf->base, var.base);
3237 vmcs_write32(sf->limit, var.limit);
3238 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3241 static void enter_rmode(struct kvm_vcpu *vcpu)
3243 unsigned long flags;
3244 struct vcpu_vmx *vmx = to_vmx(vcpu);
3246 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3247 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3248 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3249 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3250 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3251 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3252 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3254 vmx->rmode.vm86_active = 1;
3257 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
3258 * vcpu. Warn the user that an update is overdue.
3260 if (!vcpu->kvm->arch.tss_addr)
3261 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
3262 "called before entering vcpu\n");
3264 vmx_segment_cache_clear(vmx);
3266 vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
3267 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3268 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3270 flags = vmcs_readl(GUEST_RFLAGS);
3271 vmx->rmode.save_rflags = flags;
3273 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3275 vmcs_writel(GUEST_RFLAGS, flags);
3276 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3277 update_exception_bitmap(vcpu);
3279 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3280 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3281 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3282 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3283 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3284 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3286 kvm_mmu_reset_context(vcpu);
3289 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3291 struct vcpu_vmx *vmx = to_vmx(vcpu);
3292 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
3298 * Force kernel_gs_base reloading before EFER changes, as control
3299 * of this msr depends on is_long_mode().
3301 vmx_load_host_state(to_vmx(vcpu));
3302 vcpu->arch.efer = efer;
3303 if (efer & EFER_LMA) {
3304 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3307 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3309 msr->data = efer & ~EFER_LME;
3314 #ifdef CONFIG_X86_64
3316 static void enter_lmode(struct kvm_vcpu *vcpu)
3320 vmx_segment_cache_clear(to_vmx(vcpu));
3322 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3323 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
3324 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3326 vmcs_write32(GUEST_TR_AR_BYTES,
3327 (guest_tr_ar & ~AR_TYPE_MASK)
3328 | AR_TYPE_BUSY_64_TSS);
3330 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3333 static void exit_lmode(struct kvm_vcpu *vcpu)
3335 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3336 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3341 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
3343 vpid_sync_context(to_vmx(vcpu));
3345 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3347 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
3351 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
3353 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
3355 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
3356 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
3359 static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
3361 if (enable_ept && is_paging(vcpu))
3362 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3363 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3366 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
3368 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
3370 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
3371 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
3374 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
3376 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3378 if (!test_bit(VCPU_EXREG_PDPTR,
3379 (unsigned long *)&vcpu->arch.regs_dirty))
3382 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3383 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3384 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3385 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3386 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3390 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3392 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3394 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3395 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3396 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3397 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3398 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3401 __set_bit(VCPU_EXREG_PDPTR,